ingonyama-zk · aviadingo · Jun 9, 2024 · Jun 16, 2024 · Jun 16, 2024 · Aug 4, 2024
diff --git a/icicle/include/hash/blake2s/blake2s.cuh b/icicle/include/hash/blake2s/blake2s.cuh
@@ -0,0 +1,51 @@
+/*
+ * blake2b.cuh CUDA Implementation of BLAKE2B Hashing
+ *
+ * Date: 12 June 2019
+ * Revision: 1
+ *
+ * This file is released into the Public Domain.
+ */
+
+#pragma once
+typedef unsigned char BYTE;
+typedef unsigned int WORD;
+typedef unsigned long long LONG;
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+
+#include "hash/hash.cuh"
+using namespace hash;
+
+namespace blake2s {
+#define BLAKE2S_ROUNDS       10
+#define BLAKE2S_BLOCK_LENGTH 64
+#define BLAKE2S_CHAIN_SIZE   8
+#define BLAKE2S_CHAIN_LENGTH (BLAKE2S_CHAIN_SIZE * sizeof(uint32_t))
+#define BLAKE2S_STATE_SIZE   16
+#define BLAKE2S_STATE_LENGTH (BLAKE2S_STATE_SIZE * sizeof(uint32_t))
+
+  class Blake2s : public Hasher<BYTE, BYTE>
+  {
+  public:
+    cudaError_t run_hash_many_kernel(
+      const BYTE* input,
+      BYTE* output,
+      WORD number_of_states,
+      WORD input_len,
+      WORD output_len,
+      const device_context::DeviceContext& ctx) const override;
+
+    Blake2s() : Hasher<BYTE, BYTE>(BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, 0) {}
+  };
+
+  extern "C" {
+  void
+  mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD output_len, WORD n_batch);
+  }
+} // namespace blake2s
diff --git a/icicle/src/hash/blake2s/Makefile b/icicle/src/hash/blake2s/Makefile
@@ -0,0 +1,26 @@
+test_blake2s: test_blake2s.cu blake2s.cu 
+	nvcc -o test_blake2s -I. -I../../../include test_blake2s.cu blake2s.cu -g
+	./test_blake2s
+
+test_blake2s_batched: test_blake2s_batched.cu blake2s.cu 
+	nvcc -o test_blake2s_batched -I. -I../../../include test_blake2s_batched.cu -g
+	./test_blake2s_batched ./batched_test_vectors.csv
+
+test_blake2s_integ: test_blake2s_integ.cu blake2s.cu 
+	nvcc -o test_blake2s_integ -I. -I../../../include test_blake2s_integ.cu -g
+	./test_blake2s_integ
+
+test_blake2s_seq: test_blake2s_seq.cu blake2s.cu 
+	nvcc -o test_blake2s_seq -I. -I../../../include test_blake2s_seq.cu -g
+	./test_blake2s_seq
+
+test_blake2s_seq_sa: test_blake2s_seq_sa.cu blake2s.cu 
+	nvcc -o test_blake2s_seq_sa -I. -I../../../include test_blake2s_seq_sa.cu -g
+	./test_blake2s_seq_sa
+
+test_blake2s_tree: test_tree.cu blake2s.cu ../../merkle-tree/merkle.cu
+	nvcc -DMERKLE_DEBUG -o test_blake2s_tree -I../../../include test_tree.cu
+	./test_blake2s_tree
+
+clear:
+	rm test_blake2s test_blake2s_tree test_blake2s_integ test_blake2s_seq test_blake2s_seq_sa test_blake2s_batched
diff --git a/icicle/src/hash/blake2s/batched_test_vectors.csv b/icicle/src/hash/blake2s/batched_test_vectors.csv
@@ -0,0 +1,10 @@
+9301876542, 1e95b837356f67e9b456d636dd3d3f55bfff01eb78b375f613db5819f27e5972
+5279608431, 000386abd221e7049e78091c4b964719dd45fda6a70ad88194c8ecc7fb5ec4bc
+1842976503, b35ce7baa5c5c620be9bf0d03a7a5b43fd18e786e1678ad797a30b50dc48ccc7
+6498302715, 489412fcb1a74c14dbe06aaad61cf2d3ed0eaa6a6154afc9f8b58fe92ffcebf1
+7023598146, 6b9a45147bd1c61f8d1d3d110cb705ae85ddc31ac7cb18e47306bc51d4d807ba
+3150729846, 5d9d597b956a26fd79cd8bdf38e306db068b6089b305268b90fd1a304a5b2224
+9583402167, 584cd56b727e14ccc7fcaf406982faab08529b6789748c9ffc74748b033cf44f
+8760134295, 2cba2adb552cc89312c614c3d720edaa5cf03bc5fc2a012511cfee013636ee40
+5402987631, d6d4920f85f5e286f3add452fab5b19f31e66293ec612b29389643f78b4ace2a
+2739810546, 69aee09804f37a477b34f9a4447b39e9caaae49bbc5056b8018dd513c8cec263
diff --git a/icicle/src/hash/blake2s/blake2s.cu b/icicle/src/hash/blake2s/blake2s.cu
@@ -0,0 +1,278 @@
+#include <assert.h>
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+#include "gpu-utils/modifiers.cuh"
+#include "hash/hash.cuh"
+
+#include "hash/blake2s/blake2s.cuh"
+
+using namespace hash;
+
+namespace blake2s {
+
+  typedef struct {
+    WORD digestlen;
+    BYTE key[32];
+    WORD keylen;
+    BYTE buff[BLAKE2S_BLOCK_LENGTH];
+    uint32_t chain[BLAKE2S_CHAIN_SIZE];
+    uint32_t state[BLAKE2S_STATE_SIZE];
+    WORD pos;
+    uint32_t t0;
+    uint32_t t1;
+    uint32_t f0;
+  } cuda_blake2s_ctx_t;
+
+  typedef cuda_blake2s_ctx_t CUDA_BLAKE2S_CTX;
+
+  __constant__ CUDA_BLAKE2S_CTX c_CTX;
+
+  __constant__ uint32_t BLAKE2S_IVS[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+                                          0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};
+
+  const uint32_t CPU_BLAKE2S_IVS[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+                                       0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};
+
+  void cpu_blake2s_init(cuda_blake2s_ctx_t* ctx, BYTE* key, WORD keylen, WORD digestbitlen)
+  {
+    memset(ctx, 0, sizeof(cuda_blake2s_ctx_t));
+    if (keylen > 0) {
+      memcpy(ctx->buff, key, keylen);
+      memcpy(ctx->key, key, keylen);
+    }
+    ctx->keylen = keylen;
+    ctx->digestlen = digestbitlen >> 3;
+    ctx->pos = 0;
+    ctx->t0 = 0;
+    ctx->t1 = 0;
+    ctx->f0 = 0;
+    ctx->chain[0] = CPU_BLAKE2S_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000);
+    ctx->chain[1] = CPU_BLAKE2S_IVS[1];
+    ctx->chain[2] = CPU_BLAKE2S_IVS[2];
+    ctx->chain[3] = CPU_BLAKE2S_IVS[3];
+    ctx->chain[4] = CPU_BLAKE2S_IVS[4];
+    ctx->chain[5] = CPU_BLAKE2S_IVS[5];
+    ctx->chain[6] = CPU_BLAKE2S_IVS[6];
+    ctx->chain[7] = CPU_BLAKE2S_IVS[7];
+
+    ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0;
+  }
+
+  __constant__ uint8_t BLAKE2S_SIGMA[10][16] = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+    {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+    {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+    {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+    {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}};
+
+  __device__ uint32_t cuda_blake2s_leuint32(const BYTE* in)
+  {
+    uint32_t a;
+    memcpy(&a, in, 4);
+    return a;
+  }
+
+  __device__ uint32_t cuda_blake2s_ROTR32(uint32_t a, uint8_t b) { return (a >> b) | (a << (32 - b)); }
+
+  __device__ void
+  cuda_blake2s_G(cuda_blake2s_ctx_t* ctx, uint32_t m1, uint32_t m2, int32_t a, int32_t b, int32_t c, int32_t d)
+  {
+    ctx->state[a] = ctx->state[a] + ctx->state[b] + m1;
+    ctx->state[d] = cuda_blake2s_ROTR32(ctx->state[d] ^ ctx->state[a], 16);
+    ctx->state[c] = ctx->state[c] + ctx->state[d];
+    ctx->state[b] = cuda_blake2s_ROTR32(ctx->state[b] ^ ctx->state[c], 12);
+    ctx->state[a] = ctx->state[a] + ctx->state[b] + m2;
+    ctx->state[d] = cuda_blake2s_ROTR32(ctx->state[d] ^ ctx->state[a], 8);
+    ctx->state[c] = ctx->state[c] + ctx->state[d];
+    ctx->state[b] = cuda_blake2s_ROTR32(ctx->state[b] ^ ctx->state[c], 7);
+  }
+
+  __device__ __forceinline__ void cuda_blake2s_init_state(cuda_blake2s_ctx_t* ctx)
+  {
+    memcpy(ctx->state, ctx->chain, BLAKE2S_CHAIN_LENGTH);
+    // ctx->state[8] = ctx->t0;
+    // ctx->state[9] = ctx->t1;
+    // ctx->state[10] = ctx->f0;
+    // ctx->state[11] = BLAKE2S_IVS[4];
+    ctx->state[8] = BLAKE2S_IVS[0];
+    ctx->state[9] = BLAKE2S_IVS[1];
+    ctx->state[10] = BLAKE2S_IVS[2];
+    ctx->state[11] = BLAKE2S_IVS[3];
+    ctx->state[12] = ctx->t0 ^ BLAKE2S_IVS[4];
+    ctx->state[13] = ctx->t1 ^ BLAKE2S_IVS[5];
+    ctx->state[14] = ctx->f0 ^ BLAKE2S_IVS[6];
+    ctx->state[15] = BLAKE2S_IVS[7];
+    // ctx->state[12] = BLAKE2S_IVS[5];
+    // ctx->state[13] = BLAKE2S_IVS[6];
+    // ctx->state[14] = BLAKE2S_IVS[7];
+  }
+
+  __device__ __forceinline__ void cuda_blake2s_compress(cuda_blake2s_ctx_t* ctx, const BYTE* in, WORD inoffset)
+  {
+    cuda_blake2s_init_state(ctx);
+    uint32_t m[16] = {0};
+    for (int j = 0; j < 16; j++)
+      m[j] = cuda_blake2s_leuint32(in + inoffset + (j << 2));
+
+    for (int round = 0; round < BLAKE2S_ROUNDS; round++) {
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][0]], m[BLAKE2S_SIGMA[round][1]], 0, 4, 8, 12);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][2]], m[BLAKE2S_SIGMA[round][3]], 1, 5, 9, 13);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][4]], m[BLAKE2S_SIGMA[round][5]], 2, 6, 10, 14);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][6]], m[BLAKE2S_SIGMA[round][7]], 3, 7, 11, 15);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][8]], m[BLAKE2S_SIGMA[round][9]], 0, 5, 10, 15);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][10]], m[BLAKE2S_SIGMA[round][11]], 1, 6, 11, 12);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][12]], m[BLAKE2S_SIGMA[round][13]], 2, 7, 8, 13);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][14]], m[BLAKE2S_SIGMA[round][15]], 3, 4, 9, 14);
+    }
+
+    for (int offset = 0; offset < BLAKE2S_CHAIN_SIZE; offset++)
+      ctx->chain[offset] = ctx->chain[offset] ^ ctx->state[offset] ^ ctx->state[offset + 8];
+  }
+
+  __device__ void cuda_blake2s_init(cuda_blake2s_ctx_t* ctx, BYTE* key, WORD keylen, WORD digestbitlen)
+  {
+    memset(ctx, 0, sizeof(cuda_blake2s_ctx_t));
+    ctx->keylen = keylen;
+    ctx->digestlen = digestbitlen >> 3;
+    ctx->pos = 0;
+    ctx->t0 = 0;
+    ctx->t1 = 0;
+    ctx->f0 = 0;
+    ctx->chain[0] = BLAKE2S_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000);
+    ctx->chain[1] = BLAKE2S_IVS[1];
+    ctx->chain[2] = BLAKE2S_IVS[2];
+    ctx->chain[3] = BLAKE2S_IVS[3];
+    ctx->chain[4] = BLAKE2S_IVS[4];
+    ctx->chain[5] = BLAKE2S_IVS[5];
+    ctx->chain[6] = BLAKE2S_IVS[6];
+    ctx->chain[7] = BLAKE2S_IVS[7];
+
+    if (keylen > 0) {
+      memcpy(ctx->buff, key, keylen);
+      memcpy(ctx->key, key, keylen);
+    }
+    ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0;
+  }
+
+  __device__ void cuda_blake2s_update(cuda_blake2s_ctx_t* ctx, const BYTE* in, LONG inlen)
+  {
+    if (inlen == 0) return;
+
+    WORD start = 0;
+    int64_t in_index = 0, block_index = 0;
+
+    if (ctx->pos) {
+      start = BLAKE2S_BLOCK_LENGTH - ctx->pos;
+      if (start < inlen) {
+        memcpy(ctx->buff + ctx->pos, in, start);
+        ctx->t0 += BLAKE2S_BLOCK_LENGTH;
+
+        if (ctx->t0 == 0) ctx->t1++;
+
+        cuda_blake2s_compress(ctx, ctx->buff, 0);
+        ctx->pos = 0;
+        memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH);
+      } else {
+        memcpy(ctx->buff + ctx->pos, in, inlen);
+        ctx->pos += inlen;
+        return;
+      }
+    }
+
+    block_index = inlen - BLAKE2S_BLOCK_LENGTH;
+    for (in_index = start; in_index < block_index; in_index += BLAKE2S_BLOCK_LENGTH) {
+      ctx->t0 += BLAKE2S_BLOCK_LENGTH;
+      if (ctx->t0 == 0) ctx->t1++;
+
+      cuda_blake2s_compress(ctx, in, in_index);
+    }
+
+    memcpy(ctx->buff, in + in_index, inlen - in_index);
+    ctx->pos += inlen - in_index;
+  }
+
+  __device__ void cuda_blake2s_final(cuda_blake2s_ctx_t* ctx, BYTE* out)
+  {
+    ctx->f0 = 0xFFFFFFFFUL;
+    ctx->t0 += ctx->pos;
+    if (ctx->pos > 0 && ctx->t0 == 0) ctx->t1++;
+
+    cuda_blake2s_compress(ctx, ctx->buff, 0);
+    memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH);
+    memset(ctx->state, 0, BLAKE2S_STATE_LENGTH);
+
+    int i4 = 0;
+    for (int i = 0; i < BLAKE2S_CHAIN_SIZE && ((i4 = i * 4) < ctx->digestlen); i++) {
+      BYTE* BYTEs = (BYTE*)(&ctx->chain[i]);
+      if (i4 < ctx->digestlen - 4)
+        memcpy(out + i4, BYTEs, 4);
+      else
+        memcpy(out + i4, BYTEs, ctx->digestlen - i4);
+    }
+  }
+
+  __global__ void
+  kernel_blake2s_hash(const BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch, WORD BLAKE2S_BLOCK_SIZE)
+  {
+    WORD thread = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread >= n_batch) { return; }
+    BYTE key[32] = ""; // Null key
+    WORD keylen = 0;
+    CUDA_BLAKE2S_CTX blake_ctx;
+    const BYTE* in = indata + thread * inlen;
+    BYTE* out = outdata + thread * BLAKE2S_BLOCK_SIZE;
+
+    cuda_blake2s_init(&blake_ctx, key, keylen, (BLAKE2S_BLOCK_SIZE << 3));
+    cuda_blake2s_update(&blake_ctx, in, inlen);
+    cuda_blake2s_final(&blake_ctx, out);
+  }
+
+  extern "C" {
+  void
+  mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD output_len, WORD n_batch)
+  {
+    BYTE* cuda_indata;
+    BYTE* cuda_outdata;
+    const WORD BLAKE2S_BLOCK_SIZE = output_len;
+    cudaMalloc(&cuda_indata, inlen * n_batch);
+    cudaMalloc(&cuda_outdata, BLAKE2S_BLOCK_SIZE * n_batch);
+    assert(keylen <= 32);
+
+    // CUDA_BLAKE2S_CTX ctx;
+    // cpu_blake2s_init(&ctx, key, keylen, n_outbit);
+    // cudaMemcpyToSymbol(c_CTX, &ctx, sizeof(CUDA_BLAKE2S_CTX), 0, cudaMemcpyHostToDevice);
+
+    cudaMemcpy(cuda_indata, in, inlen * n_batch, cudaMemcpyHostToDevice);
+
+    WORD thread = 256;
+    WORD block = (n_batch + thread - 1) / thread;
+    kernel_blake2s_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch, BLAKE2S_BLOCK_SIZE);
+    cudaMemcpy(out, cuda_outdata, BLAKE2S_BLOCK_SIZE * n_batch, cudaMemcpyDeviceToHost);
+    cudaDeviceSynchronize();
+    cudaError_t error = cudaGetLastError();
+    if (error != cudaSuccess) { printf("Error cuda blake2s hash: %s \n", cudaGetErrorString(error)); }
+    cudaFree(cuda_indata);
+    cudaFree(cuda_outdata);
+  }
+  }
+
+  cudaError_t Blake2s::run_hash_many_kernel(
+    const BYTE* input,
+    BYTE* output,
+    WORD number_of_states,
+    WORD input_len,
+    WORD output_len,
+    const device_context::DeviceContext& ctx) const
+  {
+    const WORD BLAKE2S_BLOCK_SIZE = output_len;
+    WORD thread = 256;
+    WORD block = (number_of_states + thread - 1) / thread;
+
+    kernel_blake2s_hash<<<block, thread, 0, ctx.stream>>>(
+      input, input_len, output, number_of_states, BLAKE2S_BLOCK_SIZE);
+
+    CHK_IF_RETURN(cudaPeekAtLastError());
+    return CHK_LAST();
+  }
+
+} // namespace blake2s
diff --git a/icicle/src/hash/blake2s/expected_hashes.csv b/icicle/src/hash/blake2s/expected_hashes.csv
@@ -0,0 +1,10 @@
+652e530edee5893b576f72b875ea1c918e85e29d859e7e3fa78b623d8abca3de
+57e36dae300302953c953b59a1b263cb314326db44e919ca4acd57e1da8f0543
+9fbfc63ab34b8c35c58e9178c60e2bb165dde7340cb063e9567e4f6bef5eacbe
+f26bbce62eaed81606f72000d95f0eea0eac23893f9db0c1a65af173c5095cb7
+9a21619cac392bd9c80725b2161b033efc0dad3b57d8a9b4c5103ed1cd065a38
+b2e524f32a0b8eeb72737da8ea0075c4e7e11936289954d571622f23e3df9076
+d78d4fe12e21ce58226c2707b86b167e237b24f1f84a5e39b073460998c5359d
+0d74da2a1062445822cbc8ec7bf424714e09923b4c1eba0ca2170504f56c4331
+f5205d77b033111f1e15f585a86b7a4c292c0ec39addb3b2fcb0de4a0bf61003
+410381eb72313f23f9f62478d62ec7635f4166ab5e53a20af5c9e8f7ee445de8