Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Aviad/blake2s #576

Closed
wants to merge 26 commits into from
Closed
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions icicle/include/hash/blake2s/blake2s.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* blake2b.cuh CUDA Implementation of BLAKE2B Hashing
*
* Date: 12 June 2019
* Revision: 1
*
* This file is released into the Public Domain.
*/

#pragma once
typedef unsigned char BYTE;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please move these inside blake2s namespace

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

typedef unsigned int WORD;
typedef unsigned long long LONG;

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"

#include "hash/hash.cuh"
using namespace hash;

namespace blake2s {
#define BLAKE2S_ROUNDS 10
#define BLAKE2S_BLOCK_LENGTH 64
#define BLAKE2S_CHAIN_SIZE 8
#define BLAKE2S_CHAIN_LENGTH (BLAKE2S_CHAIN_SIZE * sizeof(uint32_t))
#define BLAKE2S_STATE_SIZE 16
#define BLAKE2S_STATE_LENGTH (BLAKE2S_STATE_SIZE * sizeof(uint32_t))

class Blake2s : public Hasher<BYTE, BYTE>
{
public:
cudaError_t run_hash_many_kernel(
const BYTE* input,
BYTE* output,
WORD number_of_states,
WORD input_len,
WORD output_len,
const device_context::DeviceContext& ctx) const override;

Blake2s() : Hasher<BYTE, BYTE>(BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, 0) {}
};

extern "C" {
void
mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD output_len, WORD n_batch);
}
} // namespace blake2s
26 changes: 26 additions & 0 deletions icicle/src/hash/blake2s/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
test_blake2s: test_blake2s.cu blake2s.cu
nvcc -o test_blake2s -I. -I../../../include test_blake2s.cu blake2s.cu -g
./test_blake2s

test_blake2s_batched: test_blake2s_batched.cu blake2s.cu
nvcc -o test_blake2s_batched -I. -I../../../include test_blake2s_batched.cu -g
./test_blake2s_batched ./batched_test_vectors.csv

test_blake2s_integ: test_blake2s_integ.cu blake2s.cu
nvcc -o test_blake2s_integ -I. -I../../../include test_blake2s_integ.cu -g
./test_blake2s_integ

test_blake2s_seq: test_blake2s_seq.cu blake2s.cu
nvcc -o test_blake2s_seq -I. -I../../../include test_blake2s_seq.cu -g
./test_blake2s_seq

test_blake2s_seq_sa: test_blake2s_seq_sa.cu blake2s.cu
nvcc -o test_blake2s_seq_sa -I. -I../../../include test_blake2s_seq_sa.cu -g
./test_blake2s_seq_sa

test_blake2s_tree: test_tree.cu blake2s.cu ../../merkle-tree/merkle.cu
nvcc -DMERKLE_DEBUG -o test_blake2s_tree -I../../../include test_tree.cu
./test_blake2s_tree

clear:
rm test_blake2s test_blake2s_tree test_blake2s_integ test_blake2s_seq test_blake2s_seq_sa test_blake2s_batched
10 changes: 10 additions & 0 deletions icicle/src/hash/blake2s/batched_test_vectors.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
9301876542, 1e95b837356f67e9b456d636dd3d3f55bfff01eb78b375f613db5819f27e5972
5279608431, 000386abd221e7049e78091c4b964719dd45fda6a70ad88194c8ecc7fb5ec4bc
1842976503, b35ce7baa5c5c620be9bf0d03a7a5b43fd18e786e1678ad797a30b50dc48ccc7
6498302715, 489412fcb1a74c14dbe06aaad61cf2d3ed0eaa6a6154afc9f8b58fe92ffcebf1
7023598146, 6b9a45147bd1c61f8d1d3d110cb705ae85ddc31ac7cb18e47306bc51d4d807ba
3150729846, 5d9d597b956a26fd79cd8bdf38e306db068b6089b305268b90fd1a304a5b2224
9583402167, 584cd56b727e14ccc7fcaf406982faab08529b6789748c9ffc74748b033cf44f
8760134295, 2cba2adb552cc89312c614c3d720edaa5cf03bc5fc2a012511cfee013636ee40
5402987631, d6d4920f85f5e286f3add452fab5b19f31e66293ec612b29389643f78b4ace2a
2739810546, 69aee09804f37a477b34f9a4447b39e9caaae49bbc5056b8018dd513c8cec263
278 changes: 278 additions & 0 deletions icicle/src/hash/blake2s/blake2s.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
#include <assert.h>
#include "gpu-utils/device_context.cuh"
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/modifiers.cuh"
#include "hash/hash.cuh"

#include "hash/blake2s/blake2s.cuh"

using namespace hash;

namespace blake2s {

typedef struct {
WORD digestlen;
BYTE key[32];
WORD keylen;
BYTE buff[BLAKE2S_BLOCK_LENGTH];
uint32_t chain[BLAKE2S_CHAIN_SIZE];
uint32_t state[BLAKE2S_STATE_SIZE];
WORD pos;
uint32_t t0;
uint32_t t1;
uint32_t f0;
} cuda_blake2s_ctx_t;

typedef cuda_blake2s_ctx_t CUDA_BLAKE2S_CTX;

__constant__ CUDA_BLAKE2S_CTX c_CTX;

__constant__ uint32_t BLAKE2S_IVS[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};

const uint32_t CPU_BLAKE2S_IVS[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};

void cpu_blake2s_init(cuda_blake2s_ctx_t* ctx, BYTE* key, WORD keylen, WORD digestbitlen)
{
memset(ctx, 0, sizeof(cuda_blake2s_ctx_t));
if (keylen > 0) {
memcpy(ctx->buff, key, keylen);
memcpy(ctx->key, key, keylen);
}
ctx->keylen = keylen;
ctx->digestlen = digestbitlen >> 3;
ctx->pos = 0;
ctx->t0 = 0;
ctx->t1 = 0;
ctx->f0 = 0;
ctx->chain[0] = CPU_BLAKE2S_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000);
ctx->chain[1] = CPU_BLAKE2S_IVS[1];
ctx->chain[2] = CPU_BLAKE2S_IVS[2];
ctx->chain[3] = CPU_BLAKE2S_IVS[3];
ctx->chain[4] = CPU_BLAKE2S_IVS[4];
ctx->chain[5] = CPU_BLAKE2S_IVS[5];
ctx->chain[6] = CPU_BLAKE2S_IVS[6];
ctx->chain[7] = CPU_BLAKE2S_IVS[7];

ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0;
}

__constant__ uint8_t BLAKE2S_SIGMA[10][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}};

__device__ uint32_t cuda_blake2s_leuint32(const BYTE* in)
{
uint32_t a;
memcpy(&a, in, 4);
return a;
}

__device__ uint32_t cuda_blake2s_ROTR32(uint32_t a, uint8_t b) { return (a >> b) | (a << (32 - b)); }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe worth to add __inline__


__device__ void
cuda_blake2s_G(cuda_blake2s_ctx_t* ctx, uint32_t m1, uint32_t m2, int32_t a, int32_t b, int32_t c, int32_t d)
{
ctx->state[a] = ctx->state[a] + ctx->state[b] + m1;
ctx->state[d] = cuda_blake2s_ROTR32(ctx->state[d] ^ ctx->state[a], 16);
ctx->state[c] = ctx->state[c] + ctx->state[d];
ctx->state[b] = cuda_blake2s_ROTR32(ctx->state[b] ^ ctx->state[c], 12);
ctx->state[a] = ctx->state[a] + ctx->state[b] + m2;
ctx->state[d] = cuda_blake2s_ROTR32(ctx->state[d] ^ ctx->state[a], 8);
ctx->state[c] = ctx->state[c] + ctx->state[d];
ctx->state[b] = cuda_blake2s_ROTR32(ctx->state[b] ^ ctx->state[c], 7);
}

__device__ __forceinline__ void cuda_blake2s_init_state(cuda_blake2s_ctx_t* ctx)
{
memcpy(ctx->state, ctx->chain, BLAKE2S_CHAIN_LENGTH);
// ctx->state[8] = ctx->t0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are these commented?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's from an old version. deleted

// ctx->state[9] = ctx->t1;
// ctx->state[10] = ctx->f0;
// ctx->state[11] = BLAKE2S_IVS[4];
ctx->state[8] = BLAKE2S_IVS[0];
ctx->state[9] = BLAKE2S_IVS[1];
ctx->state[10] = BLAKE2S_IVS[2];
ctx->state[11] = BLAKE2S_IVS[3];
ctx->state[12] = ctx->t0 ^ BLAKE2S_IVS[4];
ctx->state[13] = ctx->t1 ^ BLAKE2S_IVS[5];
ctx->state[14] = ctx->f0 ^ BLAKE2S_IVS[6];
ctx->state[15] = BLAKE2S_IVS[7];
// ctx->state[12] = BLAKE2S_IVS[5];
// ctx->state[13] = BLAKE2S_IVS[6];
// ctx->state[14] = BLAKE2S_IVS[7];
}

__device__ __forceinline__ void cuda_blake2s_compress(cuda_blake2s_ctx_t* ctx, const BYTE* in, WORD inoffset)
{
cuda_blake2s_init_state(ctx);
uint32_t m[16] = {0};
for (int j = 0; j < 16; j++)
m[j] = cuda_blake2s_leuint32(in + inoffset + (j << 2));

for (int round = 0; round < BLAKE2S_ROUNDS; round++) {
cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][0]], m[BLAKE2S_SIGMA[round][1]], 0, 4, 8, 12);
cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][2]], m[BLAKE2S_SIGMA[round][3]], 1, 5, 9, 13);
cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][4]], m[BLAKE2S_SIGMA[round][5]], 2, 6, 10, 14);
cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][6]], m[BLAKE2S_SIGMA[round][7]], 3, 7, 11, 15);
cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][8]], m[BLAKE2S_SIGMA[round][9]], 0, 5, 10, 15);
cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][10]], m[BLAKE2S_SIGMA[round][11]], 1, 6, 11, 12);
cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][12]], m[BLAKE2S_SIGMA[round][13]], 2, 7, 8, 13);
cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][14]], m[BLAKE2S_SIGMA[round][15]], 3, 4, 9, 14);
}

for (int offset = 0; offset < BLAKE2S_CHAIN_SIZE; offset++)
ctx->chain[offset] = ctx->chain[offset] ^ ctx->state[offset] ^ ctx->state[offset + 8];
}

__device__ void cuda_blake2s_init(cuda_blake2s_ctx_t* ctx, BYTE* key, WORD keylen, WORD digestbitlen)
{
memset(ctx, 0, sizeof(cuda_blake2s_ctx_t));
ctx->keylen = keylen;
ctx->digestlen = digestbitlen >> 3;
ctx->pos = 0;
ctx->t0 = 0;
ctx->t1 = 0;
ctx->f0 = 0;
ctx->chain[0] = BLAKE2S_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000);
ctx->chain[1] = BLAKE2S_IVS[1];
ctx->chain[2] = BLAKE2S_IVS[2];
ctx->chain[3] = BLAKE2S_IVS[3];
ctx->chain[4] = BLAKE2S_IVS[4];
ctx->chain[5] = BLAKE2S_IVS[5];
ctx->chain[6] = BLAKE2S_IVS[6];
ctx->chain[7] = BLAKE2S_IVS[7];

if (keylen > 0) {
memcpy(ctx->buff, key, keylen);
memcpy(ctx->key, key, keylen);
}
ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0;
}

__device__ void cuda_blake2s_update(cuda_blake2s_ctx_t* ctx, const BYTE* in, LONG inlen)
{
if (inlen == 0) return;

WORD start = 0;
int64_t in_index = 0, block_index = 0;

if (ctx->pos) {
start = BLAKE2S_BLOCK_LENGTH - ctx->pos;
if (start < inlen) {
memcpy(ctx->buff + ctx->pos, in, start);
ctx->t0 += BLAKE2S_BLOCK_LENGTH;

if (ctx->t0 == 0) ctx->t1++;

cuda_blake2s_compress(ctx, ctx->buff, 0);
ctx->pos = 0;
memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH);
} else {
memcpy(ctx->buff + ctx->pos, in, inlen);
ctx->pos += inlen;
return;
}
}

block_index = inlen - BLAKE2S_BLOCK_LENGTH;
for (in_index = start; in_index < block_index; in_index += BLAKE2S_BLOCK_LENGTH) {
ctx->t0 += BLAKE2S_BLOCK_LENGTH;
if (ctx->t0 == 0) ctx->t1++;

cuda_blake2s_compress(ctx, in, in_index);
}

memcpy(ctx->buff, in + in_index, inlen - in_index);
ctx->pos += inlen - in_index;
}

__device__ void cuda_blake2s_final(cuda_blake2s_ctx_t* ctx, BYTE* out)
{
ctx->f0 = 0xFFFFFFFFUL;
ctx->t0 += ctx->pos;
if (ctx->pos > 0 && ctx->t0 == 0) ctx->t1++;

cuda_blake2s_compress(ctx, ctx->buff, 0);
memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH);
memset(ctx->state, 0, BLAKE2S_STATE_LENGTH);

int i4 = 0;
for (int i = 0; i < BLAKE2S_CHAIN_SIZE && ((i4 = i * 4) < ctx->digestlen); i++) {
BYTE* BYTEs = (BYTE*)(&ctx->chain[i]);
if (i4 < ctx->digestlen - 4)
memcpy(out + i4, BYTEs, 4);
else
memcpy(out + i4, BYTEs, ctx->digestlen - i4);
}
}

__global__ void
kernel_blake2s_hash(const BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch, WORD BLAKE2S_BLOCK_SIZE)
{
WORD thread = blockIdx.x * blockDim.x + threadIdx.x;
if (thread >= n_batch) { return; }
BYTE key[32] = ""; // Null key
WORD keylen = 0;
CUDA_BLAKE2S_CTX blake_ctx;
const BYTE* in = indata + thread * inlen;
BYTE* out = outdata + thread * BLAKE2S_BLOCK_SIZE;

cuda_blake2s_init(&blake_ctx, key, keylen, (BLAKE2S_BLOCK_SIZE << 3));
cuda_blake2s_update(&blake_ctx, in, inlen);
cuda_blake2s_final(&blake_ctx, out);
}

extern "C" {
void
mcm_cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD output_len, WORD n_batch)
{
BYTE* cuda_indata;
BYTE* cuda_outdata;
const WORD BLAKE2S_BLOCK_SIZE = output_len;
cudaMalloc(&cuda_indata, inlen * n_batch);
cudaMalloc(&cuda_outdata, BLAKE2S_BLOCK_SIZE * n_batch);
assert(keylen <= 32);

// CUDA_BLAKE2S_CTX ctx;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should be removed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

// cpu_blake2s_init(&ctx, key, keylen, n_outbit);
// cudaMemcpyToSymbol(c_CTX, &ctx, sizeof(CUDA_BLAKE2S_CTX), 0, cudaMemcpyHostToDevice);

cudaMemcpy(cuda_indata, in, inlen * n_batch, cudaMemcpyHostToDevice);

WORD thread = 256;
WORD block = (n_batch + thread - 1) / thread;
kernel_blake2s_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch, BLAKE2S_BLOCK_SIZE);
cudaMemcpy(out, cuda_outdata, BLAKE2S_BLOCK_SIZE * n_batch, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current implementation does not support async. All of our other primitives do. So maybe worth adding HashConfig as an input and changing all the functions to their async alternatives

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mcm_cuda_blake2s_hash_batch is not a part of the API, it's only for unit tests.
run_hash_many_kernel() is used in the same way as our other implementations

cudaError_t error = cudaGetLastError();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use our error-management functions (you can find an example in any of our primitives)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

if (error != cudaSuccess) { printf("Error cuda blake2s hash: %s \n", cudaGetErrorString(error)); }
cudaFree(cuda_indata);
cudaFree(cuda_outdata);
}
}

cudaError_t Blake2s::run_hash_many_kernel(
const BYTE* input,
BYTE* output,
WORD number_of_states,
WORD input_len,
WORD output_len,
const device_context::DeviceContext& ctx) const
{
const WORD BLAKE2S_BLOCK_SIZE = output_len;
WORD thread = 256;
WORD block = (number_of_states + thread - 1) / thread;

kernel_blake2s_hash<<<block, thread, 0, ctx.stream>>>(
input, input_len, output, number_of_states, BLAKE2S_BLOCK_SIZE);

CHK_IF_RETURN(cudaPeekAtLastError());
return CHK_LAST();
}

} // namespace blake2s
10 changes: 10 additions & 0 deletions icicle/src/hash/blake2s/expected_hashes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
652e530edee5893b576f72b875ea1c918e85e29d859e7e3fa78b623d8abca3de
57e36dae300302953c953b59a1b263cb314326db44e919ca4acd57e1da8f0543
9fbfc63ab34b8c35c58e9178c60e2bb165dde7340cb063e9567e4f6bef5eacbe
f26bbce62eaed81606f72000d95f0eea0eac23893f9db0c1a65af173c5095cb7
9a21619cac392bd9c80725b2161b033efc0dad3b57d8a9b4c5103ed1cd065a38
b2e524f32a0b8eeb72737da8ea0075c4e7e11936289954d571622f23e3df9076
d78d4fe12e21ce58226c2707b86b167e237b24f1f84a5e39b073460998c5359d
0d74da2a1062445822cbc8ec7bf424714e09923b4c1eba0ca2170504f56c4331
f5205d77b033111f1e15f585a86b7a4c292c0ec39addb3b2fcb0de4a0bf61003
410381eb72313f23f9f62478d62ec7635f4166ab5e53a20af5c9e8f7ee445de8
Loading
Loading