Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated blake3 instructions and added neon #356

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions Bladebit.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
add_library(bladebit_core)
target_link_libraries(bladebit_core PUBLIC bladebit_config)

string(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} SYSTEM_PROCESSOR_LC)
if(SYSTEM_PROCESSOR_LC MATCHES "arm" OR SYSTEM_PROCESSOR_LC MATCHES "aarch64" OR SYSTEM_PROCESSOR_LC MATCHES "arm64")
set(is_arm ON)
else()
set(is_arm OFF)
endif()
if(${is_arm})
add_compile_options(-mfpu=neon)
endif()

target_include_directories(bladebit_core PUBLIC
${INCLUDE_DIRECTORIES}
${CMAKE_CURRENT_SOURCE_DIR}/src
Expand All @@ -13,7 +23,7 @@ target_compile_definitions(bladebit_core PUBLIC

target_compile_options(bladebit_core PUBLIC ${preinclude_pch})

target_link_libraries(bladebit_core PUBLIC
target_link_libraries(bladebit_core PUBLIC
Threads::Threads
bls

Expand Down Expand Up @@ -64,20 +74,28 @@ set(src_blake3
src/b3/blake3.h
src/b3/blake3_impl.h
src/b3/blake3_portable.c

$<${is_x86}:

$<$<BOOL:${is_x86}>:
$<$<PLATFORM_ID:Windows>:
src/b3/blake3_sse41.c
src/b3/blake3_avx2.c
src/b3/blake3_avx512.c
$<$<CXX_COMPILER_ID:MSVC>:
src/b3/blake3_avx2_x86-64_windows_msvc.asm
src/b3/blake3_avx512_x86-64_windows_msvc.asm
src/b3/blake3_sse41_x86-64_windows_msvc.asm
>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:
src/b3/blake3_avx2_x86-64_windows_gnu.S
src/b3/blake3_avx512_x86-64_windows_gnu.S
src/b3/blake3_sse41_x86-64_windows_gnu.S
>
>
$<$<NOT:$<PLATFORM_ID:Windows>>:
src/b3/blake3_avx2_x86-64_unix.S
src/b3/blake3_avx512_x86-64_unix.S
src/b3/blake3_sse41_x86-64_unix.S
>
>
$<$<BOOL:${is_arm}>:
src/b3/blake3_neon.c
>
)

set(src_bech32
Expand Down
43 changes: 31 additions & 12 deletions Harvester.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,20 @@ else()
add_library(bladebit_harvester STATIC)
endif()

string(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} SYSTEM_PROCESSOR_LC)
if(SYSTEM_PROCESSOR_LC MATCHES "arm" OR SYSTEM_PROCESSOR_LC MATCHES "aarch64" OR SYSTEM_PROCESSOR_LC MATCHES "arm64")
set(is_arm ON)
else()
set(is_arm OFF)
endif()
if(${is_arm})
add_compile_options(-mfpu=neon)
endif()


set_property(TARGET bladebit_harvester PROPERTY PUBLIC_HEADER
src/harvesting/GreenReaper.h

set_property(TARGET bladebit_harvester PROPERTY PUBLIC_HEADER
src/harvesting/GreenReaper.h
src/harvesting/GreenReaperPortable.h)

install(TARGETS bladebit_harvester
Expand Down Expand Up @@ -41,20 +52,28 @@ target_sources(bladebit_harvester PRIVATE
src/b3/blake3.h
src/b3/blake3_impl.h
src/b3/blake3_portable.c

$<${is_x86}:
$<$<BOOL:${is_x86}>:
$<$<PLATFORM_ID:Windows>:
src/b3/blake3_sse41.c
src/b3/blake3_avx2.c
src/b3/blake3_avx512.c
$<$<CXX_COMPILER_ID:MSVC>:
src/b3/blake3_avx2_x86-64_windows_msvc.asm
src/b3/blake3_avx512_x86-64_windows_msvc.asm
src/b3/blake3_sse41_x86-64_windows_msvc.asm
>
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:
src/b3/blake3_avx2_x86-64_windows_gnu.S
src/b3/blake3_avx512_x86-64_windows_gnu.S
src/b3/blake3_sse41_x86-64_windows_gnu.S
>
>
$<$<NOT:$<PLATFORM_ID:Windows>>:
src/b3/blake3_avx2_x86-64_unix.S
src/b3/blake3_avx512_x86-64_unix.S
src/b3/blake3_sse41_x86-64_unix.S
>
>

$<$<BOOL:${is_arm}>:
src/b3/blake3_neon.c
>

src/util/Log.cpp
src/util/Util.cpp
Expand Down Expand Up @@ -135,7 +154,7 @@ target_compile_definitions(bladebit_harvester
)


target_compile_options(bladebit_harvester PRIVATE
target_compile_options(bladebit_harvester PRIVATE
${preinclude_pch}
# $<${have_cuda}:${cuda_archs}>
)
Expand All @@ -147,13 +166,13 @@ endif()
target_link_libraries(bladebit_harvester
PRIVATE
bladebit_config
PUBLIC
PUBLIC
Threads::Threads
$<${have_cuda}:CUDA::cudart_static>
)

if(CUDAToolkit_FOUND)
set_target_properties(bladebit_harvester PROPERTIES
set_target_properties(bladebit_harvester PROPERTIES
EXCLUDE_FROM_ALL ON
MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>
CUDA_RUNTIME_LIBRARY Static
Expand All @@ -165,7 +184,7 @@ endif()

# Disable blake3 conversion loss of data warnings
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC")
set_source_files_properties(
set_source_files_properties(
src/b3/blake3_avx2.c
src/b3/blake3_avx512.c
src/b3/blake3_sse41.c
Expand Down
39 changes: 25 additions & 14 deletions src/b3/blake3.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include "blake3.h"
#include "blake3_impl.h"

const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }

INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
uint8_t flags) {
memcpy(self->cv, key, BLAKE3_KEY_LEN);
Expand Down Expand Up @@ -81,7 +83,7 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
memcpy(cv_words, self->input_cv, 32);
blake3_compress_in_place(cv_words, self->block, self->block_len,
self->counter, self->flags);
memcpy(cv, cv_words, 32);
store_cv_words(cv, cv_words);
}

INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
Expand Down Expand Up @@ -231,12 +233,6 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
0, // Parents have no end flags.
out);


#pragma GCC diagnostic push
#if !defined( __clang__ )
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

// If there's an odd child left over, it becomes an output.
if (num_chaining_values > 2 * parents_array_len) {
memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
Expand All @@ -246,20 +242,19 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
} else {
return parents_array_len;
}
#pragma GCC diagnostic pop
}

// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree, is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement exendable ouput.) Note that this function is
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
Expand Down Expand Up @@ -342,15 +337,21 @@ INLINE void compress_subtree_to_parent_node(
assert(input_len > BLAKE3_CHUNK_LEN);
#endif

uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
chunk_counter, flags, cv_array);
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);

// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
while (num_cvs > 2) {
// The second half of this loop condition is always true, and we just
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
// this code, test it against that version.
while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
num_cvs =
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
Expand All @@ -374,17 +375,22 @@ void blake3_hasher_init_keyed(blake3_hasher *self,
hasher_init_base(self, key_words, KEYED_HASH);
}

void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len) {
blake3_hasher context_hasher;
hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
blake3_hasher_update(&context_hasher, context, strlen(context));
blake3_hasher_update(&context_hasher, context, context_len);
uint8_t context_key[BLAKE3_KEY_LEN];
blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
uint32_t context_key_words[8];
load_key_words(context_key, context_key_words);
hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
}

void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
blake3_hasher_init_derive_key_raw(self, context, strlen(context));
}

// As described in hasher_push_cv() below, we do "lazy merging", delaying
// merges until right before the next CV is about to be added. This is
// different from the reference implementation. Another difference is that we
Expand Down Expand Up @@ -603,3 +609,8 @@ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
}
output_root_bytes(&output, seek, out, out_len);
}

void blake3_hasher_reset(blake3_hasher *self) {
chunk_state_reset(&self->chunk, self->key, 0);
self->cv_stack_len = 0;
}
48 changes: 37 additions & 11 deletions src/b3/blake3.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,38 @@
#include <stddef.h>
#include <stdint.h>

#if !defined(BLAKE3_API)
# if defined(_WIN32) || defined(__CYGWIN__)
# if defined(BLAKE3_DLL)
# if defined(BLAKE3_DLL_EXPORTS)
# define BLAKE3_API __declspec(dllexport)
# else
# define BLAKE3_API __declspec(dllimport)
# endif
# define BLAKE3_PRIVATE
# else
# define BLAKE3_API
# define BLAKE3_PRIVATE
# endif
# elif __GNUC__ >= 4
# define BLAKE3_API __attribute__((visibility("default")))
# define BLAKE3_PRIVATE __attribute__((visibility("hidden")))
# else
# define BLAKE3_API
# define BLAKE3_PRIVATE
# endif
#endif

#ifdef __cplusplus
extern "C" {
#endif

#define BLAKE3_VERSION_STRING "1.4.1"
#define BLAKE3_KEY_LEN 32
#define BLAKE3_OUT_LEN 32
#define BLAKE3_BLOCK_LEN 64
#define BLAKE3_CHUNK_LEN 1024
#define BLAKE3_MAX_DEPTH 54
#define BLAKE3_MAX_SIMD_DEGREE 16

// This struct is a private implementation detail. It has to be here because
// it's part of blake3_hasher below.
Expand All @@ -38,16 +60,20 @@ typedef struct {
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
} blake3_hasher;

void blake3_hasher_init(blake3_hasher *self);
void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len);
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len);
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len);
BLAKE3_API const char *blake3_version(void);
BLAKE3_API void blake3_hasher_init(blake3_hasher *self);
BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len);
BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len);
BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len);
BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len);
BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);

#ifdef __cplusplus
}
Expand Down
9 changes: 5 additions & 4 deletions src/b3/blake3_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

// Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
// Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
// 11/33.
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
Expand Down Expand Up @@ -208,7 +208,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
for (size_t i = 0; i < 8; ++i) {
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
}
transpose_vecs(&out[0]);
transpose_vecs(&out[8]);
Expand All @@ -219,14 +219,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
const __m256i add1 = _mm256_and_si256(mask, add0);
__m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1);
__m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
__m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
_mm256_xor_si256( l, _mm256_set1_epi32(0x80000000)));
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry);
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
*out_lo = l;
*out_hi = h;
}

static
void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
Expand Down
Loading