From daf42809c57d989bd099c843fe30612f0792c242 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Sun, 24 Nov 2024 13:02:13 -0500 Subject: [PATCH 1/4] Don't route expanded schedule/compress thru dispatch. --- include/bitcoin/system/hash/sha/algorithm.hpp | 42 +++++++++++-------- .../impl/hash/sha/algorithm_compress.ipp | 22 +++++----- .../impl/hash/sha/algorithm_iterate.ipp | 32 +++++++------- .../system/impl/hash/sha/algorithm_merkle.ipp | 19 +++++---- .../system/impl/hash/sha/algorithm_native.ipp | 15 +++++-- .../impl/hash/sha/algorithm_schedule.ipp | 3 +- 6 files changed, 75 insertions(+), 58 deletions(-) diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp index ceaa903b62..e3c0de2399 100644 --- a/include/bitcoin/system/hash/sha/algorithm.hpp +++ b/include/bitcoin/system/hash/sha/algorithm.hpp @@ -102,8 +102,6 @@ class algorithm /// Double hashing (sha256/512). /// ----------------------------------------------------------------------- - static constexpr void reinput(auto& buffer, const auto& state) NOEXCEPT; - template static constexpr digest_t double_hash(const ablocks_t& blocks) NOEXCEPT; static constexpr digest_t double_hash(const block_t& block) NOEXCEPT; @@ -126,7 +124,7 @@ class algorithm static constexpr digest_t finalize_double(state_t& state, size_t blocks) NOEXCEPT; protected: - /// Functions + /// Functions. /// ----------------------------------------------------------------------- using uint = unsigned int; @@ -144,9 +142,12 @@ class algorithm INLINE static constexpr auto Sigma0(auto x) NOEXCEPT; INLINE static constexpr auto Sigma1(auto x) NOEXCEPT; - /// Compression + /// Compression. 
/// ----------------------------------------------------------------------- + template + INLINE static constexpr auto extract(Word a) NOEXCEPT; + template static CONSTEVAL auto functor() NOEXCEPT; @@ -165,25 +166,25 @@ class algorithm template static constexpr void compress_(auto& state, const auto& buffer) NOEXCEPT; template - static constexpr void compress(auto& state, const auto& buffer) NOEXCEPT; + static constexpr void compress(state_t& state, const buffer_t& buffer) NOEXCEPT; - /// Message Scheduling + /// Message scheduling. /// ----------------------------------------------------------------------- template INLINE static constexpr void prepare(auto& buffer) NOEXCEPT; INLINE static constexpr void add_k(auto& buffer) NOEXCEPT; static constexpr void schedule_(auto& buffer) NOEXCEPT; - static constexpr void schedule(auto& buffer) NOEXCEPT; + static constexpr void schedule(buffer_t& buffer) NOEXCEPT; - /// Parsing (endian sensitive) + /// Parsing (endian sensitive). /// ----------------------------------------------------------------------- INLINE static constexpr void input(buffer_t& buffer, const block_t& block) NOEXCEPT; INLINE static constexpr void input_left(buffer_t& buffer, const half_t& half) NOEXCEPT; INLINE static constexpr void input_right(buffer_t& buffer, const half_t& half) NOEXCEPT; INLINE static constexpr digest_t output(const state_t& state) NOEXCEPT; - /// Padding + /// Padding. /// ----------------------------------------------------------------------- template static constexpr void schedule_n(buffer_t& buffer) NOEXCEPT; @@ -232,6 +233,11 @@ class algorithm using xchunk_t = std_array; using idigests_t = mutable_iterable; + /// Double hashing. + /// ----------------------------------------------------------------------- + + static constexpr void reinput(auto& buffer, const auto& state) NOEXCEPT; + /// Common. 
/// ----------------------------------------------------------------------- @@ -242,7 +248,7 @@ class algorithm INLINE static void xinput(xbuffer_t& xbuffer, iblocks_t& blocks) NOEXCEPT; - /// Merkle Hash. + /// Merkle hashing. /// ----------------------------------------------------------------------- template @@ -264,15 +270,12 @@ class algorithm INLINE static digest_t unpack(const xstate_t& xstate) NOEXCEPT; template - INLINE static void output(idigests_t& digests, + INLINE static void xoutput(idigests_t& digests, const xstate_t& xstate) NOEXCEPT; - /// Message Schedule (block vectorization). + /// Message scheduling. /// ----------------------------------------------------------------------- - template - INLINE static constexpr auto extract(Word a) NOEXCEPT; - template = true> INLINE static Word extract(xWord a) NOEXCEPT; @@ -320,10 +323,15 @@ class algorithm INLINE static void schedule_native(xbuffer_t& xbuffer) NOEXCEPT; INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT; - template + template INLINE static void compress_native(xstate_t& xstate, const xbuffer_t& xbuffer) NOEXCEPT; - template + + template + INLINE static void compress_native(state_t& state, + const xbuffer_t& xbuffer) NOEXCEPT; + + template INLINE static void compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT; diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp index c6feb98e07..9b063f7774 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp @@ -31,6 +31,16 @@ namespace sha { // protected // ---------------------------------------------------------------------------- +TEMPLATE +template +INLINE constexpr auto CLASS:: +extract(Word a) NOEXCEPT +{ + // Bypass lane extraction for non-expanded (normal form) buffer. 
+ static_assert(Lane == zero); + return a; +} + TEMPLATE template CONSTEVAL auto CLASS:: @@ -99,16 +109,6 @@ round(auto a, auto b, auto c, auto& d, auto e, auto f, auto g, auto& h, // efgh = vsha256h2q(efgh, abcd, value); } -TEMPLATE -template -INLINE constexpr auto CLASS:: -extract(Word a) NOEXCEPT -{ - // Bypass lane extraction for non-expanded (normal form) buffer. - static_assert(Lane == zero); - return a; -} - TEMPLATE template INLINE constexpr void CLASS:: @@ -276,7 +276,7 @@ summarize(auto& out, const auto& in) NOEXCEPT TEMPLATE template constexpr void CLASS:: -compress(auto& state, const auto& buffer) NOEXCEPT +compress(state_t& state, const buffer_t& buffer) NOEXCEPT { if (std::is_constant_evaluated()) { diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp index 25ba8a4ebc..fbefbdeacc 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp @@ -139,33 +139,33 @@ sequential_compress(state_t& state, const xbuffer_t& xbuffer) NOEXCEPT // Sequential compression uses non-expanded state (normal form). 
constexpr auto lanes = capacity; - compress<0>(state, xbuffer); - compress<1>(state, xbuffer); + compress_<0>(state, xbuffer); + compress_<1>(state, xbuffer); if constexpr (lanes >= 4) { - compress<2>(state, xbuffer); - compress<3>(state, xbuffer); + compress_<2>(state, xbuffer); + compress_<3>(state, xbuffer); } if constexpr (lanes >= 8) { - compress<4>(state, xbuffer); - compress<5>(state, xbuffer); - compress<6>(state, xbuffer); - compress<7>(state, xbuffer); + compress_<4>(state, xbuffer); + compress_<5>(state, xbuffer); + compress_<6>(state, xbuffer); + compress_<7>(state, xbuffer); } if constexpr (lanes >= 16) { - compress<8>(state, xbuffer); - compress<9>(state, xbuffer); - compress<10>(state, xbuffer); - compress<11>(state, xbuffer); - compress<12>(state, xbuffer); - compress<13>(state, xbuffer); - compress<14>(state, xbuffer); - compress<15>(state, xbuffer); + compress_<8>(state, xbuffer); + compress_<9>(state, xbuffer); + compress_<10>(state, xbuffer); + compress_<11>(state, xbuffer); + compress_<12>(state, xbuffer); + compress_<13>(state, xbuffer); + compress_<14>(state, xbuffer); + compress_<15>(state, xbuffer); } } diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp index f08f2367a2..7659c6ddd0 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp @@ -287,7 +287,7 @@ unpack(const xstate_t& xstate) NOEXCEPT TEMPLATE template INLINE void CLASS:: -output(idigests_t& digests, const xstate_t& xstate) NOEXCEPT +xoutput(idigests_t& digests, const xstate_t& xstate) NOEXCEPT { constexpr auto lanes = capacity; BC_ASSERT(digests.size() >= lanes); @@ -354,6 +354,7 @@ merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT { if (blocks.size() >= lanes) { + // TODO: expose const structs to avoid local static. 
static auto initial = pack(H::get); xbuffer_t xbuffer{}; @@ -362,22 +363,22 @@ merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT { auto xstate = initial; - // input() advances block iterator by lanes. + // xinput() advances block iterator by lanes. xinput(xbuffer, blocks); - schedule(xbuffer); - compress(xstate, xbuffer); + schedule_(xbuffer); + compress_(xstate, xbuffer); schedule_1(xbuffer); - compress(xstate, xbuffer); + compress_(xstate, xbuffer); // Second hash reinput(xbuffer, xstate); pad_half(xbuffer); - schedule(xbuffer); + schedule_(xbuffer); xstate = initial; - compress(xstate, xbuffer); + compress_(xstate, xbuffer); - // output() advances digest iterator by lanes. - output(digests, xstate); + // xoutput() advances digest iterator by lanes. + xoutput(digests, xstate); } while (blocks.size() >= lanes); } diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp index 83fdffa16d..6d0da5bf74 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp @@ -57,8 +57,17 @@ INLINE void CLASS:: compress_native(xstate_t& xstate, const xbuffer_t& xbuffer) NOEXCEPT { - // Merkle extended buffer is not native dispatched. - compress_(xstate, xbuffer); + // Merkle extended state/buffer is not native dispatched. + compress_(xstate, xbuffer); +} + +TEMPLATE +template +INLINE void CLASS:: +compress_native(state_t& state, const xbuffer_t& xbuffer) NOEXCEPT +{ + // Iterate extended buffer is not native dispatched. 
+ compress_(state, xbuffer); } TEMPLATE @@ -67,7 +76,7 @@ INLINE void CLASS:: compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT { // TODO: - compress_(state, buffer); + compress_(state, buffer); } } // namespace sha diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp index 63e368487d..f688b2ba5d 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp @@ -210,7 +210,7 @@ BC_POP_WARNING() TEMPLATE constexpr void CLASS:: -schedule(auto& buffer) NOEXCEPT +schedule(buffer_t& buffer) NOEXCEPT { if (std::is_constant_evaluated()) { @@ -223,7 +223,6 @@ schedule(auto& buffer) NOEXCEPT } else if constexpr (vector) { - // [Multi-block vectorized scheduling is implemented by iterate().] // Single block (without shani) message scheduling optimization. schedule_sigma(buffer); } From d9e3350f0387f06db41c61f512fb0f2d0a9228e9 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Sun, 24 Nov 2024 14:07:54 -0500 Subject: [PATCH 2/4] Style, comments. --- include/bitcoin/system/hash/sha/algorithm.hpp | 193 +++++++++--------- .../impl/hash/sha/algorithm_compress.ipp | 42 ++-- .../impl/hash/sha/algorithm_iterate.ipp | 4 +- .../system/impl/hash/sha/algorithm_merkle.ipp | 1 + .../system/impl/hash/sha/algorithm_native.ipp | 4 +- .../impl/hash/sha/algorithm_parsing.ipp | 4 + .../impl/hash/sha/algorithm_schedule.ipp | 6 +- 7 files changed, 128 insertions(+), 126 deletions(-) diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp index e3c0de2399..3903148da0 100644 --- a/include/bitcoin/system/hash/sha/algorithm.hpp +++ b/include/bitcoin/system/hash/sha/algorithm.hpp @@ -76,16 +76,15 @@ class algorithm using iblocks_t = iterable; using digests_t = std::vector; - /// Constants (and count_t). + /// Constants. 
/// ----------------------------------------------------------------------- + /// count_t is uint64_t (sha160/256) or uint128_t (sha512). /// All extended integer intrinsics currently have a "64 on 32" limit. - static constexpr auto count_bits = SHA::block_words * SHA::word_bytes; static constexpr auto count_bytes = bytes; using count_t = unsigned_exact_type>; - static constexpr auto caching = Cached; static constexpr auto limit_bits = maximum - count_bits; static constexpr auto limit_bytes = to_floored_bytes(limit_bits); static constexpr auto big_end_count = true; @@ -109,11 +108,6 @@ class algorithm static constexpr digest_t double_hash(const half_t& left, const half_t& right) NOEXCEPT; static digest_t double_hash(iblocks_t&& blocks) NOEXCEPT; - /// Merkle hashing (sha256/512). - /// ----------------------------------------------------------------------- - static VCONSTEXPR digests_t& merkle_hash(digests_t& digests) NOEXCEPT; - static VCONSTEXPR digest_t merkle_root(digests_t&& digests) NOEXCEPT; - /// Streamed hashing (explicitly finalized). /// ----------------------------------------------------------------------- static void accumulate(state_t& state, iblocks_t&& blocks) NOEXCEPT; @@ -123,10 +117,53 @@ class algorithm static constexpr digest_t finalize_second(const state_t& state) NOEXCEPT; static constexpr digest_t finalize_double(state_t& state, size_t blocks) NOEXCEPT; + /// Merkle hashing (sha256/512). + /// ----------------------------------------------------------------------- + static VCONSTEXPR digests_t& merkle_hash(digests_t& digests) NOEXCEPT; + static VCONSTEXPR digest_t merkle_root(digests_t&& digests) NOEXCEPT; + protected: - /// Functions. + /// Intrinsics constants. 
/// ----------------------------------------------------------------------- + + static constexpr auto use_shani = Native && system::with_shani; + static constexpr auto use_neon = Native && system::with_neon; + static constexpr auto use_x128 = Vector && system::with_sse41; + static constexpr auto use_x256 = Vector && system::with_avx2; + static constexpr auto use_x512 = Vector && system::with_avx512; + + template + static constexpr auto is_valid_lanes = + (Lanes == 16u || Lanes == 8u || Lanes == 4u || Lanes == 2u); + + static constexpr auto min_lanes = + (use_x128 ? bytes<128> : + (use_x256 ? bytes<256> : + (use_x512 ? bytes<512> : 0))) / SHA::word_bytes; + + /// Intrinsics types. + /// ----------------------------------------------------------------------- + + /// Extended integer capacity for uint32_t/uint64_t is 2/4/8/16 only. + template > = true> + using xblock_t = std_array; + + template = true> + using xbuffer_t = std_array; + + template = true> + using xstate_t = std_array; + + template = true> + using xchunk_t = std_array; + using uint = unsigned int; + using idigests_t = mutable_iterable; + using pad_t = std_array; + + /// Functions. + /// ----------------------------------------------------------------------- INLINE static constexpr auto parity(auto x, auto y, auto z) NOEXCEPT; INLINE static constexpr auto choice(auto x, auto y, auto z) NOEXCEPT; @@ -179,6 +216,7 @@ class algorithm /// Parsing (endian sensitive). /// ----------------------------------------------------------------------- + INLINE static constexpr void input(buffer_t& buffer, const block_t& block) NOEXCEPT; INLINE static constexpr void input_left(buffer_t& buffer, const half_t& half) NOEXCEPT; INLINE static constexpr void input_right(buffer_t& buffer, const half_t& half) NOEXCEPT; @@ -186,59 +224,25 @@ class algorithm /// Padding. 
/// ----------------------------------------------------------------------- - template - static constexpr void schedule_n(buffer_t& buffer) NOEXCEPT; - static constexpr void schedule_n(buffer_t& buffer, size_t blocks) NOEXCEPT; - static constexpr void schedule_1(buffer_t& buffer) NOEXCEPT; - static constexpr void pad_half(buffer_t& buffer) NOEXCEPT; - static constexpr void pad_n(buffer_t& buffer, count_t blocks) NOEXCEPT; - -/// Block iteration. -/// --------------------------------------------------------------------------- -protected: - template - INLINE static constexpr void iterate_(state_t& state, - const ablocks_t& blocks) NOEXCEPT; - INLINE static void iterate_(state_t& state, iblocks_t& blocks) NOEXCEPT; - - template - INLINE static constexpr void iterate(state_t& state, - const ablocks_t& blocks) NOEXCEPT; - INLINE static void iterate(state_t& state, iblocks_t& blocks) NOEXCEPT; - -private: - using pad_t = std_array; template static CONSTEVAL buffer_t scheduled_pad() NOEXCEPT; static CONSTEVAL chunk_t chunk_pad() NOEXCEPT; static CONSTEVAL pad_t stream_pad() NOEXCEPT; -/// Vectorization. -/// --------------------------------------------------------------------------- -protected: - /// Extended integer capacity for uint32_t/uint64_t is 2/4/8/16 only. 
- template - static constexpr auto is_valid_lanes = - (Lanes == 16u || Lanes == 8u || Lanes == 4u || Lanes == 2u); - - template > = true> - using xblock_t = std_array; - template = true> - using xbuffer_t = std_array; - template = true> - using xstate_t = std_array; - template = true> - using xchunk_t = std_array; - using idigests_t = mutable_iterable; + template + static constexpr void schedule_n(buffer_t& buffer) NOEXCEPT; + static constexpr void schedule_n(buffer_t& buffer, size_t blocks) NOEXCEPT; + static constexpr void schedule_1(buffer_t& buffer) NOEXCEPT; + static constexpr void pad_half(buffer_t& buffer) NOEXCEPT; + static constexpr void pad_n(buffer_t& buffer, count_t blocks) NOEXCEPT; /// Double hashing. /// ----------------------------------------------------------------------- static constexpr void reinput(auto& buffer, const auto& state) NOEXCEPT; - /// Common. + /// Iteration. /// ----------------------------------------------------------------------- template @@ -248,6 +252,34 @@ class algorithm INLINE static void xinput(xbuffer_t& xbuffer, iblocks_t& blocks) NOEXCEPT; + template = true> + INLINE static Word extract(xWord a) NOEXCEPT; + + template + INLINE static void sequential_compress(state_t& state, + const xbuffer_t& xbuffer) NOEXCEPT; + + template = true> + INLINE static void vector_schedule_sequential_compress(state_t& state, + iblocks_t& blocks) NOEXCEPT; + + template + INLINE static void iterate_vector(state_t& state, + const ablocks_t& blocks) NOEXCEPT; + INLINE static void iterate_vector(state_t& state, + iblocks_t& blocks) NOEXCEPT; + + template + INLINE static constexpr void iterate_(state_t& state, + const ablocks_t& blocks) NOEXCEPT; + INLINE static void iterate_(state_t& state, iblocks_t& blocks) NOEXCEPT; + + template + INLINE static constexpr void iterate(state_t& state, + const ablocks_t& blocks) NOEXCEPT; + INLINE static void iterate(state_t& state, iblocks_t& blocks) NOEXCEPT; + /// Merkle hashing. 
/// ----------------------------------------------------------------------- @@ -273,26 +305,11 @@ class algorithm INLINE static void xoutput(idigests_t& digests, const xstate_t& xstate) NOEXCEPT; - /// Message scheduling. - /// ----------------------------------------------------------------------- - - template = true> - INLINE static Word extract(xWord a) NOEXCEPT; - - template - INLINE static void sequential_compress(state_t& state, - const xbuffer_t& xbuffer) NOEXCEPT; - template = true> - INLINE static void vector_schedule_sequential_compress(state_t& state, - iblocks_t& blocks) NOEXCEPT; - - template - INLINE static void iterate_vector(state_t& state, - const ablocks_t& blocks) NOEXCEPT; - INLINE static void iterate_vector(state_t& state, - iblocks_t& blocks) NOEXCEPT; + INLINE static void merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT; + INLINE static void merkle_hash_vector(digests_t& digests) NOEXCEPT; + VCONSTEXPR static void merkle_hash_(digests_t& digests, + size_t offset=zero) NOEXCEPT; /// sigma0 vectorization. /// ----------------------------------------------------------------------- @@ -303,6 +320,7 @@ class algorithm template INLINE static void prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT; + template INLINE static void prepare8(buffer_t& buffer) NOEXCEPT; @@ -312,12 +330,11 @@ class algorithm /// Native. 
/// ----------------------------------------------------------------------- -protected: - using cword_t = xint128_t; - static constexpr auto cratio = sizeof(cword_t) / SHA::word_bytes; - static constexpr auto crounds = SHA::rounds / cratio; - using cbuffer_t = std_array; - using cstate_t = std_array; + ////using cword_t = xint128_t; + ////static constexpr auto cratio = sizeof(cword_t) / SHA::word_bytes; + ////static constexpr auto crounds = SHA::rounds / cratio; + ////using cbuffer_t = std_array; + ////using cstate_t = std_array; template INLINE static void schedule_native(xbuffer_t& xbuffer) NOEXCEPT; @@ -335,33 +352,13 @@ class algorithm INLINE static void compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT; - /// Merkle. - /// ----------------------------------------------------------------------- -protected: - VCONSTEXPR static void merkle_hash_(digests_t& digests, - size_t offset = zero) NOEXCEPT; - - template = true> - INLINE static void merkle_hash_vector(idigests_t& digests, - iblocks_t& blocks) NOEXCEPT; - - INLINE static void merkle_hash_vector(digests_t& digests) NOEXCEPT; - public: - static constexpr auto use_neon = Native && system::with_neon; - static constexpr auto use_shani = Native && system::with_shani; + /// Summary public values. + /// ----------------------------------------------------------------------- + static constexpr auto caching = Cached; static constexpr auto native = use_shani || use_neon; - - static constexpr auto use_x128 = Vector && system::with_sse41; - static constexpr auto use_x256 = Vector && system::with_avx2; - static constexpr auto use_x512 = Vector && system::with_avx512; static constexpr auto vector = (use_x128 || use_x256 || use_x512) && !(build_x32 && is_same_size); - - static constexpr auto min_lanes = - (use_x128 ? bytes<128> : - (use_x256 ? bytes<256> : - (use_x512 ? 
bytes<512> : 0))) / SHA::word_bytes; }; } // namespace sha diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp index 9b063f7774..4e0a194d98 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp @@ -149,8 +149,27 @@ round(auto& state, const auto& wk) NOEXCEPT } } +TEMPLATE +INLINE constexpr void CLASS:: +summarize(auto& out, const auto& in) NOEXCEPT +{ + constexpr auto s = SHA::word_bits; + out[0] = f::add(out[0], in[0]); + out[1] = f::add(out[1], in[1]); + out[2] = f::add(out[2], in[2]); + out[3] = f::add(out[3], in[3]); + out[4] = f::add(out[4], in[4]); + + if constexpr (SHA::strength != 160) + { + out[5] = f::add(out[5], in[5]); + out[6] = f::add(out[6], in[6]); + out[7] = f::add(out[7], in[7]); + } +} + // msvc++ not inlined in x32. -BC_PUSH_WARNING(NOT_INLINED) +////BC_PUSH_WARNING(NOT_INLINED) TEMPLATE template @@ -252,26 +271,7 @@ compress_(auto& state, const auto& buffer) NOEXCEPT summarize(state, start); } -BC_POP_WARNING() - -TEMPLATE -INLINE constexpr void CLASS:: -summarize(auto& out, const auto& in) NOEXCEPT -{ - constexpr auto s = SHA::word_bits; - out[0] = f::add(out[0], in[0]); - out[1] = f::add(out[1], in[1]); - out[2] = f::add(out[2], in[2]); - out[3] = f::add(out[3], in[3]); - out[4] = f::add(out[4], in[4]); - - if constexpr (SHA::strength != 160) - { - out[5] = f::add(out[5], in[5]); - out[6] = f::add(out[6], in[6]); - out[7] = f::add(out[7], in[7]); - } -} +////BC_POP_WARNING() TEMPLATE template diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp index fbefbdeacc..ea43bc15c7 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp @@ -122,8 +122,8 @@ xinput(xbuffer_t& xbuffer, iblocks_t& blocks) NOEXCEPT TEMPLATE template 
> - INLINE Word CLASS:: - extract(xWord a) NOEXCEPT +INLINE Word CLASS:: +extract(xWord a) NOEXCEPT { // Extract word from lane of vectorized buffer. return get(a); diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp index 7659c6ddd0..9e04c545e9 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp @@ -271,6 +271,7 @@ template INLINE typename CLASS::digest_t CLASS:: unpack(const xstate_t& xstate) NOEXCEPT { + // TODO: byteswap state in full one time before unpacking (vs. 8 times). return array_cast(state_t { get(byteswap(xstate[0])), diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp index 6d0da5bf74..c27995f8ab 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp @@ -47,7 +47,7 @@ TEMPLATE INLINE void CLASS:: schedule_native(buffer_t& buffer) NOEXCEPT { - // TODO: + // TODO: single block compression. schedule_(buffer); } @@ -75,7 +75,7 @@ template INLINE void CLASS:: compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT { - // TODO: + // TODO: single block compression. compress_(state, buffer); } diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp index a9ee56097a..b40f3d6290 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp @@ -58,6 +58,7 @@ input(buffer_t& buffer, const block_t& block) NOEXCEPT } else if constexpr (bc::is_little_endian) { + // TODO: evaluate 4/8/16 lane optimization using byteswap. 
const auto& in = array_cast(block); buffer[0] = native_from_big_end(in[0]); buffer[1] = native_from_big_end(in[1]); @@ -91,6 +92,7 @@ input_left(buffer_t& buffer, const half_t& half) NOEXCEPT if (std::is_constant_evaluated()) { + // TODO: evaluate 4/8 lane optimization using byteswap. constexpr auto size = SHA::word_bytes; from_big<0 * size>(buffer.at(0), half); from_big<1 * size>(buffer.at(1), half); @@ -139,6 +141,7 @@ input_right(buffer_t& buffer, const half_t& half) NOEXCEPT } else if constexpr (bc::is_little_endian) { + // TODO: evaluate 4/8 lane optimization using byteswap. const auto& in = array_cast(half); buffer[8] = native_from_big_end(in[0]); buffer[9] = native_from_big_end(in[1]); @@ -195,6 +198,7 @@ output(const state_t& state) NOEXCEPT } else { + // TODO: evaluate 4/8 lane optimization using byteswap. return array_cast(state_t { native_to_big_end(state[0]), diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp index f688b2ba5d..856a338c0d 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp @@ -125,7 +125,7 @@ add_k(auto& buffer) NOEXCEPT } // msvc++ not inlined in x32. -BC_PUSH_WARNING(NOT_INLINED) +////BC_PUSH_WARNING(NOT_INLINED) TEMPLATE constexpr void CLASS:: @@ -206,7 +206,7 @@ schedule_(auto& buffer) NOEXCEPT add_k(buffer); } -BC_POP_WARNING() +////BC_POP_WARNING() TEMPLATE constexpr void CLASS:: @@ -218,7 +218,7 @@ schedule(buffer_t& buffer) NOEXCEPT } else if constexpr (native) { - // Single block shani message scheduling optimization. + // Single block (with shani) message scheduling optimization. schedule_native(buffer); } else if constexpr (vector) From 6abe49584eb275c819324d437b9a72582931cb8d Mon Sep 17 00:00:00 2001 From: evoskuil Date: Sun, 24 Nov 2024 14:19:11 -0500 Subject: [PATCH 3/4] Remove dead comments. 
--- include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp | 5 ----- include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp | 5 ----- 2 files changed, 10 deletions(-) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp index 4e0a194d98..ada7bae8f4 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp @@ -168,9 +168,6 @@ summarize(auto& out, const auto& in) NOEXCEPT } } -// msvc++ not inlined in x32. -////BC_PUSH_WARNING(NOT_INLINED) - TEMPLATE template constexpr void CLASS:: @@ -271,8 +268,6 @@ compress_(auto& state, const auto& buffer) NOEXCEPT summarize(state, start); } -////BC_POP_WARNING() - TEMPLATE template constexpr void CLASS:: diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp index 856a338c0d..ad84f99d76 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp @@ -124,9 +124,6 @@ add_k(auto& buffer) NOEXCEPT buffer[r + 15] = f::addc(buffer[r + 15]); } -// msvc++ not inlined in x32. -////BC_PUSH_WARNING(NOT_INLINED) - TEMPLATE constexpr void CLASS:: schedule_(auto& buffer) NOEXCEPT @@ -206,8 +203,6 @@ schedule_(auto& buffer) NOEXCEPT add_k(buffer); } -////BC_POP_WARNING() - TEMPLATE constexpr void CLASS:: schedule(buffer_t& buffer) NOEXCEPT From 2a744816cc02c9268aecea5e0e6b6ac153b8a3f5 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Sun, 24 Nov 2024 14:28:55 -0500 Subject: [PATCH 4/4] Stub shani into hash iterator. 
--- .../system/impl/hash/sha/algorithm_iterate.ipp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp index ea43bc15c7..03154d3d9c 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp @@ -273,9 +273,15 @@ iterate(state_t& state, const ablocks_t& blocks) NOEXCEPT { iterate_(state, blocks); } + else if constexpr (native) + { + // Multiple block shani message scheduling and compression optimization. + iterate_(state, blocks); + } else if constexpr (vector) { - // Multi-block vectorized message scheduling optimization. + // TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling. + // Multiple block vectorized message scheduling optimization. iterate_vector(state, blocks); } else @@ -288,9 +294,15 @@ TEMPLATE INLINE void CLASS:: iterate(state_t& state, iblocks_t& blocks) NOEXCEPT { - if constexpr (vector) + if constexpr (native) + { + // TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling. + // Multiple block shani message scheduling and compression optimization. + iterate_(state, blocks); + } + else if constexpr (vector) { - // Multi-block vectorized message scheduling optimization. + // Multiple block vectorized message scheduling optimization. iterate_vector(state, blocks); } else