Skip to content

Commit

Permalink
Merge pull request #1565 from evoskuil/master
Browse files Browse the repository at this point in the history
Optimize sha intrinsics.
  • Loading branch information
evoskuil authored Dec 11, 2024
2 parents 9b16970 + 462822d commit 178513f
Show file tree
Hide file tree
Showing 16 changed files with 763 additions and 407 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ build
/configure
/libtool
.dirstamp
/.vs
15 changes: 6 additions & 9 deletions include/bitcoin/system/hash/sha/algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ class algorithm
INLINE static constexpr void input(buffer_t& buffer, const block_t& block) NOEXCEPT;
INLINE static constexpr void input_left(auto& buffer, const half_t& half) NOEXCEPT;
INLINE static constexpr void input_right(auto& buffer, const half_t& half) NOEXCEPT;
INLINE static constexpr void reinput_left(auto& buffer, const auto& left) NOEXCEPT;
INLINE static constexpr void reinput_right(auto& buffer, const auto& right) NOEXCEPT;
INLINE static constexpr digest_t output(const state_t& state) NOEXCEPT;

/// Padding.
Expand All @@ -257,12 +259,6 @@ class algorithm
static constexpr void pad_half(auto& buffer) NOEXCEPT;
static constexpr void pad_n(auto& buffer, count_t blocks) NOEXCEPT;

/// Double hashing.
/// -----------------------------------------------------------------------

static constexpr void reinput_left(auto& buffer, const auto& left) NOEXCEPT;
static constexpr void reinput_right(auto& buffer, const auto& right) NOEXCEPT;

/// Iteration (message scheduling vectorized for multiple blocks).
/// -----------------------------------------------------------------------

Expand Down Expand Up @@ -386,9 +382,12 @@ class algorithm
xint128_t message) NOEXCEPT;

template <bool Swap>
static void native_rounds(xint128_t& lo, xint128_t& hi,
INLINE static void native_rounds(xint128_t& lo, xint128_t& hi,
const block_t& block) NOEXCEPT;

INLINE static void native_rounds(xint128_t& lo, xint128_t& hi,
const half_t& left, const chunk_t& pad) NOEXCEPT;

template <bool Swap>
static void native_transform(state_t& state, const auto& block) NOEXCEPT;
static void native_transform(state_t& state, iblocks_t& blocks) NOEXCEPT;
Expand All @@ -409,8 +408,6 @@ class algorithm
static digest_t native_double_hash(const half_t& half) NOEXCEPT;
static digest_t native_double_hash(const half_t& left, const half_t& right) NOEXCEPT;



public:
/// Summary public values.
/// -----------------------------------------------------------------------
Expand Down
21 changes: 2 additions & 19 deletions include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -243,25 +243,8 @@ template <size_t Lane>
constexpr void CLASS::
compress(state_t& state, const buffer_t& buffer) NOEXCEPT
{
    // Compress the buffer into state for the given Lane.
    // Every evaluation context (constant-evaluated or runtime) uses the same
    // compress_<Lane> routine, so no dispatch is required and it is invoked
    // exactly once. There is no native or vector path here because
    // block-internal vectorization is suboptimal.
    compress_<Lane>(state, buffer);
}

} // namespace sha
Expand Down
53 changes: 0 additions & 53 deletions include/bitcoin/system/impl/hash/sha/algorithm_double.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -28,59 +28,6 @@ namespace libbitcoin {
namespace system {
namespace sha {

// protected
// ----------------------------------------------------------------------------

TEMPLATE
INLINE constexpr void CLASS::
reinput_left(auto& buffer, const auto& left) NOEXCEPT
{
    // Copy 'left' into the leading words of 'buffer'.
    using words = decltype(buffer);
    static_assert(array_count<words> >= SHA::state_words);

    // Runtime: assign through an array cast over the front of the buffer.
    if (!std::is_constant_evaluated())
    {
        using word = array_element<words>;
        array_cast<word, SHA::state_words>(buffer) = left;
        return;
    }

    // Constant evaluation: element-wise copy of words [0, 8), in order.
    for (size_t index = 0; index < 8; ++index)
        buffer.at(index) = left.at(index);
}

TEMPLATE
INLINE constexpr void CLASS::
reinput_right(auto& buffer, const auto& right) NOEXCEPT
{
    // Copy 'right' into 'buffer' immediately after the leading state words.
    using words = decltype(buffer);
    static_assert(array_count<words> >= SHA::state_words);

    // Runtime: assign through an array cast offset by SHA::state_words.
    if (!std::is_constant_evaluated())
    {
        using word = array_element<words>;
        array_cast<word, SHA::state_words, SHA::state_words>(buffer) = right;
        return;
    }

    // Constant evaluation: element-wise copy into words [8, 16), in order.
    for (size_t index = 0; index < 8; ++index)
        buffer.at(index + 8) = right.at(index);
}

// public
// ----------------------------------------------------------------------------
// These benefit from avoiding state endian transition and reusing buffer.
Expand Down
13 changes: 4 additions & 9 deletions include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -401,18 +401,13 @@ merkle_hash_vector(digests_t& digests) NOEXCEPT
auto idigests = idigests_t{ to_half(size), data };
const auto start = iblocks.size();

// Merkle hash vector dispatch.
// Always use if available.
if constexpr (use_x512)
merkle_hash_vector<xint512_t>(idigests, iblocks);

// Use if shani is not available or at least 32 blocks.
if constexpr (use_x256)
{
if constexpr (!native)
merkle_hash_vector<xint256_t>(idigests, iblocks);
else if (start >= 32_size)
merkle_hash_vector<xint256_t>(idigests, iblocks);
}
// Only use if shani is not available.
if constexpr (use_x256 && !native)
merkle_hash_vector<xint256_t>(idigests, iblocks);

// Only use if shani is not available.
if constexpr (use_x128 && !native)
Expand Down
9 changes: 8 additions & 1 deletion include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ round_4(xint128_t& state0, xint128_t& state1, xint128_t message) NOEXCEPT

TEMPLATE
template <bool Swap>
void CLASS::
INLINE void CLASS::
native_rounds(xint128_t& lo, xint128_t& hi, const block_t& block) NOEXCEPT
{
const auto& wblock = array_cast<xint128_t>(block);
Expand Down Expand Up @@ -186,11 +186,13 @@ TEMPLATE
void CLASS::
native_transform(state_t& state, iblocks_t& blocks) NOEXCEPT
{
// Individual state vars are used vs. array to ensure register persistence.
auto& wstate = array_cast<xint128_t>(state);
auto lo = load(wstate[0]);
auto hi = load(wstate[1]);
shuffle(lo, hi);

// native_rounds must be inlined here (register boundary).
for (auto& block: blocks)
native_rounds<true>(lo, hi, block);

Expand All @@ -208,7 +210,10 @@ native_transform(state_t& state, const auto& block) NOEXCEPT
auto lo = load(wstate[0]);
auto hi = load(wstate[1]);
shuffle(lo, hi);

// native_rounds must be inlined here (register boundary).
native_rounds<Swap>(lo, hi, array_cast<byte_t>(block));

unshuffle(lo, hi);
store(wstate[0], lo);
store(wstate[1], hi);
Expand All @@ -228,6 +233,8 @@ native_finalize(state_t& state, const words_t& pad) NOEXCEPT
auto lo = load(wstate[0]);
auto hi = load(wstate[1]);
shuffle(lo, hi);

// native_rounds must be inlined here (register boundary).
native_rounds<false>(lo, hi, array_cast<byte_t>(pad));
unshuffle(lo, hi);

Expand Down
Loading

0 comments on commit 178513f

Please sign in to comment.