diff --git a/number_theory/bitmatrix.hpp b/number_theory/bitmatrix.hpp new file mode 100644 index 0000000..4df1c0c --- /dev/null +++ b/number_theory/bitmatrix.hpp @@ -0,0 +1,340 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "config_macros.hpp" + +// clang-format off + +/// @brief Transposes 8x8 matrix @a src and puts it in into @a dst +/// @a src may be equal to @a dst (inplace transposition) +/// @details See Hackers Delight for more info. +/// @tparam AgainstMinorDiagonal bool flag (see note below) +/// @param src source 8x8 matrix +/// @param dst destination 8x8 matrix +/// +/// @note explanation of @a AgainstMinorDiagonal +/// +/// Suppose we are given 8x8 matrix M = +/// { +/// 0b00001111, +/// 0b00000000, +/// 0b00001111, +/// 0b00000000, +/// 0b00001111, +/// 0b00000000, +/// 0b00001111, +/// 0b00000000, +/// } +/// +/// If AgainstMinorDiagonal = false, M will become +/// { +/// 0b00000000, +/// 0b00000000, +/// 0b00000000, +/// 0b00000000, +/// 0b10101010, +/// 0b10101010, +/// 0b10101010, +/// 0b10101010, +/// } +/// +/// Otherwise, M will become +/// { +/// 0b01010101, +/// 0b01010101, +/// 0b01010101, +/// 0b01010101, +/// 0b00000000, +/// 0b00000000, +/// 0b00000000, +/// 0b00000000, +/// } +template +ATTRIBUTE_ACCESS(read_only, 1) +ATTRIBUTE_ACCESS(write_only, 2) +ATTRIBUTE_NONNULL(1, 2) +constexpr void transpose8(const uint8_t src[8], uint8_t dst[8]) noexcept { + uint64_t x = 0; + + /** + * To unroll loops is important here (one can check it via godbolt) + * + * if AgainstMinorDiagonal == false, all 8 lines above can be done using 1 + * instruction: mov %register_for_x, QWORD PTR[%register_with_src] + * + * movq %register_for_x %register_with_src + * + * Otherwise, if AgainstMinorDiagonal == true, it can be done using 2 instructions: + * mov %register_for_x, QWORD PTR[%register_with_src] + * bswap %register_for_x + * + * Or using 1 instruction if target has `movbe`: + * movbe %register_for_x, QWORD PTR[%register_with_src] + */ + if constexpr (!AgainstMinorDiagonal) { + x = src[7]; + x = (x << 8) | src[6]; + x = (x << 8) | src[5]; + x = (x << 8) | src[4]; + x = (x << 8) | src[3]; + x = (x << 8) | src[2]; + x = (x << 8) | src[1]; + x = (x << 8) | src[0]; + } else { + x = src[0]; + x = (x << 8) | src[1]; + x = (x << 8) | src[2]; + x = (x << 8) | src[3]; + x = (x << 8) | src[4]; + x = (x << 8) | src[5]; + x = (x << 8) | src[6]; + x = (x << 8) | src[7]; + } + + x = (x & 0xAA55AA55AA55AA55ULL) | ((x & 0x00AA00AA00AA00AAULL) << 7) | + ((x >> 7) & 0x00AA00AA00AA00AAULL); + x = (x & 0xCCCC3333CCCC3333ULL) | ((x & 0x0000CCCC0000CCCCULL) << 14) | + ((x >> 14) & 0x0000CCCC0000CCCCULL); + x = (x & 0xF0F0F0F00F0F0F0FULL) | ((x & 0x00000000F0F0F0F0ULL) << 28) | + ((x >> 28) & 0x00000000F0F0F0F0ULL); + + if constexpr (!AgainstMinorDiagonal) { + dst[0] = uint8_t(x); + x >>= 8; + dst[1] = uint8_t(x); + x >>= 8; + dst[2] = uint8_t(x); + x >>= 8; + dst[3] = uint8_t(x); + x >>= 8; + dst[4] = uint8_t(x); + x >>= 8; + dst[5] = uint8_t(x); + x >>= 8; + dst[6] = uint8_t(x); + x >>= 8; + dst[7] = uint8_t(x); + } else { + dst[7] = uint8_t(x); + x >>= 8; + dst[6] = uint8_t(x); + x >>= 8; + dst[5] = uint8_t(x); + x >>= 8; + dst[4] = uint8_t(x); + x >>= 8; + dst[3] = uint8_t(x); + x >>= 8; + dst[2] = uint8_t(x); + x >>= 8; + dst[1] = uint8_t(x); + x >>= 8; + dst[0] = uint8_t(x); + } +} + +/// @brief Transposes 8x8 matrix @a src inplace +/// @tparam AgainstMinorDiagonal +/// @param src source 8x8 matrix +/// @note see @c transpose8(const uint8_t[8], uint8_t[8]) for the explanation of @a AgainstMinorDiagonal +template +ATTRIBUTE_ACCESS(read_write, 1) +ATTRIBUTE_NONNULL(1) +constexpr void transpose8(uint8_t src[8]) noexcept { + transpose8(src, src); +} + +/// @brief Transposes 32x32 matrix @a src inplace. +/// @details See Hackers Delight for more info +/// @tparam AgainstMinorDiagonal +/// @param src source 32x32 matrix +/// @note see @c transpose8(const uint8_t[8], uint8_t[8]) for the explanation of @a AgainstMinorDiagonal +template +ATTRIBUTE_ACCESS(read_write, 1) +ATTRIBUTE_NONNULL(1) +constexpr void transpose32(uint32_t src[32]) noexcept { + uint32_t m = 0x0000FFFFU; + /** + * mask m values are { + * 0x0000FFFF for j = 16 + * 0x00FF00FF for j = 8 + * 0x0F0F0F0F for j = 4 + * 0x33333333 for j = 2 + * 0x55555555 for j = 1 + * } + */ + for (std::size_t j = 16; j != 0; j >>= 1, m ^= (m << j)) { + for (std::size_t k = 0; k < 32; k = (k + j + 1) & ~j) { + if constexpr (!AgainstMinorDiagonal) { + uint32_t t = (src[k + j] ^ (src[k] >> j)) & m; + src[k + j] ^= t; + src[k] ^= (t << j); + } else { + uint32_t t = (src[k] ^ (src[k + j] >> j)) & m; + src[k] ^= t; + src[k + j] ^= (t << j); + } + } + } +} + +/// @brief Transposes 32x32 matrix @a src and puts it in into @a dst . +/// @a src and @a dst may overlap. +/// @tparam AgainstMinorDiagonal +/// @param src +/// @param dst +/// @note see @c transpose8(const uint8_t[8], uint8_t[8]) for the explanation of @a AgainstMinorDiagonal +template +ATTRIBUTE_ACCESS(read_only, 1) +ATTRIBUTE_NONNULL(1, 2) +constexpr void transpose32(const uint32_t src[32], uint32_t dst[32]) noexcept { + if (dst < src) { + std::copy(&src[0], &src[32], &dst[0]); + } else if (dst > src) { + std::copy_backward(&src[0], &src[32], &dst[32]); + } + transpose32(dst); +} + +/// @brief Transposes 64x64 matrix in `src` inplace. +/// @details See Hackers Delight for more info. +/// @tparam AgainstMinorDiagonal +/// @param src source 64x64 matrix +template +ATTRIBUTE_ACCESS(read_write, 1) +ATTRIBUTE_NONNULL(1) +constexpr void transpose64(uint64_t src[64]) noexcept { + uint64_t m = 0x00000000FFFFFFFFULL; + /** + * mask m values are { + * 0x00000000FFFFFFFF for j = 32 + * 0x0000FFFF0000FFFF for j = 16 + * 0x00FF00FF00FF00FF for j = 8 + * 0x0F0F0F0F0F0F0F0F for j = 4 + * 0x3333333333333333 for j = 2 + * 0x5555555555555555 for j = 1 + * } + */ + for (std::size_t j = 32; j != 0; j >>= 1, m ^= (m << j)) { + for (std::size_t k = 0; k < 64; k = (k + j + 1) & ~j) { + if constexpr (!AgainstMinorDiagonal) { + uint64_t t = (src[k + j] ^ (src[k] >> j)) & m; + src[k + j] ^= t; + src[k] ^= (t << j); + } else { + uint64_t t = (src[k] ^ (src[k + j] >> j)) & m; + src[k] ^= t; + src[k + j] ^= (t << j); + } + } + } +} + +// clang-format on + +#include + +inline void transpose_4096(std::bitset<4096> (&m)[4096]) noexcept { + assert(std::bit_cast(std::addressof(m[0])) % 8 == 0); + assert(std::bit_cast(std::addressof(m[4095])) % 8 == 0); + uint64_t tmp1[64]{}; + uint64_t tmp2[64]{}; + for (std::size_t i = 0; i < 64; i++) { + for (std::size_t j = i; j < 64; j++) { + for (std::size_t k = 0; k < 64; k++) { + tmp1[k] = *(std::bit_cast(std::addressof(m[i * 64 + k])) + j); + tmp2[k] = *(std::bit_cast(std::addressof(m[j * 64 + k])) + i); + } + transpose64(tmp1); + transpose64(tmp2); + for (std::size_t k = 0; k < 64; k++) { + *(std::bit_cast(std::addressof(m[i * 64 + k])) + j) = tmp2[k]; + *(std::bit_cast(std::addressof(m[j * 64 + k])) + i) = tmp1[k]; + } + } + } +} + +template +#if defined(__cpp_lib_constexpr_bitset) && __cpp_lib_constexpr_bitset >= 202207L +constexpr +#endif + inline auto + multiply_f2(const std::bitset (&lhs)[N], const std::bitset (&rhs)[N]) noexcept { + std::array, N> result{}; + for (std::size_t i = 0; i < N; i++) { + for (std::size_t j = 0; j < N; j++) { + if (lhs[i][j]) { + result[i] ^= rhs[j]; + } + } + } + return result; +} + +#if defined(__cpp_lib_constexpr_bitset) && __cpp_lib_constexpr_bitset >= 202207L +#define CONSTEXPR_BITSET_CXX_23 constexpr +#else +#define CONSTEXPR_BITSET_CXX_23 +#endif + +template +class square_bitmatrix { +private: + static constexpr std::size_t kAlignmentBits = 1 << 6; + static constexpr std::size_t NBits = (N + kAlignmentBits - 1) & ~kAlignmentBits; + +public: + using container_type = typename std::array, NBits>; + using value_type = typename container_type::value_type; + using pointer = typename container_type::pointer; + using const_pointer = typename container_type::const_pointer; + using reference = typename container_type::reference; + using const_reference = typename container_type::const_reference; + using iterator = typename container_type::iterator; + using const_iterator = typename container_type::const_iterator; + using size_type = typename container_type::size_type; + using difference_type = typename container_type::difference_type; + using reverse_iterator = typename container_type::reverse_iterator; + using const_reverse_iterator = typename container_type::const_reverse_iterator; + using row_type = value_type; + + constexpr reference operator[](size_type index) noexcept { + return data_[index]; + } + constexpr const_reference operator[](size_type index) const noexcept { + return data_[index]; + } + CONSTEXPR_BITSET_CXX_23 auto operator[](std::pair indexes) noexcept { + return data_[indexes.first][indexes.second]; + } + constexpr bool operator[](std::pair indexes) const noexcept { + return data_[indexes.first][indexes.second]; + } + + CONSTEXPR_BITSET_CXX_23 square_bitmatrix& operator*=(const square_bitmatrix& other) noexcept { + for (auto& row : data_) { + row_type row_mult(row); + for (std::size_t j = 0; j < N; j++) { + if (row[j]) { + row_mult ^= other.data_[j]; + } + } + row = row_mult; + } + return *this; + } + ATTRIBUTE_PURE CONSTEXPR_BITSET_CXX_23 square_bitmatrix operator*(const square_bitmatrix& other) const noexcept { + square_bitmatrix copy(*this); + copy *= other; + return copy; + } + +private: + container_type data_{}; +}; diff --git a/number_theory/bitmatrix.cpp b/number_theory/test_bitmatrix.cpp similarity index 74% rename from number_theory/bitmatrix.cpp rename to number_theory/test_bitmatrix.cpp index 0ea1292..b8205e7 100644 --- a/number_theory/bitmatrix.cpp +++ b/number_theory/test_bitmatrix.cpp @@ -1,187 +1,10 @@ -#include -#include -#include -#include -#include -#include - -#include "config_macros.hpp" - -/// @brief Transposes 8x8 matrix in `src` and puts it in into `dst`. `src` may -/// be equal to `dst` (inplace transposition) -/// @details See Hackers Delight for more info. -/// @tparam AgainstMinorDiagonal -/// @param src source 8x8 matrix -/// @param dst destination 8x8 matrix -template -constexpr void transpose8(const uint8_t src[8], uint8_t dst[8]) noexcept { - uint64_t x = 0; - - /** - * To unroll loops is important here (one can check it via godbolt) - * - * if AgainstMinorDiagonal == false, all 8 lines above can be done using 1 - * instruction: mov %register_for_x, QWORD PTR[%register_with_src] - * - * movq %register_for_x %register_with_src - * - * Otherwise, if AgainstMinorDiagonal == true, it can be done using 2 instructions: - * mov %register_for_x, QWORD PTR[%register_with_src] - * bswap %register_for_x - * - * Or using 1 instruction if target has `movbe`: - * movbe %register_for_x, QWORD PTR[%register_with_src] - */ - if constexpr (!AgainstMinorDiagonal) { - x = src[7]; - x = (x << 8) | src[6]; - x = (x << 8) | src[5]; - x = (x << 8) | src[4]; - x = (x << 8) | src[3]; - x = (x << 8) | src[2]; - x = (x << 8) | src[1]; - x = (x << 8) | src[0]; - } else { - x = src[0]; - x = (x << 8) | src[1]; - x = (x << 8) | src[2]; - x = (x << 8) | src[3]; - x = (x << 8) | src[4]; - x = (x << 8) | src[5]; - x = (x << 8) | src[6]; - x = (x << 8) | src[7]; - } - - x = (x & 0xAA55AA55AA55AA55ULL) | ((x & 0x00AA00AA00AA00AAULL) << 7) | - ((x >> 7) & 0x00AA00AA00AA00AAULL); - x = (x & 0xCCCC3333CCCC3333ULL) | ((x & 0x0000CCCC0000CCCCULL) << 14) | - ((x >> 14) & 0x0000CCCC0000CCCCULL); - x = (x & 0xF0F0F0F00F0F0F0FULL) | ((x & 0x00000000F0F0F0F0ULL) << 28) | - ((x >> 28) & 0x00000000F0F0F0F0ULL); - - if constexpr (!AgainstMinorDiagonal) { - dst[0] = uint8_t(x); - x >>= 8; - dst[1] = uint8_t(x); - x >>= 8; - dst[2] = uint8_t(x); - x >>= 8; - dst[3] = uint8_t(x); - x >>= 8; - dst[4] = uint8_t(x); - x >>= 8; - dst[5] = uint8_t(x); - x >>= 8; - dst[6] = uint8_t(x); - x >>= 8; - dst[7] = uint8_t(x); - } else { - dst[7] = uint8_t(x); - x >>= 8; - dst[6] = uint8_t(x); - x >>= 8; - dst[5] = uint8_t(x); - x >>= 8; - dst[4] = uint8_t(x); - x >>= 8; - dst[3] = uint8_t(x); - x >>= 8; - dst[2] = uint8_t(x); - x >>= 8; - dst[1] = uint8_t(x); - x >>= 8; - dst[0] = uint8_t(x); - } -} - -/// @brief Transposes 32x32 matrix in `src` inplace. -/// If AgainstMinorDiagonal = false, the function is equivalent to: -/// for (std::size_t i = 0; i < 32; i++) { -/// for (std::size_t j = i + 1; j < 32; j++) { -/// auto aij = (src[i] & (1u << j)) >> j; -/// auto aji = (src[j] & (1u << i)) >> i; -/// src[i] &= ~(1u << j); -/// src[i] |= aji << j; -/// src[j] &= ~(1u << i); -/// src[j] |= aij << i; -/// } -/// } -/// @details See Hackers Delight for more info. -/// @tparam AgainstMinorDiagonal -/// @param src source 32x32 matrix -template -constexpr void transpose32(uint32_t src[32]) noexcept { - uint32_t m = 0x0000FFFFU; - /** - * mask m values are { - * 0x0000FFFF for j = 16 - * 0x00FF00FF for j = 8 - * 0x0F0F0F0F for j = 4 - * 0x33333333 for j = 2 - * 0x55555555 for j = 1 - * } - */ - for (std::size_t j = 16; j != 0; j >>= 1, m ^= (m << j)) { - for (std::size_t k = 0; k < 32; k = (k + j + 1) & ~j) { - if constexpr (!AgainstMinorDiagonal) { - uint32_t t = (src[k + j] ^ (src[k] >> j)) & m; - src[k + j] ^= t; - src[k] ^= (t << j); - } else { - uint32_t t = (src[k] ^ (src[k + j] >> j)) & m; - src[k] ^= t; - src[k + j] ^= (t << j); - } - } - } -} - -/// @brief Transposes 32x32 matrix in `src` and puts it in into `dst`. `src` and -/// `dst` can not overlap (otherwise, behaviour is undefined) -/// @param src -/// @param dst -constexpr void transpose32(const uint32_t RESTRICT_QUALIFIER src[32], - uint32_t RESTRICT_QUALIFIER dst[32]) noexcept { - if (likely(dst != src)) { - std::copy(&src[0], &src[32], &dst[0]); - } - transpose32(dst); -} +#include "test_tools.hpp" +#include "bitmatrix.hpp" -/// @brief Transposes 64x64 matrix in `src` inplace. -/// @details See Hackers Delight for more info. -/// @tparam AgainstMinorDiagonal -/// @param src source 64x64 matrix -template -constexpr void transpose64(uint64_t src[64]) noexcept { - uint64_t m = 0x00000000FFFFFFFFULL; - /** - * mask m values are { - * 0x00000000FFFFFFFF for j = 32 - * 0x0000FFFF0000FFFF for j = 16 - * 0x00FF00FF00FF00FF for j = 8 - * 0x0F0F0F0F0F0F0F0F for j = 4 - * 0x3333333333333333 for j = 2 - * 0x5555555555555555 for j = 1 - * } - */ - for (std::size_t j = 32; j != 0; j >>= 1, m ^= (m << j)) { - for (std::size_t k = 0; k < 64; k = (k + j + 1) & ~j) { - assert(k + j < 64); - if constexpr (!AgainstMinorDiagonal) { - uint64_t t = (src[k + j] ^ (src[k] >> j)) & m; - src[k + j] ^= t; - src[k] ^= (t << j); - } else { - uint64_t t = (src[k] ^ (src[k + j] >> j)) & m; - src[k] ^= t; - src[k + j] ^= (t << j); - } - } - } -} +#include +#include -constexpr bool test_8x8() { +ATTRIBUTE_CONST constexpr bool test_8x8() { // clang-format off uint8_t a[8] = { 0b00011000, @@ -229,7 +52,7 @@ constexpr bool test_8x8() { return f1 && f2; } -constexpr bool test_32x32() { +ATTRIBUTE_CONST constexpr bool test_32x32() { // clang-format off uint32_t a[32] = { 0b00011000000000000000000000000001U, @@ -349,7 +172,7 @@ constexpr bool test_32x32() { return f1 && f2; } -constexpr bool test_64x64() { +ATTRIBUTE_CONST constexpr bool test_64x64() { // clang-format off uint64_t a[64] = { 0b0000000000000000000000000000000000000000000000000000000000010101, @@ -560,29 +383,23 @@ constexpr bool test_64x64() { transpose64(a); transpose64(a); - bool f2 = std::equal(&a[0], &a[32], &b2[0]); - - class { - public: - constexpr uint64_t operator()() noexcept { - constexpr uint64_t A = std::minstd_rand0::multiplier; - constexpr uint64_t C = std::minstd_rand0::increment; - constexpr uint64_t M = std::minstd_rand0::modulus; - state = (A * state + C) % M; - return state; - } + bool f2 = std::equal(&a[0], &a[64], &b2[0]); - private: - uint64_t state = 872189u; - } gen; + auto random_gen = [random_state = uint64_t(872189)]() mutable constexpr noexcept { + constexpr uint64_t A = std::minstd_rand0::multiplier; + constexpr uint64_t C = std::minstd_rand0::increment; + constexpr uint64_t M = std::minstd_rand0::modulus; + random_state = (A * random_state + C) % M; + return random_state; + }; if (!std::is_constant_evaluated()) { - // Here we manually set too many operations for compile time check + // This test has too many operations for the compile time check uint64_t src[64] = {}; static_assert(sizeof(a) == sizeof(src)); - for (std::size_t iter = 1 << 20; iter > 0; iter--) { + for (std::size_t iter = 1u << 20; iter > 0; iter--) { for (std::size_t i = 0; i < 32; i++) { - uint64_t w = gen(); + uint64_t w = random_gen(); a[i * 2] = uint32_t(w); a[i * 2 + 1] = uint32_t(w >> 32); } @@ -607,29 +424,6 @@ constexpr bool test_64x64() { return f1 && f2; } -#include - -void transpose_4096(std::bitset<4096> (&m)[4096]) { - assert(std::bit_cast(&m[0]) % 8 == 0); - assert(std::bit_cast(&m[4095]) % 8 == 0); - uint64_t tmp1[64]{}; - uint64_t tmp2[64]{}; - for (std::size_t i = 0; i < 64; i++) { - for (std::size_t j = i; j < 64; j++) { - for (std::size_t k = 0; k < 64; k++) { - tmp1[k] = *(std::bit_cast(&m[i * 64 + k]) + j); - tmp2[k] = *(std::bit_cast(&m[j * 64 + k]) + i); - } - transpose64(tmp1); - transpose64(tmp2); - for (std::size_t k = 0; k < 64; k++) { - *(std::bit_cast(&m[i * 64 + k]) + j) = tmp2[k]; - *(std::bit_cast(&m[j * 64 + k]) + i) = tmp1[k]; - } - } - } -} - int main() { static_assert(test_8x8()); static_assert(test_32x32());