diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7a2666359..208b2a6f8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -221,10 +221,13 @@ jobs: ar_block, ar_block_cache, ar_chain_stats, + ar_chunk_copy, ar_chunk_storage, ar_data_sync_worker_master, ar_deep_hash, + ar_device_lock, ar_diff_dag, + ar_entropy_storage, ar_ets_intervals, ar_events, ar_inflation, diff --git a/.github/workflows/test.yml.bak b/.github/workflows/test.yml.bak deleted file mode 100644 index b1e22be70..000000000 --- a/.github/workflows/test.yml.bak +++ /dev/null @@ -1,189 +0,0 @@ -name: "Arweave Tests" -on: - workflow_dispatch: - push: - branches: ["**"] -jobs: - eunit-tests: - runs-on: self-hosted - strategy: - fail-fast: true - max-parallel: 6 - matrix: - core_test_mod: [ - ar, - ar_block, - ar_block_cache, - ar_chunk_storage, - ar_data_sync_worker_master, - ar_deep_hash, - ar_diff_dag, - ar_ets_intervals, - ar_events, - ar_inflation, - ar_intervals, - ar_join, - ar_kv, - ar_merkle, - ar_mining_server, - ar_mining_stats, - ar_node, - ar_node_utils, - ar_nonce_limiter, - ar_packing_server, - ar_patricia_tree, - ar_peers, - ar_poa, - ar_pricing, - ar_retarget, - ar_serialize, - ar_storage_module, - ar_storage, - ar_storage_module, - ar_sync_buckets, - ar_tx, - ar_tx_db, - ar_unbalanced_merkle, - ar_util, - ar_pool, - - ## Note, that _tests are implicitly run by a matching prefix name - ar_base64_compatibility_tests, - ar_config_tests, - ar_coordinated_mining_tests, - ar_data_sync_tests, - ar_difficulty_tests, - ar_fork_recovery_tests, - ar_header_sync_tests, - ar_http_iface_tests, - ar_http_util_tests, - ar_mempool_tests, - ar_mine_randomx_tests, - ar_mine_vdf_tests, - ar_mining_io_tests, - ar_tx_tests, - # ar_node_tests, ## implicitly runs from ar_node - # ar_poa_tests, ## implicitly runs from ar_poa - ar_poller_tests, - ar_post_block_tests, - # ar_pricing_tests, ## implicitly runs from ar_pricing - ar_semaphore_tests, - ar_tx_blacklist_tests, - ar_tx_replay_pool_tests, - ar_vdf_server_tests, - ar_vdf_tests, - ar_wallet_tests, - ar_webhook_tests, - ] - steps: - - uses: actions/checkout@v4 - with: - submodules: "recursive" - - name: Cache build - id: cache-build - uses: actions/cache@v3 - with: - path: build.tar.gz - key: ${{ runner.os }}-build-${{ hashFiles('apps/**', 'config/**', '.github/**', 'rebar.config', 'rebar.lock', 'ar-rebar3') }} - restore-keys: | - ${{ runner.os }}-build- - - name: Extract build cache - if: steps.cache-build.outputs.cache-hit == 'true' - run: | - rm -rf _build || true - tar -xzf build.tar.gz - - name: Build arweave test sources - if: steps.cache-build.outputs.cache-hit != 'true' - run: | - rm -rf _build build.tar.gz || true - ./ar-rebar3 test compile - chmod -R u+w ./_build - tar --dereference -czf build.tar.gz _build - - name: Save build cache - if: steps.cache-build.outputs.cache-hit != 'true' - uses: actions/cache@v3 - with: - path: build.tar.gz - key: ${{ runner.os }}-build-${{ hashFiles('apps/**', 'config/**', '.github/**', 'rebar.config', 'rebar.lock', 'ar-rebar3') }} - - name: Restart epmd - run: | - if ! 
pgrep -x "epmd" > /dev/null - then - echo "Starting epmd" - epmd -relaxed_command_check -daemon - fi - - name: ${{ matrix.core_test_mod }}.erl - id: tests - run: | - rm -f *.out || true - EXIT_CODE=0 - export PATH=$(pwd)/_build/erts/bin:$PATH - export ERL_EPMD_ADDRESS=127.0.0.1 - export TIMESTAMP_IN_MILLISECONDS=$(date +%s%3N) - export NAMESPACE="${{ matrix.core_test_mod }}_${TIMESTAMP_IN_MILLISECONDS}" - export ERL_TEST_OPTS="-pa $(echo $(pwd)/_build/test/lib/*/ebin) $(pwd)/_build/test/lib/arweave/test -config $(pwd)/config/sys.config" - RETRYABLE=1 - while [[ $RETRYABLE -eq 1 ]]; do - RETRYABLE=0 - set +e - set -x - erl $ERL_TEST_OPTS -noshell -name main-${NAMESPACE}@127.0.0.1 -setcookie ${{ matrix.core_test_mod }} -run ar tests "${{ matrix.core_test_mod }}" -s init stop 2>&1 | tee main.out - EXIT_CODE=${PIPESTATUS[0]} - set +x - set -e - # For debugging purposes, print the peer1 output if the tests failed - if [[ $EXIT_CODE -ne 0 ]]; then - echo -e "\033[0;32m===> Checking for retry\033[0m" - if ls peer1-*.out 1> /dev/null 2>&1; then - first_line_peer1=$(head -n 1 peer1-*.out) - fi - first_line_main=$(head -n 1 main.out) - echo -e "\033[0;31m===> First line of peer1 node's output: $first_line_peer1\033[0m" - echo -e "\033[0;31m===> First line of main node's output: $first_line_main\033[0m" - - # Check if it is a retryable error - if [[ "$first_line_peer1" == "Protocol 'inet_tcp': register/listen error: "* ]]; then - echo "Retrying test because of inet_tcp error..." - RETRYABLE=1 - sleep 1 - elif [[ "$first_line_peer1" == "Protocol 'inet_tcp': the name"* ]]; then - echo "Retrying test because of inet_tcp clash..." - RETRYABLE=1 - sleep 1 - elif [[ "$first_line_main" == *"econnrefused"* ]]; then - echo "Retrying test because of econnrefused..." - RETRYABLE=1 - sleep 1 - else - if ls peer1-*.out 1> /dev/null 2>&1; then - echo -e "\033[0;31m===> Test failed, printing the peer1 node's output...\033[0m" - cat peer1-*.out - else - echo -e "\033[0;31m===> Test failed without peer1 output...\033[0m" - fi - if ls peer2-*.out 1> /dev/null 2>&1; then - echo -e "\033[0;31m===> Test failed, printing the peer2 node's output...\033[0m" - cat peer2-*.out - else - echo -e "\033[0;31m===> Test failed without peer2 output...\033[0m" - fi - if ls peer3-*.out 1> /dev/null 2>&1; then - echo -e "\033[0;31m===> Test failed, printing the peer3 node's output...\033[0m" - cat peer3-*.out - else - echo -e "\033[0;31m===> Test failed without peer3 output...\033[0m" - fi - if ls peer4-*.out 1> /dev/null 2>&1; then - echo -e "\033[0;31m===> Test failed, printing the peer4 node's output...\033[0m" - cat peer4-*.out - else - echo -e "\033[0;31m===> Test failed without peer4 output...\033[0m" - fi - fi - fi - done - echo "::set-output name=exit_code::$EXIT_CODE" # Set the exit_code output variable - exit $EXIT_CODE # exit with the exit code of the tests - - name: Cleanup successful test - if: steps.tests.outputs.exit_code == '0' # Conditional based on the output variable - run: rm -rf _build logs *.out diff --git a/apps/arweave/e2e/ar_e2e.erl b/apps/arweave/e2e/ar_e2e.erl index 53a5bb04a..183c69e42 100644 --- a/apps/arweave/e2e/ar_e2e.erl +++ b/apps/arweave/e2e/ar_e2e.erl @@ -6,7 +6,7 @@ -export([delayed_print/2, packing_type_to_packing/2, start_source_node/3, source_node_storage_modules/3, max_chunk_offset/1, assert_block/2, assert_syncs_range/3, assert_does_not_sync_range/3, - assert_chunks/3, assert_no_chunks/2, assert_partition_size/4, assert_empty_partition/3]). 
+ assert_chunks/3, assert_no_chunks/2, assert_partition_size/3, assert_empty_partition/3]). -include_lib("arweave/include/ar.hrl"). -include_lib("arweave/include/ar_config.hrl"). @@ -70,6 +70,8 @@ packing_type_to_packing(PackingType, Address) -> end. start_source_node(Node, unpacked, _WalletFixture) -> + ?LOG_INFO("Starting source node ~p with packing type ~p and wallet fixture ~p", + [Node, unpacked, _WalletFixture]), TempNode = case Node of peer1 -> peer2; peer2 -> peer1 @@ -85,11 +87,16 @@ start_source_node(Node, unpacked, _WalletFixture) -> auto_join = true }, true), - ar_e2e:assert_partition_size(Node, 0, unpacked, ?PARTITION_SIZE), - ar_e2e:assert_partition_size(Node, 1, unpacked, ?PARTITION_SIZE), + ?LOG_INFO("Source node ~p started.", [Node]), + + ar_e2e:assert_partition_size(Node, 0, unpacked), + ar_e2e:assert_partition_size(Node, 1, unpacked), ar_e2e:assert_syncs_range(Node, ?PARTITION_SIZE, 2*?PARTITION_SIZE), ar_e2e:assert_chunks(Node, unpacked, Chunks), + + ?LOG_INFO("Source node ~p assertions passed.", [Node]), + ar_test_node:stop(TempNode), {Blocks, undefined, Chunks}; start_source_node(Node, PackingType, WalletFixture) -> @@ -110,6 +117,8 @@ start_source_node(Node, PackingType, WalletFixture) -> }, true) ), + ?LOG_INFO("Source node ~p started.", [Node]), + %% Note: small chunks will be padded to 256 KiB. So B1 actually contains 3 chunks of data %% and B2 starts at a chunk boundary and contains 1 chunk of data. B1 = mine_block(Node, Wallet, floor(2.5 * ?DATA_CHUNK_SIZE)), @@ -130,14 +139,16 @@ start_source_node(Node, PackingType, WalletFixture) -> {B3, ?PARTITION_SIZE + (8*?DATA_CHUNK_SIZE), ?DATA_CHUNK_SIZE} ], - ?LOG_INFO("Source node ~p started.", [Node]), + ?LOG_INFO("Source node ~p blocks mined.", [Node]), SourcePacking = ar_e2e:packing_type_to_packing(PackingType, RewardAddr), - ar_e2e:assert_partition_size(Node, 0, SourcePacking, ?PARTITION_SIZE), - ar_e2e:assert_partition_size(Node, 1, SourcePacking, ?PARTITION_SIZE), + ar_e2e:assert_partition_size(Node, 0, SourcePacking), + ar_e2e:assert_partition_size(Node, 1, SourcePacking), - ar_e2e:assert_syncs_range(Node, ?PARTITION_SIZE, 2*?PARTITION_SIZE), + ar_e2e:assert_syncs_range(Node, + ?PARTITION_SIZE, + 2*?PARTITION_SIZE + ar_storage_module:get_overlap(SourcePacking)), ar_e2e:assert_chunks(Node, SourcePacking, Chunks), ?LOG_INFO("Source node ~p assertions passed.", [Node]), @@ -211,14 +222,21 @@ assert_block({replica_2_9, Address}, MinedBlock) -> assert_syncs_range(Node, StartOffset, EndOffset) -> - ?assert( - ar_util:do_until( - fun() -> has_range(Node, StartOffset, EndOffset) end, - 100, - 60_000 - ), - iolist_to_binary(io_lib:format( - "~s Failed to sync range ~p - ~p", [Node, StartOffset, EndOffset]))). + HasRange = ar_util:do_until( + fun() -> has_range(Node, StartOffset, EndOffset) end, + 100, + 60_000 + ), + case HasRange of + true -> + ok; + false -> + SyncRecord = ar_http_iface_client:get_sync_record(Node, json), + ?assert(false, + iolist_to_binary(io_lib:format( + "~s failed to sync range ~p - ~p. Sync record: ~p", + [Node, StartOffset, EndOffset, SyncRecord]))) + end. assert_does_not_sync_range(Node, StartOffset, EndOffset) -> ar_util:do_until( @@ -231,7 +249,10 @@ assert_does_not_sync_range(Node, StartOffset, EndOffset) -> "~s synced range when it should not have: ~p - ~p", [Node, StartOffset, EndOffset]))). 
-assert_partition_size(Node, PartitionNumber, Packing, Size) -> +assert_partition_size(Node, PartitionNumber, Packing) -> + Size = ?PARTITION_SIZE, + ?LOG_INFO("~p: Asserting partition ~p,~p is size ~p", + [Node, PartitionNumber, ar_serialize:encode_packing(Packing, true), Size]), ?assert( ar_util:do_until( fun() -> @@ -330,17 +351,18 @@ assert_chunk(Node, Packing, Block, EndOffset, ChunkSize) -> {ok, ExpectedPackedChunk} = ar_e2e:load_chunk_fixture(Packing, EndOffset), ?assertEqual(ExpectedPackedChunk, Chunk, iolist_to_binary(io_lib:format( - "Chunk at offset ~p, size ~p does not match previously packed chunk", - [EndOffset, ChunkSize]))), + "~p: Chunk at offset ~p, size ~p does not match previously packed chunk", + [Node, EndOffset, ChunkSize]))), {ok, UnpackedChunk} = ar_packing_server:unpack( Packing, EndOffset, Block#block.tx_root, Chunk, ?DATA_CHUNK_SIZE), - UnpaddedChunk = ar_packing_server:unpad_chunk(Packing, UnpackedChunk, ChunkSize, byte_size(Chunk)), + UnpaddedChunk = ar_packing_server:unpad_chunk( + Packing, UnpackedChunk, ChunkSize, byte_size(Chunk)), ExpectedUnpackedChunk = ar_test_node:get_genesis_chunk(EndOffset), ?assertEqual(ExpectedUnpackedChunk, UnpaddedChunk, iolist_to_binary(io_lib:format( - "Chunk at offset ~p, size ~p does not match unpacked chunk", - [EndOffset, ChunkSize]))). + "~p: Chunk at offset ~p, size ~p does not match unpacked chunk", + [Node, EndOffset, ChunkSize]))). assert_no_chunks(Node, Chunks) -> lists:foreach(fun({_Block, EndOffset, _ChunkSize}) -> diff --git a/apps/arweave/e2e/ar_repack_in_place_mine_tests.erl b/apps/arweave/e2e/ar_repack_in_place_mine_tests.erl index b35e3e6ad..89b5c63a7 100644 --- a/apps/arweave/e2e/ar_repack_in_place_mine_tests.erl +++ b/apps/arweave/e2e/ar_repack_in_place_mine_tests.erl @@ -15,25 +15,9 @@ repack_in_place_mine_test_() -> Timeout = ?REPACK_IN_PLACE_MINE_TEST_TIMEOUT, [ - % XXX {timeout, Timeout, {with, {unpacked, replica_2_9}, [fun test_repack_in_place_mine/1]}}, - % XXX {timeout, Timeout, {with, {unpacked, spora_2_6}, [fun test_repack_in_place_mine/1]}}, - % XXX {timeout, Timeout, {with, {unpacked, composite_1}, [fun test_repack_in_place_mine/1]}}, - % XXX {timeout, Timeout, {with, {unpacked, composite_2}, [fun test_repack_in_place_mine/1]}}, + % {timeout, Timeout, {with, {unpacked, replica_2_9}, [fun test_repack_in_place_mine/1]}}, {timeout, Timeout, {with, {spora_2_6, replica_2_9}, [fun test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {spora_2_6, spora_2_6}, [fun test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {spora_2_6, composite_1}, [fun test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {spora_2_6, composite_2}, [fun test_repack_in_place_mine/1]}}, - % % % % % XXX {timeout, Timeout, {with, {spora_2_6, unpacked}, [fun test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {composite_1, replica_2_9}, [fun test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {composite_1, spora_2_6}, [fun test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {composite_1, composite_1}, [fun test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {composite_1, composite_2}, [fun test_repack_in_place_mine/1]}}, - % % % % % XXX {timeout, Timeout, {with, {composite_1, unpacked}, [fun test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {composite_2, replica_2_9}, [fun test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {composite_2, spora_2_6}, [fun test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {composite_2, composite_1}, [fun 
test_repack_in_place_mine/1]}}, - {timeout, Timeout, {with, {composite_2, composite_2}, [fun test_repack_in_place_mine/1]}} - % % % % % XXX {timeout, Timeout, {with, {composite_2, unpacked}, [fun test_repack_in_place_mine/1]}} + {timeout, Timeout, {with, {composite_1, replica_2_9}, [fun test_repack_in_place_mine/1]}} ]. %% -------------------------------------------------------------------------------------------- @@ -69,8 +53,8 @@ test_repack_in_place_mine({FromPackingType, ToPackingType}) -> }), ar_test_node:restart(RepackerNode), - ar_e2e:assert_partition_size(RepackerNode, 0, ToPacking, ?PARTITION_SIZE), - ar_e2e:assert_partition_size(RepackerNode, 1, ToPacking, ?PARTITION_SIZE), + ar_e2e:assert_partition_size(RepackerNode, 0, ToPacking), + ar_e2e:assert_partition_size(RepackerNode, 1, ToPacking), ar_test_node:stop(RepackerNode), diff --git a/apps/arweave/e2e/ar_repack_mine_tests.erl b/apps/arweave/e2e/ar_repack_mine_tests.erl index 245b98e55..717074c9a 100644 --- a/apps/arweave/e2e/ar_repack_mine_tests.erl +++ b/apps/arweave/e2e/ar_repack_mine_tests.erl @@ -15,27 +15,18 @@ repack_mine_test_() -> {timeout, Timeout, {with, {replica_2_9, replica_2_9}, [fun test_repacking_blocked/1]}}, {timeout, Timeout, {with, {replica_2_9, spora_2_6}, [fun test_repacking_blocked/1]}}, {timeout, Timeout, {with, {replica_2_9, composite_1}, [fun test_repacking_blocked/1]}}, - {timeout, Timeout, {with, {replica_2_9, composite_2}, [fun test_repacking_blocked/1]}}, {timeout, Timeout, {with, {replica_2_9, unpacked}, [fun test_repacking_blocked/1]}}, {timeout, Timeout, {with, {unpacked, replica_2_9}, [fun test_repack_mine/1]}}, {timeout, Timeout, {with, {unpacked, spora_2_6}, [fun test_repack_mine/1]}}, {timeout, Timeout, {with, {unpacked, composite_1}, [fun test_repack_mine/1]}}, - {timeout, Timeout, {with, {unpacked, composite_2}, [fun test_repack_mine/1]}}, {timeout, Timeout, {with, {spora_2_6, replica_2_9}, [fun test_repack_mine/1]}}, {timeout, Timeout, {with, {spora_2_6, spora_2_6}, [fun test_repack_mine/1]}}, {timeout, Timeout, {with, {spora_2_6, composite_1}, [fun test_repack_mine/1]}}, - {timeout, Timeout, {with, {spora_2_6, composite_2}, [fun test_repack_mine/1]}}, {timeout, Timeout, {with, {spora_2_6, unpacked}, [fun test_repack_mine/1]}}, {timeout, Timeout, {with, {composite_1, replica_2_9}, [fun test_repack_mine/1]}}, {timeout, Timeout, {with, {composite_1, spora_2_6}, [fun test_repack_mine/1]}}, {timeout, Timeout, {with, {composite_1, composite_1}, [fun test_repack_mine/1]}}, - {timeout, Timeout, {with, {composite_1, composite_2}, [fun test_repack_mine/1]}}, - {timeout, Timeout, {with, {composite_1, unpacked}, [fun test_repack_mine/1]}}, - {timeout, Timeout, {with, {composite_2, replica_2_9}, [fun test_repack_mine/1]}}, - {timeout, Timeout, {with, {composite_2, spora_2_6}, [fun test_repack_mine/1]}}, - {timeout, Timeout, {with, {composite_2, composite_1}, [fun test_repack_mine/1]}}, - {timeout, Timeout, {with, {composite_2, composite_2}, [fun test_repack_mine/1]}}, - {timeout, Timeout, {with, {composite_2, unpacked}, [fun test_repack_mine/1]}} + {timeout, Timeout, {with, {composite_1, unpacked}, [fun test_repack_mine/1]}} ]. 
%% -------------------------------------------------------------------------------------------- @@ -43,6 +34,8 @@ repack_mine_test_() -> %% -------------------------------------------------------------------------------------------- test_repack_mine({FromPackingType, ToPackingType}) -> ar_e2e:delayed_print(<<" ~p -> ~p ">>, [FromPackingType, ToPackingType]), + ?LOG_INFO([{event, test_repack_mine}, {module, ?MODULE}, + {from_packing_type, FromPackingType}, {to_packing_type, ToPackingType}]), ValidatorNode = peer1, RepackerNode = peer2, {Blocks, _AddrA, Chunks} = ar_e2e:start_source_node( @@ -65,14 +58,16 @@ test_repack_mine({FromPackingType, ToPackingType}) -> }), ar_test_node:restart(RepackerNode), - ar_e2e:assert_partition_size(RepackerNode, 1, ToPacking, ?PARTITION_SIZE), + ar_e2e:assert_partition_size(RepackerNode, 1, ToPacking), ar_test_node:update_config(RepackerNode, Config#config{ storage_modules = StorageModules, mining_addr = AddrB }), ar_test_node:restart(RepackerNode), - ar_e2e:assert_syncs_range(RepackerNode, ?PARTITION_SIZE, 2*?PARTITION_SIZE), + ar_e2e:assert_syncs_range(RepackerNode, + ?PARTITION_SIZE, + 2*?PARTITION_SIZE + ar_storage_module:get_overlap(ToPacking)), ar_e2e:assert_chunks(RepackerNode, ToPacking, Chunks), @@ -94,6 +89,8 @@ test_repack_mine({FromPackingType, ToPackingType}) -> test_repacking_blocked({FromPackingType, ToPackingType}) -> ar_e2e:delayed_print(<<" ~p -> ~p ">>, [FromPackingType, ToPackingType]), + ?LOG_INFO([{event, test_repacking_blocked}, {module, ?MODULE}, + {from_packing_type, FromPackingType}, {to_packing_type, ToPackingType}]), ValidatorNode = peer1, RepackerNode = peer2, {Blocks, _AddrA, Chunks} = ar_e2e:start_source_node( diff --git a/apps/arweave/e2e/ar_sync_pack_mine_tests.erl b/apps/arweave/e2e/ar_sync_pack_mine_tests.erl index 99b837b9d..a82efeae7 100644 --- a/apps/arweave/e2e/ar_sync_pack_mine_tests.erl +++ b/apps/arweave/e2e/ar_sync_pack_mine_tests.erl @@ -30,7 +30,6 @@ replica_2_9_block_sync_test_() -> instantiator(GenesisData, replica_2_9, fun test_syncing_blocked/1), instantiator(GenesisData, spora_2_6, fun test_syncing_blocked/1), instantiator(GenesisData, composite_1, fun test_syncing_blocked/1), - instantiator(GenesisData, composite_2, fun test_syncing_blocked/1), instantiator(GenesisData, unpacked, fun test_syncing_blocked/1) ] end}. @@ -42,7 +41,6 @@ spora_2_6_sync_pack_mine_test_() -> instantiator(GenesisData, replica_2_9, fun test_sync_pack_mine/1), instantiator(GenesisData, spora_2_6, fun test_sync_pack_mine/1), instantiator(GenesisData, composite_1, fun test_sync_pack_mine/1), - instantiator(GenesisData, composite_2, fun test_sync_pack_mine/1), instantiator(GenesisData, unpacked, fun test_sync_pack_mine/1) ] end}. @@ -54,32 +52,27 @@ composite_1_sync_pack_mine_test_() -> instantiator(GenesisData, replica_2_9, fun test_sync_pack_mine/1), instantiator(GenesisData, spora_2_6, fun test_sync_pack_mine/1), instantiator(GenesisData, composite_1, fun test_sync_pack_mine/1), - instantiator(GenesisData, composite_2, fun test_sync_pack_mine/1), instantiator(GenesisData, unpacked, fun test_sync_pack_mine/1) ] end}. 
-composite_2_sync_pack_mine_test_() -> - {setup, fun () -> setup_source_node(composite_2) end, +unpacked_sync_pack_mine_test_() -> + {setup, fun () -> setup_source_node(unpacked) end, fun (GenesisData) -> [ instantiator(GenesisData, replica_2_9, fun test_sync_pack_mine/1), instantiator(GenesisData, spora_2_6, fun test_sync_pack_mine/1), instantiator(GenesisData, composite_1, fun test_sync_pack_mine/1), - instantiator(GenesisData, composite_2, fun test_sync_pack_mine/1), instantiator(GenesisData, unpacked, fun test_sync_pack_mine/1) ] end}. -unpacked_sync_pack_mine_test_() -> +unpacked_and_packed_sync_pack_mine_test_() -> {setup, fun () -> setup_source_node(unpacked) end, fun (GenesisData) -> [ - instantiator(GenesisData, replica_2_9, fun test_sync_pack_mine/1), - instantiator(GenesisData, spora_2_6, fun test_sync_pack_mine/1), - instantiator(GenesisData, composite_1, fun test_sync_pack_mine/1), - instantiator(GenesisData, composite_2, fun test_sync_pack_mine/1), - instantiator(GenesisData, unpacked, fun test_sync_pack_mine/1) + instantiator(GenesisData, replica_2_9, + fun test_unpacked_and_packed_sync_pack_mine/1) ] end}. @@ -93,14 +86,22 @@ test_sync_pack_mine({{Blocks, Chunks, SourcePackingType}, SinkPackingType}) -> SinkNode = peer2, SinkPacking = start_sink_node(SinkNode, SourceNode, B0, SinkPackingType), - ar_e2e:assert_syncs_range(SinkNode, ?PARTITION_SIZE, 2*?PARTITION_SIZE), + ar_e2e:assert_syncs_range( + SinkNode, + ?PARTITION_SIZE, + 2*?PARTITION_SIZE + ar_storage_module:get_overlap(SinkPacking)), ar_e2e:assert_chunks(SinkNode, SinkPacking, Chunks), case SinkPackingType of unpacked -> ok; _ -> - CurrentHeight = ar_test_node:remote_call(SinkNode, ar_node, get_height, []), + CurrentHeight = max( + ar_test_node:remote_call(SourceNode, ar_node, get_height, []), + ar_test_node:remote_call(SinkNode, ar_node, get_height, []) + ), + ar_test_node:wait_until_height(SourceNode, CurrentHeight), + ar_test_node:wait_until_height(SinkNode, CurrentHeight), ar_test_node:mine(SinkNode), SinkBI = ar_test_node:wait_until_height(SinkNode, CurrentHeight + 1), @@ -123,6 +124,33 @@ test_syncing_blocked({{Blocks, Chunks, SourcePackingType}, SinkPackingType}) -> ar_e2e:assert_does_not_sync_range(SinkNode, ?PARTITION_SIZE, 2*?PARTITION_SIZE), ar_e2e:assert_no_chunks(SinkNode, Chunks). +test_unpacked_and_packed_sync_pack_mine({{Blocks, Chunks, SourcePackingType}, PackingType}) -> + ar_e2e:delayed_print(<<" ~p -> {~p, ~p} ">>, [SourcePackingType, PackingType, unpacked]), + [B0 | _] = Blocks, + SourceNode = peer1, + SinkNode = peer2, + + {SinkPacking, unpacked} = start_sink_node(SinkNode, SourceNode, B0, PackingType, unpacked), + ar_e2e:assert_syncs_range( + SinkNode, + ?PARTITION_SIZE, + 2*?PARTITION_SIZE + ar_storage_module:get_overlap(SinkPacking)), + ar_e2e:assert_partition_size(SinkNode, 1, SinkPacking), + ar_e2e:assert_partition_size(SinkNode, 1, unpacked), + + CurrentHeight = ar_test_node:remote_call(SinkNode, ar_node, get_height, []), + ar_test_node:mine(SinkNode), + + SinkBI = ar_test_node:wait_until_height(SinkNode, CurrentHeight + 1), + {ok, SinkBlock} = ar_test_node:http_get_block(element(1, hd(SinkBI)), SinkNode), + ar_e2e:assert_block(SinkPacking, SinkBlock), + + SourceBI = ar_test_node:wait_until_height(SourceNode, SinkBlock#block.height), + {ok, SourceBlock} = ar_test_node:http_get_block(element(1, hd(SourceBI)), SourceNode), + ?assertEqual(SinkBlock, SourceBlock), + ok. 
+ + start_sink_node(Node, SourceNode, B0, PackingType) -> Wallet = ar_test_node:remote_call(Node, ar_e2e, load_wallet_fixture, [wallet_b]), SinkAddr = ar_wallet:to_address(Wallet), @@ -143,3 +171,26 @@ start_sink_node(Node, SourceNode, B0, PackingType) -> ), SinkPacking. + +start_sink_node(Node, SourceNode, B0, PackingType1, PackingType2) -> + Wallet = ar_test_node:remote_call(Node, ar_e2e, load_wallet_fixture, [wallet_b]), + SinkAddr = ar_wallet:to_address(Wallet), + SinkPacking1 = ar_e2e:packing_type_to_packing(PackingType1, SinkAddr), + SinkPacking2 = ar_e2e:packing_type_to_packing(PackingType2, SinkAddr), + {ok, Config} = ar_test_node:get_config(Node), + + StorageModules = [ + {?PARTITION_SIZE, 1, SinkPacking1}, + {?PARTITION_SIZE, 1, SinkPacking2} + ], + + ?assertEqual(ar_test_node:peer_name(Node), + ar_test_node:start_other_node(Node, B0, Config#config{ + peers = [ar_test_node:peer_ip(SourceNode)], + start_from_latest_state = true, + storage_modules = StorageModules, + auto_join = true, + mining_addr = SinkAddr + }, true) + ), + {SinkPacking1, SinkPacking2}. \ No newline at end of file diff --git a/apps/arweave/include/ar_config.hrl b/apps/arweave/include/ar_config.hrl index d4ec6c51f..e4a2dc128 100644 --- a/apps/arweave/include/ar_config.hrl +++ b/apps/arweave/include/ar_config.hrl @@ -119,10 +119,6 @@ %% The default rocksdb WAL sync interval, 1 minute. -define(DEFAULT_ROCKSDB_WAL_SYNC_INTERVAL_S, 60). -%% The maximum allowed total size (in bytes) of the entropies generated -%% for the 2.9 replication. --define(DEFAULT_MAX_REPLICA_2_9_ENTROPY_CACHE_SIZE, 33_554_432). % 8_388_608 * 4. - %% The number of 2.9 storage modules allowed to prepare the storage at a time. -ifdef(AR_TEST). -define(DEFAULT_REPLICA_2_9_WORKERS, 2). @@ -130,6 +126,9 @@ -define(DEFAULT_REPLICA_2_9_WORKERS, 8). -endif. +%% The number of packing workers. +-define(DEFAULT_PACKING_WORKERS, erlang:system_info(dirty_cpu_schedulers_online)). + %% @doc Startup options with default values. -record(config, { init = false, @@ -199,7 +198,6 @@ get_tx => ?MAX_PARALLEL_GET_TX_REQUESTS }, disk_cache_size = ?DISK_CACHE_SIZE, - packing_rate, max_nonce_limiter_validation_thread_count = ?DEFAULT_MAX_NONCE_LIMITER_VALIDATION_THREAD_COUNT, max_nonce_limiter_last_step_validation_thread_count @@ -226,8 +224,8 @@ pool_server_address = not_set, pool_api_key = not_set, pool_worker_name = not_set, + packing_workers = ?DEFAULT_PACKING_WORKERS, replica_2_9_workers = ?DEFAULT_REPLICA_2_9_WORKERS, - replica_2_9_entropy_cache_size = ?DEFAULT_MAX_REPLICA_2_9_ENTROPY_CACHE_SIZE, %% Undocumented/unsupported options chunk_storage_file_size = ?CHUNK_GROUP_SIZE, rocksdb_flush_interval_s = ?DEFAULT_ROCKSDB_FLUSH_INTERVAL_S, diff --git a/apps/arweave/include/ar_data_sync.hrl b/apps/arweave/include/ar_data_sync.hrl index 7a911a109..bfcfd9362 100644 --- a/apps/arweave/include/ar_data_sync.hrl +++ b/apps/arweave/include/ar_data_sync.hrl @@ -163,7 +163,7 @@ migrations_index, %% A flag indicating the process has started collecting the intervals for syncing. %% We consult the other storage modules first, then search among the network peers. - started_syncing = false, + sync_status = undefined, %% The offsets of the chunks currently scheduled for (re-)packing (keys) and %% some chunk metadata needed for storing the chunk once it is packed. 
packing_map = #{}, diff --git a/apps/arweave/src/ar.erl b/apps/arweave/src/ar.erl index 1173d68b0..743a25a8a 100644 --- a/apps/arweave/src/ar.erl +++ b/apps/arweave/src/ar.erl @@ -120,20 +120,14 @@ show_help() -> "Q5EfKawrRazp11HEDf_NJpxjYMV385j21nlQNjR8_pY, specify " "storage_module " "22,En2eqsVJARnTVOSh723PBXAKGmKgrGSjQ2YIGwE_ZRI.1,repack_in_place," - "Q5EfKawrRazp11HEDf_NJpxjYMV385j21nlQNjR8_pY.1. This storage module " + "Q5EfKawrRazp11HEDf_NJpxjYMV385j21nlQNjR8_pY.replica.2.9. This storage module " "will only do the repacking - it won't be used for mining and won't " "serve any data to peers. Once the repacking is complete, a message will " "be logged to the file and written to the console. We suggest you rename " - "the storage module folder according to the new packing then." - - " If you changed your mind and want to repack " - "a module already being repacked to the yet different packing, simply " - "restart the node specifying the corresponding packing. E.g., in " - "the example above, you can restart with storage_module " - "22,En2eqsVJARnTVOSh723PBXAKGmKgrGSjQ2YIGwE_ZRI.1,repack_in_place,unpacked." - " The node will unpack everything that was repacked to " - "Q5EfKawrRazp11HEDf_NJpxjYMV385j21nlQNjR8_pY and also unpack everything " - "that is still packed with En2eqsVJARnTVOSh723PBXAKGmKgrGSjQ2YIGwE_ZRI.1." + "the storage module folder according to the new packing then. " + + "Note: as of 2.9.1 you can only repack in place to the replica_2_9 " + "format." }, {"repack_batch_size", io_lib:format("The number of chunk fetched from disk " "at a time during in-place repacking. Default: ~B.", @@ -266,8 +260,16 @@ show_help() -> ) )}, {"packing_rate", - "The maximum number of chunks per second to pack or unpack. " - "The default value is determined based on the number of CPU cores."}, + "DEPRECATED. Does not affect anything. Use packing_workers instead."}, + {"packing_workers (num)", + "The number of packing workers to spawn. The default is the number of " + "logical CPU cores."}, + {"replica_2_9_workers (num)", io_lib:format( + "The number of replica 2.9 workers to spawn. Replica 2.9 workers are used " + "to generate entropy the replica.2.9 format. At most one worker will be " + "active per physical disk at a time. Default: ~B", + [?DEFAULT_REPLICA_2_9_WORKERS] + )}, {"max_vdf_validation_thread_count", io_lib:format("\tThe maximum number " "of threads used for VDF validation. 
Default: ~B", [?DEFAULT_MAX_NONCE_LIMITER_VALIDATION_THREAD_COUNT])}, @@ -565,8 +567,14 @@ parse_cli_args(["max_disk_pool_data_root_buffer_mb", Num | Rest], C) -> parse_cli_args(Rest, C#config{ max_disk_pool_data_root_buffer_mb = list_to_integer(Num) }); parse_cli_args(["disk_cache_size_mb", Num | Rest], C) -> parse_cli_args(Rest, C#config{ disk_cache_size = list_to_integer(Num) }); -parse_cli_args(["packing_rate", Num | Rest], C) -> - parse_cli_args(Rest, C#config{ packing_rate = list_to_integer(Num) }); +parse_cli_args(["packing_rate", _Num | Rest], C) -> + ?LOG_WARNING("Deprecated option found 'packing_rate': " + " this option has been removed and is now a no-op.", []), + parse_cli_args(Rest, C#config{ }); +parse_cli_args(["packing_workers", Num | Rest], C) -> + parse_cli_args(Rest, C#config{ packing_workers = list_to_integer(Num) }); +parse_cli_args(["replica_2_9_workers", Num | Rest], C) -> + parse_cli_args(Rest, C#config{ replica_2_9_workers = list_to_integer(Num) }); parse_cli_args(["max_vdf_validation_thread_count", Num | Rest], C) -> parse_cli_args(Rest, C#config{ max_nonce_limiter_validation_thread_count = list_to_integer(Num) }); @@ -931,7 +939,6 @@ start_for_tests(TestType, Config) -> data_dir = ".tmp/data_" ++ atom_to_list(TestType) ++ "_main_" ++ UniqueName, port = ar_test_node:get_unused_port(), disable = [randomx_jit], - packing_rate = 20, auto_join = false }, start(TestConfig). diff --git a/apps/arweave/src/ar_chunk_copy.erl b/apps/arweave/src/ar_chunk_copy.erl new file mode 100644 index 000000000..8bca70be5 --- /dev/null +++ b/apps/arweave/src/ar_chunk_copy.erl @@ -0,0 +1,317 @@ +%%% @doc The module maintains a queue of processes fetching data from the network +%%% and from the local storage modules. +-module(ar_chunk_copy). + +-behaviour(gen_server). + +-export([start_link/1, register_workers/0, ready_for_work/1, read_range/4]). + +-export([init/1, handle_cast/2, handle_call/3, handle_info/2, terminate/2]). + +-include_lib("arweave/include/ar.hrl"). +-include_lib("arweave/include/ar_sup.hrl"). +-include_lib("arweave/include/ar_config.hrl"). +-include_lib("eunit/include/eunit.hrl"). + +-define(READ_RANGE_CHUNKS, 400). +-define(MAX_ACTIVE_TASKS, 10). +-define(MAX_QUEUED_TASKS, 50). + +-record(worker_tasks, { + worker, + task_queue = queue:new(), + active_count = 0 +}). + +-record(state, { + workers = #{} +}). + +%%%=================================================================== +%%% Public interface. +%%%=================================================================== + +%% @doc Start the server. +start_link(WorkerMap) -> + gen_server:start_link({local, ?MODULE}, ?MODULE, WorkerMap, []). + +register_workers() -> + {Workers, WorkerMap} = register_read_workers(), + ChunkCopy = ?CHILD_WITH_ARGS(ar_chunk_copy, worker, ar_chunk_copy, [WorkerMap]), + Workers ++ [ChunkCopy]. + +register_read_workers() -> + {ok, Config} = application:get_env(arweave, config), + {Workers, WorkerMap} = + lists:foldl( + fun(StorageModule, {AccWorkers, AccWorkerMap}) -> + StoreID = ar_storage_module:id(StorageModule), + Name = list_to_atom("ar_data_sync_worker_" ++ StoreID), + + Worker = ?CHILD_WITH_ARGS(ar_data_sync_worker, worker, Name, [Name]), + + {[ Worker | AccWorkers], AccWorkerMap#{StoreID => Name}} + end, + {[], #{}}, + Config#config.storage_modules + ), + {Workers, WorkerMap}. + +%% @doc Returns true if we can accept new tasks. Will always return false if syncing is +%% disabled (i.e. sync_jobs = 0). 
+ready_for_work(StoreID) -> + try + gen_server:call(?MODULE, {ready_for_work, StoreID}, 1000) + catch + exit:{timeout,_} -> + false + end. + +read_range(Start, End, OriginStoreID, TargetStoreID) -> + case ar_chunk_copy:ready_for_work(OriginStoreID) of + true -> + Args = {Start, End, OriginStoreID, TargetStoreID}, + gen_server:cast(?MODULE, {read_range, Args}), + true; + false -> + false + end. + +%%%=================================================================== +%%% Generic server callbacks. +%%%=================================================================== + +init(WorkerMap) -> + ?LOG_DEBUG([{event, init}, {module, ?MODULE}, {worker_map, WorkerMap}]), + Workers = maps:fold( + fun(StoreID, Name, Acc) -> + Acc#{StoreID => #worker_tasks{worker = Name}} + end, + #{}, + WorkerMap + ), + ar_util:cast_after(1000, self(), process_queues), + {ok, #state{ + workers = Workers + }}. + +handle_call({ready_for_work, StoreID}, _From, State) -> + {reply, do_ready_for_work(StoreID, State), State}; + +handle_call(Request, _From, State) -> + ?LOG_WARNING([{event, unhandled_call}, {module, ?MODULE}, {request, Request}]), + {reply, ok, State}. + +handle_cast({read_range, Args}, State) -> + ?LOG_DEBUG([{event, read_range}, {module, ?MODULE}, {args, Args}]), + {noreply, enqueue_read_range(Args, State)}; + +handle_cast(process_queues, State) -> + ?LOG_DEBUG([{event, process_queues}, {module, ?MODULE}]), + ar_util:cast_after(1000, self(), process_queues), + {noreply, process_queues(State)}; + +handle_cast({task_completed, {read_range, {Worker, _, Args}}}, State) -> + ?LOG_DEBUG([{event, task_completed}, {module, ?MODULE}, {worker, Worker}, {args, Args}]), + {noreply, task_completed(Args, State)}; + +handle_cast(Cast, State) -> + ?LOG_WARNING([{event, unhandled_cast}, {module, ?MODULE}, {cast, Cast}]), + {noreply, State}. + +handle_info(Message, State) -> + ?LOG_WARNING([{event, unhandled_info}, {module, ?MODULE}, {message, Message}]), + {noreply, State}. + +terminate(Reason, _State) -> + ?LOG_DEBUG([{event, terminate}, {module, ?MODULE}, {reason, io_lib:format("~p", [Reason])}]), + ok. + +%%%=================================================================== +%%% Private functions. +%%%=================================================================== + +do_ready_for_work(StoreID, State) -> + Worker = maps:get(StoreID, State#state.workers, undefined), + case Worker of + undefined -> + ?LOG_ERROR([{event, worker_not_found}, {module, ?MODULE}, {call, ready_for_work}, + {store_id, StoreID}]), + false; + _ -> + queue:len(Worker#worker_tasks.task_queue) < ?MAX_QUEUED_TASKS + end. + +enqueue_read_range(Args, State) -> + {_Start, _End, OriginStoreID, _TargetStoreID} = Args, + Worker = maps:get(OriginStoreID, State#state.workers, undefined), + case Worker of + undefined -> + ?LOG_ERROR([{event, worker_not_found}, {module, ?MODULE}, + {call, enqueue_read_range}, {store_id, OriginStoreID}]), + State; + _ -> + Worker2 = do_enqueue_read_range(Args, Worker), + State#state{ + workers = maps:put(OriginStoreID, Worker2, State#state.workers) + } + end. 
+ +do_enqueue_read_range(Args, Worker) -> + {Start, End, OriginStoreID, TargetStoreID} = Args, + End2 = min(Start + (?READ_RANGE_CHUNKS * ?DATA_CHUNK_SIZE), End), + Args2 = {Start, End2, OriginStoreID, TargetStoreID}, + ?LOG_DEBUG([{event, enqueue_read_range}, {module, ?MODULE}, {args, Args2}]), + TaskQueue = queue:in(Args2, Worker#worker_tasks.task_queue), + Worker2 = Worker#worker_tasks{task_queue = TaskQueue}, + case End2 == End of + true -> + Worker2; + false -> + Args3 = {End2, End, OriginStoreID, TargetStoreID}, + do_enqueue_read_range(Args3, Worker2) + end. + +process_queues(State) -> + Workers = State#state.workers, + UpdatedWorkers = maps:map( + fun(_Key, Worker) -> + process_queue(Worker) + end, + Workers + ), + State#state{workers = UpdatedWorkers}. + +process_queue(Worker) -> + case Worker#worker_tasks.active_count < ?MAX_ACTIVE_TASKS of + true -> + case queue:out(Worker#worker_tasks.task_queue) of + {empty, _} -> + Worker; + {{value, Args}, Q2}-> + ?LOG_DEBUG([{event, process_queue}, {module, ?MODULE}, + {active_count, Worker#worker_tasks.active_count}, {args, Args}]), + gen_server:cast(Worker#worker_tasks.worker, {read_range, Args}), + Worker2 = Worker#worker_tasks{ + task_queue = Q2, + active_count = Worker#worker_tasks.active_count + 1 + }, + process_queue(Worker2) + end; + false -> + Worker + end. + +task_completed(Args, State) -> + {_Start, _End, OriginStoreID, _TargetStoreID} = Args, + Worker = maps:get(OriginStoreID, State#state.workers, undefined), + case Worker of + undefined -> + ?LOG_ERROR([{event, worker_not_found}, {module, ?MODULE}, {call, task_completed}, + {store_id, OriginStoreID}]), + State; + _ -> + ?LOG_DEBUG([{event, task_completed}, {module, ?MODULE}, + {worker, Worker#worker_tasks.worker}, + {active_count, Worker#worker_tasks.active_count}, {args, Args}]), + ActiveCount = Worker#worker_tasks.active_count - 1, + Worker2 = Worker#worker_tasks{active_count = ActiveCount}, + Worker3 = process_queue(Worker2), + State2 = State#state{ + workers = maps:put(OriginStoreID, Worker3, State#state.workers) + }, + State2 + end. + +%%%=================================================================== +%%% Tests. Included in the module so they can reference private +%%% functions. +%%%=================================================================== + +helpers_test_() -> + [ + {timeout, 30, fun test_ready_for_work/0}, + {timeout, 30, fun test_enqueue_read_range/0}, + {timeout, 30, fun test_process_queue/0} + ]. + +test_ready_for_work() -> + State = #state{ + workers = #{ + "store1" => #worker_tasks{ + task_queue = queue:from_list(lists:seq(1, ?MAX_QUEUED_TASKS - 1))}, + "store2" => #worker_tasks{ + task_queue = queue:from_list(lists:seq(1, ?MAX_QUEUED_TASKS))} + } + }, + ?assertEqual(true, do_ready_for_work("store1", State)), + ?assertEqual(false, do_ready_for_work("store2", State)). 
+ +test_enqueue_read_range() -> + ExpectedWorker = #worker_tasks{ + task_queue = queue:from_list( + [{ + floor(2.5 * ?DATA_CHUNK_SIZE), + floor((2.5 + ?READ_RANGE_CHUNKS) * ?DATA_CHUNK_SIZE), + "store1", "store2" + }, + { + floor((2.5 + ?READ_RANGE_CHUNKS) * ?DATA_CHUNK_SIZE), + floor((2.5 + 2 * ?READ_RANGE_CHUNKS) * ?DATA_CHUNK_SIZE), + "store1", "store2" + }, + { + floor((2.5 + 2 * ?READ_RANGE_CHUNKS) * ?DATA_CHUNK_SIZE), + floor((2.5 + 3 * ?READ_RANGE_CHUNKS) * ?DATA_CHUNK_SIZE), + "store1", "store2" + }] + ) + }, + Worker = do_enqueue_read_range( + { + floor(2.5 * ?DATA_CHUNK_SIZE), + floor((2.5 + 3 * ?READ_RANGE_CHUNKS) * ?DATA_CHUNK_SIZE), + "store1", "store2" + }, + #worker_tasks{task_queue = queue:new()} + ), + ?assertEqual( + queue:to_list(ExpectedWorker#worker_tasks.task_queue), + queue:to_list(Worker#worker_tasks.task_queue)). + +test_process_queue() -> + Worker1 = #worker_tasks{ + active_count = ?MAX_ACTIVE_TASKS + }, + ?assertEqual(Worker1, process_queue(Worker1)), + + Worker2 = #worker_tasks{ + active_count = ?MAX_ACTIVE_TASKS + 1 + }, + ?assertEqual(Worker2, process_queue(Worker2)), + + Worker3 = process_queue( + #worker_tasks{ + active_count = ?MAX_ACTIVE_TASKS - 2, + task_queue = queue:from_list( + [{floor(2.5 * ?DATA_CHUNK_SIZE), floor(12.5 * ?DATA_CHUNK_SIZE), + "store1", "store2"}, + {floor(12.5 * ?DATA_CHUNK_SIZE), floor(22.5 * ?DATA_CHUNK_SIZE), + "store1", "store2"}, + {floor(22.5 * ?DATA_CHUNK_SIZE), floor(30 * ?DATA_CHUNK_SIZE), + "store1", "store2"}]) + } + ), + ExpectedWorker3 = #worker_tasks{ + active_count = ?MAX_ACTIVE_TASKS, + task_queue = queue:from_list( + [{floor(22.5 * ?DATA_CHUNK_SIZE), floor(30 * ?DATA_CHUNK_SIZE), + "store1", "store2"}] + ) + }, + ?assertEqual( + ExpectedWorker3#worker_tasks.active_count, Worker3#worker_tasks.active_count), + ?assertEqual( + queue:to_list(ExpectedWorker3#worker_tasks.task_queue), + queue:to_list(Worker3#worker_tasks.task_queue)). + diff --git a/apps/arweave/src/ar_chunk_storage.erl b/apps/arweave/src/ar_chunk_storage.erl index 009acec6d..906a8514e 100644 --- a/apps/arweave/src/ar_chunk_storage.erl +++ b/apps/arweave/src/ar_chunk_storage.erl @@ -8,9 +8,9 @@ get_range/2, get_range/3, cut/2, delete/1, delete/2, get_filepath/2, get_handle_by_filepath/1, close_file/2, close_files/1, list_files/2, run_defragmentation/0, - get_storage_module_path/2, get_chunk_storage_path/2, is_prepared/1, + get_storage_module_path/2, get_chunk_storage_path/2, get_chunk_bucket_start/1, get_chunk_bucket_end/1, - sync_record_id/1, store_chunk/7, write_chunk/4, write_chunk2/6, record_chunk/5]). + sync_record_id/1, store_chunk/7, write_chunk/4, record_chunk/7]). -export([init/1, handle_cast/2, handle_call/3, handle_info/2, terminate/2]). @@ -25,17 +25,26 @@ -record(state, { file_index, store_id, + store_id_label, + packing_labels = #{}, packing_map = #{}, repack_cursor = 0, target_packing = none, - repacking_complete = false, + repack_status = undefined, range_start, range_end, reward_addr, prepare_replica_2_9_cursor, - is_prepared = false + prepare_slice_index = 0, + prepare_status = undefined }). +-ifdef(AR_TEST). +-define(DEVICE_LOCK_WAIT, 100). +-else. +-define(DEVICE_LOCK_WAIT, 5_000). +-endif. + %%%=================================================================== %%% Public interface. 
%%%=================================================================== @@ -82,9 +91,7 @@ put(PaddedOffset, Chunk, Packing, StoreID) -> GenServerID = name(StoreID), case catch gen_server:call(GenServerID, {put, PaddedOffset, Chunk, Packing}, 180_000) of {'EXIT', {timeout, {gen_server, call, _}}} -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, gen_server_timeout_putting_chunk}, - {error, timeout}, + ?LOG_ERROR([{event, gen_server_timeout_putting_chunk}, {padded_offset, PaddedOffset}, {store_id, StoreID} ]), @@ -249,25 +256,18 @@ get_storage_module_path(DataDir, StoreID) -> get_chunk_storage_path(DataDir, StoreID) -> filename:join([get_storage_module_path(DataDir, StoreID), ?CHUNK_DIR]). -%% @doc Return true if the storage is ready to accept chunks. --spec is_prepared(StoreID :: string()) -> true | false. -is_prepared(StoreID) -> - GenServerID = name(StoreID), - case catch gen_server:call(GenServerID, is_prepared) of - {'EXIT', {noproc, {gen_server, call, _}}} -> - {error, timeout}; - {'EXIT', {timeout, {gen_server, call, _}}} -> - {error, timeout}; - Reply -> - Reply - end. -%% @doc Return the start offset of the bucket containing the given offset. +%% @doc Return the start and end offset of the bucket containing the given offset. %% A chunk bucket a 0-based, 256-KiB wide, 256-KiB aligned range that fully contains a chunk. -spec get_chunk_bucket_start(PaddedEndOffset :: non_neg_integer()) -> non_neg_integer(). get_chunk_bucket_start(PaddedEndOffset) -> ar_util:floor_int(max(0, PaddedEndOffset - ?DATA_CHUNK_SIZE), ?DATA_CHUNK_SIZE). +-spec get_chunk_bucket_end(Offset :: non_neg_integer()) -> non_neg_integer(). +get_chunk_bucket_end(EndOffset) -> + PaddedEndOffset = ar_block:get_chunk_padded_offset(EndOffset), + get_chunk_bucket_start(PaddedEndOffset) + ?DATA_CHUNK_SIZE. + %%%=================================================================== %%% Generic server callbacks. %%%=================================================================== @@ -290,7 +290,9 @@ init({"default" = StoreID, _}) -> FileIndex ), warn_custom_chunk_group_size(StoreID), - {ok, #state{ file_index = FileIndex2, store_id = StoreID }}; + StoreIDLabel = ar_storage_module:label_by_id(StoreID), + {ok, #state{ + file_index = FileIndex2, store_id = StoreID, store_id_label = StoreIDLabel }}; init({StoreID, RepackInPlacePacking}) -> %% Trap exit to avoid corrupting any open files on quit.. 
process_flag(trap_exit, true), @@ -309,8 +311,9 @@ init({StoreID, RepackInPlacePacking}) -> ), warn_custom_chunk_group_size(StoreID), {RangeStart, RangeEnd} = ar_storage_module:get_range(StoreID), + StoreIDLabel = ar_storage_module:label_by_id(StoreID), State = #state{ file_index = FileIndex2, store_id = StoreID, - range_start = RangeStart, range_end = RangeEnd }, + range_start = RangeStart, range_end = RangeEnd, store_id_label = StoreIDLabel }, RunEntropyProcess = case RepackInPlacePacking of none -> @@ -328,34 +331,42 @@ init({StoreID, RepackInPlacePacking}) -> State2 = case RunEntropyProcess of {true, RewardAddr} -> - PrepareCursor = {Start, _SubChunkStart} = - read_prepare_replica_2_9_cursor(StoreID, {RangeStart + 1, 0}), - IsPrepared = - case Start =< RangeEnd of + PrepareCursor = + read_prepare_replica_2_9_cursor(StoreID, RangeStart + 1), + ?LOG_INFO([{event, read_prepare_replica_2_9_cursor}, {store_id, StoreID}, + {cursor, PrepareCursor}, {range_start, RangeStart}, + {range_end, RangeEnd}]), + PrepareStatus = + case PrepareCursor =< RangeEnd of true -> gen_server:cast(self(), prepare_replica_2_9), - false; + paused; false -> - true + complete end, + BucketEndOffset = get_chunk_bucket_end(PrepareCursor), State#state{ reward_addr = RewardAddr, prepare_replica_2_9_cursor = PrepareCursor, - is_prepared = IsPrepared }; + prepare_slice_index = ar_replica_2_9:get_slice_index(BucketEndOffset), + prepare_status = PrepareStatus }; _ -> - State#state{ is_prepared = true } + State#state{ prepare_status = off } end, case RepackInPlacePacking of none -> - {ok, State2#state{ repack_cursor = none }}; + {ok, State2#state{ repack_cursor = none, repack_status = off }}; Packing -> - Cursor = ar_repack:read_cursor(StoreID, Packing, RangeStart), + RepackCursor = ar_repack:read_cursor(StoreID, Packing, RangeStart), gen_server:cast(self(), {repack, Packing}), ?LOG_INFO([{event, starting_repack_in_place}, {tags, [repack_in_place]}, - {cursor, Cursor}, + {cursor, RepackCursor}, {store_id, StoreID}, {target_packing, ar_serialize:encode_packing(Packing, true)}]), - {ok, State2#state{ repack_cursor = Cursor, target_packing = Packing }} + {ok, State2#state{ + repack_cursor = RepackCursor, + target_packing = Packing, + repack_status = paused }} end. warn_custom_chunk_group_size(StoreID) -> @@ -372,170 +383,23 @@ warn_custom_chunk_group_size(StoreID) -> ok end. 
-handle_cast(prepare_replica_2_9, #state{ store_id = StoreID } = State) -> - case try_acquire_replica_2_9_formatting_lock(StoreID) of - true -> - ?LOG_DEBUG([{event, acquired_replica_2_9_formatting_lock}, {store_id, StoreID}]), - gen_server:cast(self(), do_prepare_replica_2_9); - false -> - ?LOG_DEBUG([{event, failed_to_acquire_replica_2_9_formatting_lock}, {store_id, StoreID}]), - ar_util:cast_after(2000, self(), prepare_replica_2_9) - end, - {noreply, State}; -handle_cast(do_prepare_replica_2_9, State) -> - #state{ reward_addr = RewardAddr, prepare_replica_2_9_cursor = {Start, SubChunkStart}, - range_start = RangeStart, range_end = RangeEnd, - store_id = StoreID, repack_cursor = RepackCursor } = State, - - ?LOG_DEBUG([{event, do_prepare_replica_2_9}, - {storage_module, StoreID}, - {start, Start}, - {sub_chunk_start, SubChunkStart}, - {range_start, RangeStart}, - {range_end, RangeEnd}, - {repack_cursor, RepackCursor}]), - - BucketEndOffset = get_chunk_bucket_end(ar_block:get_chunk_padded_offset(Start)), - PaddedRangeEnd = get_chunk_bucket_end(ar_block:get_chunk_padded_offset(RangeEnd)), - %% Sanity checks: - BucketEndOffset = get_chunk_bucket_end(BucketEndOffset), - true = ( - max(0, BucketEndOffset - ?DATA_CHUNK_SIZE) == get_chunk_bucket_start(BucketEndOffset) - ), - %% End of sanity checks. - - Partition = ar_replica_2_9:get_entropy_partition(BucketEndOffset), - CheckRangeEnd = - case BucketEndOffset > PaddedRangeEnd of - true -> - release_replica_2_9_formatting_lock(StoreID), - ?LOG_INFO([{event, storage_module_replica_2_9_preparation_complete}, - {store_id, StoreID}]), - ar:console("The storage module ~s is prepared for 2.9 replication.~n", - [StoreID]), - complete; - false -> - false - end, - %% For now the SubChunkStart and SubChunkStart2 values will always be 0. The field - %% is used to make future improvemets easier. e.g. have the cursor increment by - %% sub-chunk rather than chunk. - SubChunkStart2 = (SubChunkStart + ?DATA_CHUNK_SIZE) rem ?DATA_CHUNK_SIZE, - Start2 = BucketEndOffset + ?DATA_CHUNK_SIZE, - Cursor2 = {Start2, SubChunkStart2}, - State2 = State#state{ prepare_replica_2_9_cursor = Cursor2 }, - CheckRepackCursor = - case CheckRangeEnd of - complete -> - complete; - false -> - case RepackCursor of - none -> - false; - _ -> - SectorSize = ar_replica_2_9:get_sector_size(), - RangeStart2 = get_chunk_bucket_start(RangeStart + 1), - RepackCursor2 = get_chunk_bucket_start(RepackCursor + 1), - RepackSectorShift = (RepackCursor2 - RangeStart2) rem SectorSize, - SectorShift = (BucketEndOffset - RangeStart2) rem SectorSize, - case SectorShift > RepackSectorShift of - true -> - waiting_for_repack; - false -> - false - end - end - end, - CheckIsRecorded = - case CheckRepackCursor of - complete -> - complete; - waiting_for_repack -> - waiting_for_repack; - false -> - ar_entropy_storage:is_sub_chunk_recorded( - BucketEndOffset, SubChunkStart, StoreID) - end, - StoreEntropy = - case CheckIsRecorded of - complete -> - complete; - waiting_for_repack -> - waiting_for_repack; - true -> - is_recorded; - false -> - %% Get all the entropies needed to encipher the chunk at BucketEndOffset. - Entropies = ar_entropy_storage:generate_entropies(RewardAddr, BucketEndOffset, SubChunkStart), - EntropyKeys = ar_entropy_storage:generate_entropy_keys( - RewardAddr, BucketEndOffset, SubChunkStart), - SliceIndex = ar_replica_2_9:get_slice_index(BucketEndOffset), - %% If we are not at the beginning of the entropy, shift the offset to - %% the left. 
store_entropy will traverse the entire 2.9 partition shifting - %% the offset by sector size. It may happen some sub-chunks will be written - %% to the neighbouring storage module(s) on the left or on the right - %% since the storage module may be configured to be smaller than the - %% partition. - BucketEndOffset2 = ar_entropy_storage:shift_entropy_offset( - BucketEndOffset, -SliceIndex), - %% The end of a recall partition (3.6TB) may fall in the middle of a chunk, so - %% we'll use the padded offset to end the store_entropy iteration. - PartitionEnd = (Partition + 1) * ?PARTITION_SIZE, - PaddedPartitionEnd = - get_chunk_bucket_end(ar_block:get_chunk_padded_offset(PartitionEnd)), - ar_entropy_storage:store_entropy(Entropies, BucketEndOffset2, SubChunkStart, PaddedPartitionEnd, - EntropyKeys, RewardAddr, 0, 0) - end, - ?LOG_DEBUG([{event, do_prepare_replica_2_9}, {store_id, StoreID}, - {start, Start}, {bucket_end_offset, BucketEndOffset}, - {range_end, RangeEnd}, {padded_range_end, PaddedRangeEnd}, - {sub_chunk_start, SubChunkStart}, - {check_is_recorded, CheckIsRecorded}, {store_entropy, StoreEntropy}]), - case StoreEntropy of - complete -> - {noreply, State#state{ is_prepared = true }}; - waiting_for_repack -> - ?LOG_INFO([{event, waiting_for_repacking}, - {store_id, StoreID}, - {bucket_end_offset, BucketEndOffset}, - {repack_cursor, RepackCursor}, - {cursor, Start}, - {range_start, RangeStart}, - {range_end, RangeEnd}]), - ar_util:cast_after(10000, self(), do_prepare_replica_2_9), - {noreply, State}; - is_recorded -> - gen_server:cast(self(), do_prepare_replica_2_9), - {noreply, State2}; - {error, Error} -> - ?LOG_WARNING([{event, failed_to_store_replica_2_9_entropy}, - {cursor, Start}, - {store_id, StoreID}, - {reason, io_lib:format("~p", [Error])}]), - ar_util:cast_after(500, self(), do_prepare_replica_2_9), - {noreply, State}; - {ok, SubChunksStored} -> - ?LOG_DEBUG([{event, stored_replica_2_9_entropy}, - {sub_chunks_stored, SubChunksStored}, - {store_id, StoreID}, - {cursor, Start}, - {bucket_end_offset, BucketEndOffset}]), - gen_server:cast(self(), do_prepare_replica_2_9), - case store_prepare_replica_2_9_cursor(Cursor2, StoreID) of - ok -> - ok; - {error, Error} -> - ?LOG_WARNING([{event, failed_to_store_prepare_replica_2_9_cursor}, - {chunk_cursor, Start2}, - {sub_chunk_cursor, SubChunkStart2}, - {store_id, StoreID}, - {reason, io_lib:format("~p", [Error])}]) - end, - {noreply, State2} - end; +handle_cast(prepare_replica_2_9, State) -> + #state{ store_id = StoreID } = State, + NewStatus = ar_device_lock:acquire_lock(prepare, StoreID, State#state.prepare_status), + State2 = State#state{ prepare_status = NewStatus }, + State3 = case NewStatus of + active -> + do_prepare_replica_2_9(State2); + paused -> + ar_util:cast_after(?DEVICE_LOCK_WAIT, self(), prepare_replica_2_9), + State2; + _ -> + State2 + end, + {noreply, State3}; -handle_cast(store_repack_cursor, #state{ repacking_complete = true } = State) -> +handle_cast(store_repack_cursor, #state{ repack_status = complete } = State) -> {noreply, State}; handle_cast(store_repack_cursor, #state{ repack_cursor = Cursor, store_id = StoreID, @@ -544,19 +408,42 @@ handle_cast(store_repack_cursor, {noreply, State}; handle_cast(repacking_complete, State) -> - {noreply, State#state{ repacking_complete = true }}; + #state{ store_id = StoreID } = State, + ar_device_lock:release_lock(repack, StoreID), + {noreply, State#state{ repack_status = complete }}; handle_cast({repack, Packing}, #state{ store_id = StoreID, repack_cursor = Cursor, 
range_start = RangeStart, range_end = RangeEnd } = State) -> - spawn(fun() -> ar_repack:repack(Cursor, RangeStart, RangeEnd, Packing, StoreID) end), - {noreply, State}; + NewStatus = ar_device_lock:acquire_lock(repack, StoreID, State#state.repack_status), + State2 = State#state{ repack_status = NewStatus }, + case NewStatus of + active -> + spawn(fun() -> + ar_repack:repack(Cursor, RangeStart, RangeEnd, Packing, StoreID) end); + paused -> + ar_util:cast_after(?DEVICE_LOCK_WAIT, self(), {repack, Packing}); + _ -> + ok + end, + {noreply, State2}; -handle_cast({repack, Cursor, RangeStart, RangeEnd, Packing}, - #state{ store_id = StoreID } = State) -> +handle_cast({repack, Cursor, RangeStart, RangeEnd, Packing}, State) -> + #state{ store_id = StoreID } = State, gen_server:cast(self(), store_repack_cursor), - spawn(fun() -> ar_repack:repack(Cursor, RangeStart, RangeEnd, Packing, StoreID) end), - {noreply, State#state{ repack_cursor = Cursor }}; + NewStatus = ar_device_lock:acquire_lock(repack, StoreID, State#state.repack_status), + State2 = State#state{ repack_status = NewStatus, repack_cursor = Cursor }, + case NewStatus of + active -> + spawn(fun() -> + ar_repack:repack(Cursor, RangeStart, RangeEnd, Packing, StoreID) end); + paused -> + ar_util:cast_after(?DEVICE_LOCK_WAIT, self(), + {repack, Cursor, RangeStart, RangeEnd, Packing}); + _ -> + ok + end, + {noreply, State2}; handle_cast({register_packing_ref, Ref, Args}, #state{ packing_map = Map } = State) -> {noreply, State#state{ packing_map = maps:put(Ref, Args, Map) }}; @@ -568,16 +455,21 @@ handle_cast(Cast, State) -> ?LOG_WARNING([{event, unhandled_cast}, {module, ?MODULE}, {cast, Cast}]), {noreply, State}. -handle_call(is_prepared, _From, #state{ is_prepared = IsPrepared } = State) -> - {reply, IsPrepared, State}; - handle_call({put, PaddedEndOffset, Chunk, Packing}, _From, State) when byte_size(Chunk) == ?DATA_CHUNK_SIZE -> - case store_chunk(PaddedEndOffset, Chunk, Packing, State) of + #state{ store_id = StoreID, store_id_label = StoreIDLabel, reward_addr = RewardAddr, + prepare_status = PrepareStatus, file_index = FileIndex } = State, + + IsPrepared = PrepareStatus == complete, + {PackingLabel, State2} = get_packing_label(Packing, State), + Result = store_chunk( + PaddedEndOffset, Chunk, Packing, StoreID, + StoreIDLabel, PackingLabel, FileIndex, IsPrepared, RewardAddr), + case Result of {ok, FileIndex2, NewPacking} -> - {reply, {ok, NewPacking}, State#state{ file_index = FileIndex2 }}; + {reply, {ok, NewPacking}, State2#state{ file_index = FileIndex2 }}; Error -> - {reply, Error, State} + {reply, Error, State2} end; handle_call({delete, PaddedEndOffset}, _From, State) -> @@ -621,11 +513,16 @@ handle_info({chunk, {packed, Ref, ChunkArgs}}, #state{ packing_map = Map } = State) -> case maps:get(Ref, Map, not_found) of not_found -> + {Packing, _, Offset, _, ChunkSize} = ChunkArgs, + ?LOG_WARNING([{event, chunk_repack_request_not_found}, + {offset, Offset}, {chunk_size, ChunkSize}, + {packing, ar_serialize:encode_packing(Packing, true)}]), {noreply, State}; Args -> State2 = State#state{ packing_map = maps:remove(Ref, Map) }, - #state{ store_id = StoreID, reward_addr = RewardAddr, is_prepared = IsPrepared, - file_index = FileIndex } = State2, + #state{ store_id = StoreID, reward_addr = RewardAddr, + prepare_status = PrepareStatus, file_index = FileIndex } = State2, + IsPrepared = PrepareStatus == complete, case ar_repack:chunk_repacked( ChunkArgs, Args, StoreID, FileIndex, IsPrepared, RewardAddr) of {ok, FileIndex2} -> @@ -637,12 +534,21 @@ 
handle_info({chunk, {packed, Ref, ChunkArgs}}, end; handle_info({Ref, _Reply}, State) when is_reference(Ref) -> + ?LOG_ERROR([{event, stale_gen_server_call_reply}, {ref, Ref}, {reply, _Reply}]), %% A stale gen_server:call reply. {noreply, State}; handle_info({'EXIT', _PID, normal}, State) -> {noreply, State}; +handle_info({entropy_generated, _Ref, {error, Reason}}, State) -> + ?LOG_ERROR([{event, failed_to_generate_replica_2_9_entropy_and_timeout}, + {error, Reason}]), + {noreply, State}; +handle_info({entropy_generated, _Ref, _Entropy}, State) -> + ?LOG_WARNING([{event, entropy_generation_timed_out}]), + {noreply, State}; + handle_info(Info, State) -> ?LOG_ERROR([{event, unhandled_info}, {info, io_lib:format("~p", [Info])}]), {noreply, State}. @@ -656,18 +562,198 @@ terminate(_Reason, #state{ repack_cursor = Cursor, store_id = StoreID, %%%=================================================================== %%% Private functions. %%%=================================================================== - get_chunk_group_size() -> {ok, Config} = application:get_env(arweave, config), Config#config.chunk_storage_file_size. +do_prepare_replica_2_9(State) -> + #state{ reward_addr = RewardAddr, prepare_replica_2_9_cursor = Start, + range_start = RangeStart, range_end = RangeEnd, + store_id = StoreID, repack_cursor = RepackCursor, + prepare_slice_index = PreviousSliceIndex } = State, + + BucketEndOffset = get_chunk_bucket_end(Start), + PaddedRangeEnd = get_chunk_bucket_end(RangeEnd), + + %% Sanity checks: + BucketEndOffset = get_chunk_bucket_end(BucketEndOffset), + true = ( + get_chunk_bucket_start(ar_block:get_chunk_padded_offset(Start)) == + get_chunk_bucket_start(BucketEndOffset) + ), + true = ( + max(0, BucketEndOffset - ?DATA_CHUNK_SIZE) == get_chunk_bucket_start(BucketEndOffset) + ), + %% End of sanity checks. + + SliceIndex = ar_replica_2_9:get_slice_index(BucketEndOffset), + case SliceIndex of + _ when SliceIndex /= PreviousSliceIndex -> + %% Whenever the slice changes BucketEndOffset might be an offset that was + %% written to in a previous iteration. Furthermore it's possible (though unlikely), + %% that the write is still in process. So to make sure our "is recorded" checks + %% below consider all pending writes, we'll wait for the entropy storage process + %% to complete before proceeding. + %% + %% In practice we only expect pending writes to be a problem in tests. It can + %% hypothetically happen in production but is unlikely. 
+ ?LOG_DEBUG([{event, prepare_replica_2_9_slice_changed}, {store_id, StoreID}, + {bucket_end_offset, BucketEndOffset}, + {previous_slice_index, PreviousSliceIndex}, + {slice_index, SliceIndex}]), + ar_entropy_storage:is_ready(StoreID); + _ -> + ok + end, + + CheckRangeEnd = + case BucketEndOffset > PaddedRangeEnd of + true -> + ar_device_lock:release_lock(prepare, StoreID), + ?LOG_INFO([{event, storage_module_replica_2_9_preparation_complete}, + {store_id, StoreID}]), + ar:console("The storage module ~s is prepared for 2.9 replication.~n", + [StoreID]), + complete; + false -> + false + end, + + Start2 = BucketEndOffset + ?DATA_CHUNK_SIZE, + State2 = State#state{ + prepare_replica_2_9_cursor = Start2, + prepare_slice_index = SliceIndex }, + CheckRepackCursor = + case CheckRangeEnd of + complete -> + complete; + false -> + case RepackCursor of + none -> + false; + _ -> + SectorSize = ar_replica_2_9:get_sector_size(), + RangeStart2 = get_chunk_bucket_start(RangeStart + 1), + RepackCursor2 = get_chunk_bucket_start(RepackCursor + 1), + RepackSectorShift = (RepackCursor2 - RangeStart2) rem SectorSize, + SectorShift = (BucketEndOffset - RangeStart2) rem SectorSize, + case SectorShift > RepackSectorShift of + true -> + waiting_for_repack; + false -> + false + end + end + end, + CheckIsRecorded = + case CheckRepackCursor of + complete -> + complete; + waiting_for_repack -> + waiting_for_repack; + false -> + ar_entropy_storage:is_entropy_recorded(BucketEndOffset, StoreID) + end, + + %% get_entropy_partition will use bucket *start* offset to determine the partition. + Partition = ar_replica_2_9:get_entropy_partition(BucketEndOffset), + StoreEntropy = + case CheckIsRecorded of + complete -> + complete; + waiting_for_repack -> + waiting_for_repack; + true -> + is_recorded; + false -> + %% Get all the entropies needed to encipher the chunk at BucketEndOffset. + Entropies = prometheus_histogram:observe_duration( + replica_2_9_entropy_duration_milliseconds, [32], + fun() -> + ar_entropy_storage:generate_entropies(RewardAddr, BucketEndOffset) + end), + case Entropies of + {error, Reason} -> + {error, Reason}; + _ -> + EntropyKeys = ar_entropy_storage:generate_entropy_keys( + RewardAddr, BucketEndOffset), + + %% A set of generated entropies covers slighly more than 3.6TB of + %% chunks, however we only want to use the first 3.6TB + %% (+ chunk padding) of it. + PartitionEnd = (Partition + 1) * ?PARTITION_SIZE, + PaddedPartitionEnd = + get_chunk_bucket_end( + ar_block:get_chunk_padded_offset(PartitionEnd)), + %% In addition to limiting this iteration to the PaddedPartitionEnd, + %% we also want to limit it to the current storage module's range. + %% This allows us to handle both the storage module range as well + %% as the small overlap region. + IterationEnd = min(PaddedPartitionEnd, RangeEnd), + %% Wait for the previous store_entropy to complete. Should only + %% return 'false' if the entropy storage process is down (e.g. 
during + %% shutdown) + case ar_entropy_storage:is_ready(StoreID) of + true -> + ar_entropy_storage:store_entropy( + StoreID, Entropies, BucketEndOffset, + IterationEnd, EntropyKeys, RewardAddr); + false -> + {error, entropy_storage_not_ready} + end + end + end, + ?LOG_DEBUG([{event, stored_replica_2_9_entropy}, {store_id, StoreID}, + {start, Start}, {bucket_end_offset, BucketEndOffset}, + {slice_index, ar_replica_2_9:get_slice_index(BucketEndOffset)}, + {range_start, RangeStart}, {range_end, RangeEnd}, + {partition, Partition}, + {repack_cursor, RepackCursor}, + {padded_range_end, PaddedRangeEnd}, + {check_is_recorded, CheckIsRecorded}, {store_entropy, StoreEntropy}]), + case StoreEntropy of + complete -> + State#state{ prepare_status = complete }; + waiting_for_repack -> + ?LOG_INFO([{event, waiting_for_repacking}, + {store_id, StoreID}, + {padded_end_offset, BucketEndOffset}, + {repack_cursor, RepackCursor}, + {cursor, Start}, + {range_start, RangeStart}, + {range_end, RangeEnd}]), + ar_util:cast_after(10000, self(), prepare_replica_2_9), + State; + is_recorded -> + gen_server:cast(self(), prepare_replica_2_9), + State2; + {error, Error} -> + ?LOG_WARNING([{event, failed_to_store_replica_2_9_entropy}, + {cursor, Start}, + {store_id, StoreID}, + {reason, io_lib:format("~p", [Error])}]), + ar_util:cast_after(500, self(), prepare_replica_2_9), + State; + ok -> + gen_server:cast(self(), prepare_replica_2_9), + case store_prepare_replica_2_9_cursor(Start2, StoreID) of + ok -> + ok; + {error, Error} -> + ?LOG_WARNING([{event, failed_to_store_prepare_replica_2_9_cursor}, + {chunk_cursor, Start2}, + {store_id, StoreID}, + {reason, io_lib:format("~p", [Error])}]) + end, + State2 + end. + read_prepare_replica_2_9_cursor(StoreID, Default) -> Filepath = get_filepath("prepare_replica_2_9_cursor", StoreID), case file:read_file(Filepath) of {ok, Bin} -> - case catch binary_to_term(Bin) of - {ChunkCursor, SubChunkCursor} = Cursor - when is_integer(ChunkCursor), is_integer(SubChunkCursor) -> + case catch binary_to_term(Bin) of Cursor when is_integer(Cursor) -> Cursor; _ -> Default @@ -686,24 +772,31 @@ get_filepath(Name, StoreID) -> ChunkDir = get_chunk_storage_path(DataDir, StoreID), filename:join([ChunkDir, Name]). -store_chunk(PaddedEndOffset, Chunk, Packing, State) -> - #state{ store_id = StoreID, reward_addr = RewardAddr, is_prepared = IsPrepared, - file_index = FileIndex } = State, - store_chunk(PaddedEndOffset, Chunk, Packing, StoreID, FileIndex, IsPrepared, RewardAddr). - store_chunk(PaddedEndOffset, Chunk, Packing, StoreID, FileIndex, IsPrepared, RewardAddr) -> + StoreIDLabel = ar_storage_module:label_by_id(StoreID), + PackingLabel = ar_storage_module:packing_label(Packing), + store_chunk(PaddedEndOffset, Chunk, Packing, StoreID, + StoreIDLabel, PackingLabel, FileIndex, IsPrepared, RewardAddr). + +store_chunk( + PaddedEndOffset, Chunk, Packing, StoreID, StoreIDLabel, + PackingLabel, FileIndex, IsPrepared, RewardAddr) -> case ar_entropy_storage:is_entropy_packing(Packing) of true -> ar_entropy_storage:record_chunk( - PaddedEndOffset, Chunk, RewardAddr, StoreID, FileIndex, IsPrepared); + PaddedEndOffset, Chunk, RewardAddr, StoreID, + StoreIDLabel, PackingLabel, FileIndex, IsPrepared); false -> - record_chunk(PaddedEndOffset, Chunk, Packing, StoreID, FileIndex) + record_chunk( + PaddedEndOffset, Chunk, Packing, StoreID, + StoreIDLabel, PackingLabel, FileIndex) end. 
-record_chunk(PaddedEndOffset, Chunk, Packing, StoreID, FileIndex) -> +record_chunk( + PaddedEndOffset, Chunk, Packing, StoreID, StoreIDLabel, PackingLabel, FileIndex) -> case write_chunk(PaddedEndOffset, Chunk, FileIndex, StoreID) of {ok, Filepath} -> - prometheus_counter:inc(chunks_stored, [Packing]), + prometheus_counter:inc(chunks_stored, [PackingLabel, StoreIDLabel]), case ar_sync_record:add( PaddedEndOffset, PaddedEndOffset - ?DATA_CHUNK_SIZE, sync_record_id(Packing), StoreID) of @@ -713,15 +806,6 @@ record_chunk(PaddedEndOffset, Chunk, Packing, StoreID, FileIndex) -> {{ChunkFileStart, StoreID}, Filepath}), {ok, maps:put(ChunkFileStart, Filepath, FileIndex), Packing}; Error -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, error_adding_sync_record}, - {error, io_lib:format("~p", [Error])}, - {sync_record_id, sync_record_id(Packing)}, - {padded_offset, PaddedEndOffset}, - {packing, ar_serialize:encode_packing(Packing, true)}, - {store_id, StoreID}, - {filepath, Filepath} - ]), Error end; Error2 -> @@ -743,21 +827,11 @@ get_chunk_file_start(EndOffset) -> get_chunk_file_start_by_start_offset(StartOffset) -> ar_util:floor_int(StartOffset, get_chunk_group_size()). -get_chunk_bucket_end(PaddedEndOffset) -> - get_chunk_bucket_start(PaddedEndOffset) + ?DATA_CHUNK_SIZE. - write_chunk(PaddedOffset, Chunk, FileIndex, StoreID) -> {_ChunkFileStart, Filepath, Position, ChunkOffset} = locate_chunk_on_disk(PaddedOffset, StoreID, FileIndex), case get_handle_by_filepath(Filepath) of {error, _} = Error -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, error_opening_chunk_file}, - {error, io_lib:format("~p", [Error])}, - {padded_offset, PaddedOffset}, - {store_id, StoreID}, - {filepath, Filepath} - ]), Error; F -> write_chunk2(PaddedOffset, ChunkOffset, Chunk, Filepath, F, Position) @@ -793,7 +867,7 @@ get_handle_by_filepath(Filepath) -> F end. -write_chunk2(PaddedOffset, ChunkOffset, Chunk, Filepath, F, Position) -> +write_chunk2(_PaddedOffset, ChunkOffset, Chunk, Filepath, F, Position) -> ChunkOffsetBinary = case ChunkOffset of 0 -> @@ -806,15 +880,7 @@ write_chunk2(PaddedOffset, ChunkOffset, Chunk, Filepath, F, Position) -> end, Result = file:pwrite(F, Position, [ChunkOffsetBinary | Chunk]), case Result of - {error, Reason} = Error -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, error_writing_chunk_to_file}, - {error, io_lib:format("~p", [Reason])}, - {padded_offset, PaddedOffset}, - {chunk_offset, ChunkOffset}, - {filepath, Filepath}, - {position, Position} - ]), + {error, _Reason} = Error -> Error; ok -> {ok, Filepath} @@ -938,6 +1004,11 @@ extract_end_offset_chunk_pairs( [{EndOffset, Chunk} | extract_end_offset_chunk_pairs(Rest, BucketStart, Shift + 1)]; extract_end_offset_chunk_pairs(<<>>, _BucketStart, _Shift) -> + []; +extract_end_offset_chunk_pairs(<< ChunkOffset:?OFFSET_BIT_SIZE, Chunk/binary >>, + BucketStart, Shift) -> + ?LOG_ERROR([{event, unexpected_chunk_data}, {chunk_offset, ChunkOffset}, + {bucket_start, BucketStart}, {shift, Shift}, {chunk_size, byte_size(Chunk)}]), []. is_offset_valid(_Byte, _BucketStart, 0) -> @@ -1075,59 +1146,98 @@ read_chunks_sizes(DataDir) -> modules_to_defrag(#config{defragmentation_modules = [_ | _] = Modules}) -> Modules; modules_to_defrag(#config{storage_modules = Modules}) -> Modules. --ifdef(AR_TEST). -try_acquire_replica_2_9_formatting_lock(_StoreID) -> - true. --else. 
-try_acquire_replica_2_9_formatting_lock(StoreID) -> - case ets:insert_new(ar_chunk_storage, {update_replica_2_9_lock}) of - true -> - Count = get_replica_2_9_acquired_locks_count(), - {ok, Config} = application:get_env(arweave, config), - MaxWorkers = Config#config.replica_2_9_workers, - case Count + 1 > MaxWorkers of - true -> - ets:delete(ar_chunk_storage, update_replica_2_9_lock), - false; - false -> - ets:update_counter(ar_chunk_storage, replica_2_9_acquired_locks_count, - 1, {replica_2_9_acquired_locks_count, 0}), - ets:delete(ar_chunk_storage, update_replica_2_9_lock), - true - end; - false -> - try_acquire_replica_2_9_formatting_lock(StoreID) - end. --endif. - -get_replica_2_9_acquired_locks_count() -> - case ets:lookup(ar_chunk_storage, replica_2_9_acquired_locks_count) of - [] -> - 0; - [{_, Count}] -> - Count - end. - -release_replica_2_9_formatting_lock(StoreID) -> - case ets:insert_new(ar_chunk_storage, {update_replica_2_9_lock}) of - true -> - Count = get_replica_2_9_acquired_locks_count(), - case Count of - 0 -> - ok; - _ -> - ets:update_counter(ar_chunk_storage, replica_2_9_acquired_locks_count, - -1, {replica_2_9_acquired_locks_count, 0}) - end, - ets:delete(ar_chunk_storage, update_replica_2_9_lock); - false -> - release_replica_2_9_formatting_lock(StoreID) +get_packing_label(Packing, State) -> + case maps:get(Packing, State#state.packing_labels, not_found) of + not_found -> + Label = ar_storage_module:packing_label(Packing), + Map = maps:put(Packing, Label, State#state.packing_labels), + {Label, State#state{ packing_labels = Map }}; + Label -> + {Label, State} end. %%%=================================================================== %%% Tests. %%%=================================================================== +chunk_bucket_test() -> + ?assertEqual(786432, ?STRICT_DATA_SPLIT_THRESHOLD), + + %% get_chunk_bucket_end pads the provided offset + %% get_chunk_bucket_start does not padd the provided offset + + %% At and before the STRICT_DATA_SPLIT_THRESHOLD, offsets are not padded. + ?assertEqual(262144, get_chunk_bucket_end(0)), + ?assertEqual(0, get_chunk_bucket_start(0)), + + ?assertEqual(262144, get_chunk_bucket_end(1)), + ?assertEqual(0, get_chunk_bucket_start(1)), + + ?assertEqual(262144, get_chunk_bucket_end(?DATA_CHUNK_SIZE - 1)), + ?assertEqual(0, get_chunk_bucket_start(?DATA_CHUNK_SIZE - 1)), + + ?assertEqual(262144, get_chunk_bucket_end(?DATA_CHUNK_SIZE)), + ?assertEqual(0, get_chunk_bucket_start(?DATA_CHUNK_SIZE)), + + ?assertEqual(262144, get_chunk_bucket_end(?DATA_CHUNK_SIZE + 1)), + ?assertEqual(0, get_chunk_bucket_start(?DATA_CHUNK_SIZE + 1)), + + ?assertEqual(524288, get_chunk_bucket_end(2 * ?DATA_CHUNK_SIZE)), + ?assertEqual(262144, get_chunk_bucket_start(2 * ?DATA_CHUNK_SIZE)), + + ?assertEqual(524288, get_chunk_bucket_end(2 * ?DATA_CHUNK_SIZE + 1)), + ?assertEqual(262144, get_chunk_bucket_start(2 * ?DATA_CHUNK_SIZE + 1)), + + ?assertEqual(524288, get_chunk_bucket_end(3 * ?DATA_CHUNK_SIZE - 1)), + ?assertEqual(262144, get_chunk_bucket_start(3 * ?DATA_CHUNK_SIZE - 1)), + + ?assertEqual(786432, get_chunk_bucket_end(3 * ?DATA_CHUNK_SIZE)), + ?assertEqual(524288, get_chunk_bucket_start(3 * ?DATA_CHUNK_SIZE)), + + %% After the STRICT_DATA_SPLIT_THRESHOLD, offsets are padded. 
+ ?assertEqual(1048576, get_chunk_bucket_end(3 * ?DATA_CHUNK_SIZE + 1)), + ?assertEqual(524288, get_chunk_bucket_start(3 * ?DATA_CHUNK_SIZE + 1)), + + ?assertEqual(1048576, get_chunk_bucket_end(4 * ?DATA_CHUNK_SIZE - 1)), + ?assertEqual(524288, get_chunk_bucket_start(4 * ?DATA_CHUNK_SIZE - 1)), + + ?assertEqual(1048576, get_chunk_bucket_end(4 * ?DATA_CHUNK_SIZE)), + ?assertEqual(786432, get_chunk_bucket_start(4 * ?DATA_CHUNK_SIZE)), + + ?assertEqual(1310720, get_chunk_bucket_end(4 * ?DATA_CHUNK_SIZE + 1)), + ?assertEqual(786432, get_chunk_bucket_start(4 * ?DATA_CHUNK_SIZE + 1)), + + ?assertEqual(1310720, get_chunk_bucket_end(5 * ?DATA_CHUNK_SIZE - 1)), + ?assertEqual(786432, get_chunk_bucket_start(5 * ?DATA_CHUNK_SIZE - 1)), + + ?assertEqual(1310720, get_chunk_bucket_end(5 * ?DATA_CHUNK_SIZE)), + ?assertEqual(1048576, get_chunk_bucket_start(5 * ?DATA_CHUNK_SIZE)), + + ?assertEqual(1572864, get_chunk_bucket_end(5 * ?DATA_CHUNK_SIZE + 1)), + ?assertEqual(1048576, get_chunk_bucket_start(5 * ?DATA_CHUNK_SIZE + 1)), + + ?assertEqual(1572864, get_chunk_bucket_end(6 * ?DATA_CHUNK_SIZE - 1)), + ?assertEqual(1048576, get_chunk_bucket_start(6 * ?DATA_CHUNK_SIZE - 1)), + + ?assertEqual(1572864, get_chunk_bucket_end(6 * ?DATA_CHUNK_SIZE)), + ?assertEqual(1310720, get_chunk_bucket_start(6 * ?DATA_CHUNK_SIZE)), + + ?assertEqual(1835008, get_chunk_bucket_end(6 * ?DATA_CHUNK_SIZE + 1)), + ?assertEqual(1310720, get_chunk_bucket_start(6 * ?DATA_CHUNK_SIZE + 1)), + + ?assertEqual(1835008, get_chunk_bucket_end(7 * ?DATA_CHUNK_SIZE)), + ?assertEqual(1572864, get_chunk_bucket_start(7 * ?DATA_CHUNK_SIZE)), + + ?assertEqual(2097152, get_chunk_bucket_end(8 * ?DATA_CHUNK_SIZE)), + ?assertEqual(1835008, get_chunk_bucket_start(8 * ?DATA_CHUNK_SIZE)), + + ?assertEqual(2359296, get_chunk_bucket_end(9 * ?DATA_CHUNK_SIZE)), + ?assertEqual(2097152, get_chunk_bucket_start(9 * ?DATA_CHUNK_SIZE)), + + ?assertEqual(2621440, get_chunk_bucket_end(10 * ?DATA_CHUNK_SIZE)), + ?assertEqual(2359296, get_chunk_bucket_start(10 * ?DATA_CHUNK_SIZE)). + + replica_2_9_test_() -> {timeout, 20, fun test_replica_2_9/0}. 
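The chunk_bucket_test/0 above pins the bucket arithmetic down numerically. The assertions are consistent with the sketch below. This is not the patch's implementation (the real code builds on ar_util:floor_int/2 and ar_block:get_chunk_padded_offset/1); it is only a compact restatement of what the test asserts, assuming ?DATA_CHUNK_SIZE = 262144 and a strict data split threshold of three chunks.

%% Sketch only: closed-form equivalents of the values asserted in chunk_bucket_test/0.
-define(DATA_CHUNK_SIZE, 262144).
-define(STRICT_DATA_SPLIT_THRESHOLD, 786432).

%% Bucket start: shift the offset back by one chunk (clamped at zero) and floor it
%% to a chunk boundary. No padding is applied.
chunk_bucket_start(Offset) ->
    Shifted = max(0, Offset - ?DATA_CHUNK_SIZE),
    Shifted - (Shifted rem ?DATA_CHUNK_SIZE).

%% Bucket end: pad the offset first (padding only kicks in past the strict data
%% split threshold), then take that padded offset's bucket start plus one chunk.
chunk_bucket_end(Offset) ->
    chunk_bucket_start(padded_offset(Offset)) + ?DATA_CHUNK_SIZE.

padded_offset(Offset) when Offset =< ?STRICT_DATA_SPLIT_THRESHOLD ->
    Offset;
padded_offset(Offset) ->
    ((Offset + ?DATA_CHUNK_SIZE - 1) div ?DATA_CHUNK_SIZE) * ?DATA_CHUNK_SIZE.

Spot check: chunk_bucket_end(3 * 262144 + 1) returns 1048576 and chunk_bucket_start(3 * 262144 + 1) returns 524288, matching the first pair of post-threshold assertions above.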
diff --git a/apps/arweave/src/ar_chunk_storage_sup.erl b/apps/arweave/src/ar_chunk_storage_sup.erl index 7a8579d99..7680e35ee 100644 --- a/apps/arweave/src/ar_chunk_storage_sup.erl +++ b/apps/arweave/src/ar_chunk_storage_sup.erl @@ -23,28 +23,48 @@ start_link() -> init([]) -> ets:new(chunk_storage_file_index, [set, public, named_table, {read_concurrency, true}]), {ok, Config} = application:get_env(arweave, config), - ConfiguredWorkers = lists:map( - fun(StorageModule) -> - StoreID = ar_storage_module:id(StorageModule), - Label = ar_storage_module:label(StorageModule), - Name = list_to_atom("ar_chunk_storage_" ++ Label), - ?CHILD_WITH_ARGS(ar_chunk_storage, worker, Name, [Name, {StoreID, none}]) - end, - Config#config.storage_modules + ConfiguredWorkers = lists:flatten( + lists:map( + fun(StorageModule) -> + StoreID = ar_storage_module:id(StorageModule), + + ChunkStorageName = ar_chunk_storage:name(StoreID), + ChunkStorageWorker = ?CHILD_WITH_ARGS(ar_chunk_storage, worker, + ChunkStorageName, [ChunkStorageName, {StoreID, none}]), + + EntropyStorageName = ar_entropy_storage:name(StoreID), + EntropyStorageWorker = ?CHILD_WITH_ARGS(ar_entropy_storage, worker, + EntropyStorageName, [EntropyStorageName, StoreID]), + + [ChunkStorageWorker, EntropyStorageWorker] + end, + Config#config.storage_modules + ) ), + DefaultChunkStorageWorker = ?CHILD_WITH_ARGS(ar_chunk_storage, worker, ar_chunk_storage_default, [ar_chunk_storage_default, {"default", none}]), - RepackInPlaceWorkers = lists:map( - fun({StorageModule, Packing}) -> - StoreID = ar_storage_module:id(StorageModule), - %% Note: the config validation will prevent a StoreID from being used in both - %% `storage_modules` and `repack_in_place_storage_modules`, so there's - %% no risk of a `Name` clash with the workers spawned above. - Label = ar_storage_module:label(StorageModule), - Name = list_to_atom("ar_chunk_storage_" ++ Label), - ?CHILD_WITH_ARGS(ar_chunk_storage, worker, Name, [Name, {StoreID, Packing}]) - end, - Config#config.repack_in_place_storage_modules + + RepackInPlaceWorkers = lists:flatten( + lists:map( + fun({StorageModule, Packing}) -> + StoreID = ar_storage_module:id(StorageModule), + %% Note: the config validation will prevent a StoreID from being used in both + %% `storage_modules` and `repack_in_place_storage_modules`, so there's + %% no risk of a `Name` clash with the workers spawned above. + ChunkStorageName = ar_chunk_storage:name(StoreID), + ChunkStorageWorker = ?CHILD_WITH_ARGS(ar_chunk_storage, worker, + ChunkStorageName, [ChunkStorageName, {StoreID, Packing}]), + + EntropyStorageName = ar_entropy_storage:name(StoreID), + EntropyStorageWorker = ?CHILD_WITH_ARGS(ar_entropy_storage, worker, + EntropyStorageName, [EntropyStorageName, StoreID]), + + [ChunkStorageWorker, EntropyStorageWorker] + end, + Config#config.repack_in_place_storage_modules + ) ), + Workers = [DefaultChunkStorageWorker] ++ ConfiguredWorkers ++ RepackInPlaceWorkers, {ok, {{one_for_one, 5, 10}, Workers}}. 
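Each configured storage module (and each repack-in-place module) now gets a matched pair of workers keyed by the same StoreID. Spelled out without the ?CHILD_WITH_ARGS macro, whose expansion is not part of this patch, the per-module pair looks roughly like the sketch below; the child-spec map fields and the start_link arities are assumptions, only the module names and argument shapes are taken from the diff above.

%% Sketch only: the chunk-storage / entropy-storage worker pair created per
%% storage module in ar_chunk_storage_sup:init/1 above, written as plain
%% child-spec maps. Everything except the start arguments is an assumption.
worker_pair(StorageModule, Packing) ->
    StoreID = ar_storage_module:id(StorageModule),
    ChunkName = ar_chunk_storage:name(StoreID),
    EntropyName = ar_entropy_storage:name(StoreID),
    [#{ id => ChunkName,
        start => {ar_chunk_storage, start_link, [ChunkName, {StoreID, Packing}]},
        type => worker },
     #{ id => EntropyName,
        start => {ar_entropy_storage, start_link, [EntropyName, StoreID]},
        type => worker }].

Because every module contributes a two-element list, the surrounding lists:flatten/1 in the diff is what keeps the supervisor's child list flat.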
diff --git a/apps/arweave/src/ar_config.erl b/apps/arweave/src/ar_config.erl index 0f2357a9c..ebed2ff57 100644 --- a/apps/arweave/src/ar_config.erl +++ b/apps/arweave/src/ar_config.erl @@ -250,6 +250,16 @@ parse_options([{<<"join_workers">>, N} | Rest], Config) when is_integer(N)-> parse_options([{<<"join_workers">>, Opt} | _], _) -> {error, {bad_type, join_workers, number}, Opt}; +parse_options([{<<"packing_workers">>, N} | Rest], Config) when is_integer(N)-> + parse_options(Rest, Config#config{ packing_workers = N }); +parse_options([{<<"packing_workers">>, Opt} | _], _) -> + {error, {bad_type, packing_workers, number}, Opt}; + +parse_options([{<<"replica_2_9_workers">>, N} | Rest], Config) when is_integer(N)-> + parse_options(Rest, Config#config{ replica_2_9_workers = N }); +parse_options([{<<"replica_2_9_workers">>, Opt} | _], _) -> + {error, {bad_type, replica_2_9_workers, number}, Opt}; + parse_options([{<<"diff">>, Diff} | Rest], Config) when is_integer(Diff) -> parse_options(Rest, Config#config{ diff = Diff }); parse_options([{<<"diff">>, Diff} | _], _) -> @@ -518,7 +528,9 @@ parse_options([{<<"disk_cache_size_mb">>, D} | Rest], Config) when is_integer(D) parse_options(Rest, Config#config{ disk_cache_size = D }); parse_options([{<<"packing_rate">>, D} | Rest], Config) when is_integer(D) -> - parse_options(Rest, Config#config{ packing_rate = D }); + ?LOG_WARNING("Deprecated option found 'packing_rate': " + " this option has been removed and is a no-op.", []), + parse_options(Rest, Config); parse_options([{<<"max_nonce_limiter_validation_thread_count">>, D} | Rest], Config) when is_integer(D) -> @@ -925,22 +937,25 @@ validate_repack_in_place(Config) -> validate_repack_in_place([], _Modules) -> true; -validate_repack_in_place([{Module, _ToPacking} | L], Modules) -> +validate_repack_in_place([{Module, ToPacking} | L], Modules) -> {_BucketSize, _Bucket, Packing} = Module, ID = ar_storage_module:id(Module), - PackingType = ar_mining_server:get_packing_type(Packing), ModuleInUse = lists:member(ID, Modules), - RepackingFromReplica29 = PackingType == replica_2_9, - case {ModuleInUse, RepackingFromReplica29} of - {true, _} -> + FromPackingType = ar_mining_server:get_packing_type(Packing), + ToPackingType = ar_mining_server:get_packing_type(ToPacking), + case {ModuleInUse, FromPackingType, ToPackingType} of + {true, _, _} -> io:format("~nCannot use the storage module ~s " "while it is being repacked in place.~n~n", [ID]), false; - {_, true} -> + {_, replica_2_9, _} -> io:format("~nCannot repack in place from replica_2_9 to any format.~n~n"), false; + {_, _, replica_2_9} -> + validate_repack_in_place(L, Modules); _ -> - validate_repack_in_place(L, Modules) + io:format("~nCan only repack in place to replica_2_9.~n~n"), + false end. validate_cm_pool(Config) -> diff --git a/apps/arweave/src/ar_data_sync.erl b/apps/arweave/src/ar_data_sync.erl index 49fc1eb6c..d701cb7e3 100644 --- a/apps/arweave/src/ar_data_sync.erl +++ b/apps/arweave/src/ar_data_sync.erl @@ -2,7 +2,7 @@ -behaviour(gen_server). --export([name/1, start_link/2, join/1, add_tip_block/2, add_block/2, +-export([name/1, start_link/2, register_workers/0, join/1, add_tip_block/2, add_block/2, invalidate_bad_data_record/4, is_chunk_proof_ratio_attractive/3, add_chunk/5, add_data_root_to_disk_pool/3, maybe_drop_data_root_from_disk_pool/3, get_chunk/2, get_chunk_data/2, get_chunk_proof/2, get_tx_data/1, get_tx_data/2, @@ -21,6 +21,7 @@ -export([enqueue_intervals/3, remove_expired_disk_pool_data_roots/0]). -include("../include/ar.hrl"). 
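Both new options introduced in the ar_config.erl hunk above land as integer fields on #config{}, and an existing packing_rate entry still parses but is ignored with a warning. A minimal, illustrative read of the new fields (default values live in the config header and are not shown in this patch):

%% Illustrative only: fetching the new worker-count options at runtime.
%% Requires the #config{} record from include/ar_config.hrl.
worker_counts() ->
    {ok, Config} = application:get_env(arweave, config),
    {Config#config.packing_workers, Config#config.replica_2_9_workers}.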
+-include("../include/ar_sup.hrl"). -include("../include/ar_consensus.hrl"). -include("../include/ar_config.hrl"). -include("../include/ar_poa.hrl"). @@ -34,6 +35,12 @@ -define(COLLECT_SYNC_INTERVALS_FREQUENCY_MS, 300_000). -endif. +-ifdef(AR_TEST). +-define(DEVICE_LOCK_WAIT, 100). +-else. +-define(DEVICE_LOCK_WAIT, 5_000). +-endif. + %%%=================================================================== %%% Public interface. %%%=================================================================== @@ -44,6 +51,30 @@ name(StoreID) -> start_link(Name, StoreID) -> gen_server:start_link({local, Name}, ?MODULE, StoreID, []). +%% @doc Register the workers that will be monitored by ar_data_sync_sup.erl. +register_workers() -> + {ok, Config} = application:get_env(arweave, config), + StorageModuleWorkers = lists:map( + fun(StorageModule) -> + StoreID = ar_storage_module:id(StorageModule), + StoreLabel = ar_storage_module:label(StorageModule), + Name = list_to_atom("ar_data_sync_" ++ StoreLabel), + ?CHILD_WITH_ARGS(ar_data_sync, worker, Name, [Name, {StoreID, none}]) + end, + Config#config.storage_modules + ), + DefaultStorageModuleWorker = ?CHILD_WITH_ARGS(ar_data_sync, worker, + ar_data_sync_default, [ar_data_sync_default, {"default", none}]), + RepackInPlaceWorkers = lists:map( + fun({StorageModule, TargetPacking}) -> + StoreID = ar_storage_module:id(StorageModule), + Name = ar_data_sync:name(StoreID), + ?CHILD_WITH_ARGS(ar_data_sync, worker, Name, [Name, {StoreID, TargetPacking}]) + end, + Config#config.repack_in_place_storage_modules + ), + StorageModuleWorkers ++ [DefaultStorageModuleWorker] ++ RepackInPlaceWorkers. + %% @doc Notify the server the node has joined the network on the given block index. join(RecentBI) -> gen_server:cast(ar_data_sync_default, {join, RecentBI}). @@ -52,9 +83,9 @@ join(RecentBI) -> add_tip_block(BlockTXPairs, RecentBI) -> gen_server:cast(ar_data_sync_default, {add_tip_block, BlockTXPairs, RecentBI}). -invalidate_bad_data_record(Start, End, StoreID, Case) -> +invalidate_bad_data_record(AbsoluteEndOffset, ChunkSize, StoreID, Case) -> gen_server:cast(name(StoreID), {invalidate_bad_data_record, - {Start, End, StoreID, Case}}). + {AbsoluteEndOffset, ChunkSize, StoreID, Case}}). %% @doc The condition which is true if the chunk is too small compared to the proof. %% Small chunks make syncing slower and increase space amplification. 
A small chunk @@ -405,13 +436,6 @@ get_chunk(Offset, #{ packing := Packing } = Options) -> end, case IsRecorded of {{true, StoredPacking}, StoreID} -> - ?LOG_DEBUG([{event, get_chunk}, {offset, Offset}, - {request_origin, RequestOrigin}, - {pack, Pack}, - {options, Options}, - {packing, ar_serialize:encode_packing(Packing, true)}, - {stored_packing, ar_serialize:encode_packing(StoredPacking, true)}, - {store_id, StoreID}]), get_chunk(Offset, SeekOffset, Pack, Packing, StoredPacking, StoreID, RequestOrigin); {true, StoreID} -> @@ -697,7 +721,8 @@ init({"default" = StoreID, _}) -> weave_size = maps:get(weave_size, StateMap), disk_pool_cursor = first, disk_pool_threshold = DiskPoolThreshold, - store_id = StoreID + store_id = StoreID, + sync_status = off }, timer:apply_interval(?REMOVE_EXPIRED_DATA_ROOTS_FREQUENCY_MS, ?MODULE, remove_expired_disk_pool_data_roots, []), @@ -737,15 +762,25 @@ init({StoreID, RepackInPlacePacking}) -> none -> gen_server:cast(self(), process_store_chunk_queue), {RangeStart, RangeEnd} = ar_storage_module:get_range(StoreID), + SyncStatus = case ar_data_sync_worker_master:is_syncing_enabled() of + true -> paused; + false -> off + end, State2 = State#sync_data_state{ store_id = StoreID, range_start = RangeStart, range_end = RangeEnd, - packing = ar_storage_module:get_packing(StoreID) + packing = ar_storage_module:get_packing(StoreID), + sync_status = SyncStatus }, - {ok, may_be_start_syncing(State2)}; + gen_server:cast(self(), sync_intervals), + gen_server:cast(self(), sync_data), + {ok, State2}; _ -> - {ok, State} + State2 = State#sync_data_state{ + sync_status = off + }, + {ok, State2} end. handle_cast({move_data_root_index, Cursor, N}, State) -> @@ -847,91 +882,34 @@ handle_cast({add_tip_block, BlockTXPairs, BI}, State) -> {noreply, State2}; handle_cast(sync_data, State) -> - #sync_data_state{ store_id = StoreID, range_start = RangeStart, range_end = RangeEnd, - disk_pool_threshold = DiskPoolThreshold } = State, - %% See if any of StoreID's unsynced intervals can be found in the "default" - %% storage_module - Intervals = get_unsynced_intervals_from_other_storage_modules( - StoreID, "default", RangeStart, min(RangeEnd, DiskPoolThreshold)), - gen_server:cast(self(), sync_data2), - %% Find all storage_modules that might include the target chunks (e.g. neighboring - %% storage_modules with an overlap, or unpacked copies used for packing, etc...) 
- OtherStorageModules = [ar_storage_module:id(Module) - || Module <- ar_storage_module:get_all(RangeStart, RangeEnd), - ar_storage_module:id(Module) /= StoreID], - {noreply, State#sync_data_state{ - unsynced_intervals_from_other_storage_modules = Intervals, - other_storage_modules_with_unsynced_intervals = OtherStorageModules }}; - -%% @doc No unsynced overlap intervals, proceed with syncing -handle_cast(sync_data2, #sync_data_state{ - unsynced_intervals_from_other_storage_modules = [], - other_storage_modules_with_unsynced_intervals = [] } = State) -> - ar_util:cast_after(2000, self(), collect_peer_intervals), - {noreply, State}; -%% @doc Check to see if a neighboring storage_module may have already synced one of our -%% unsynced intervals -handle_cast(sync_data2, #sync_data_state{ - store_id = StoreID, range_start = RangeStart, range_end = RangeEnd, - unsynced_intervals_from_other_storage_modules = [], - other_storage_modules_with_unsynced_intervals = [OtherStoreID | OtherStoreIDs] - } = State) -> - Intervals = - case ar_storage_module:get_packing(OtherStoreID) of - {replica_2_9, _} when ?BLOCK_2_9_SYNCING -> - %% Do not unpack the 2.9 data by default, finding unpacked data - %% may be cheaper. - []; - _ -> - get_unsynced_intervals_from_other_storage_modules(StoreID, OtherStoreID, - RangeStart, RangeEnd) - end, - gen_server:cast(self(), sync_data2), - {noreply, State#sync_data_state{ - unsynced_intervals_from_other_storage_modules = Intervals, - other_storage_modules_with_unsynced_intervals = OtherStoreIDs }}; -%% @doc Read an unsynced interval from the disk of a neighboring storage_module -handle_cast(sync_data2, #sync_data_state{ - store_id = StoreID, - unsynced_intervals_from_other_storage_modules = - [{OtherStoreID, {Start, End}} | Intervals] - } = State) -> - TaskRef = make_ref(), - State2 = - case ar_data_sync_worker_master:read_range(Start, End, OtherStoreID, StoreID, false, - self(), TaskRef) of - true -> - State#sync_data_state{ - unsynced_intervals_from_other_storage_modules = Intervals }; - false -> - State - end, - ar_util:cast_after(50, self(), sync_data2), - {noreply, State2}; - -handle_cast({invalidate_bad_data_record, Args}, State) -> - invalidate_bad_data_record(Args), - {noreply, State}; - -handle_cast({pack_and_store_chunk, Args} = Cast, - #sync_data_state{ store_id = StoreID } = State) -> - case is_disk_space_sufficient(StoreID) of - true -> - pack_and_store_chunk(Args, State); + #sync_data_state{ store_id = StoreID } = State, + Status = ar_device_lock:acquire_lock(sync, StoreID, State#sync_data_state.sync_status), + State2 = State#sync_data_state{ sync_status = Status }, + State3 = case Status of + active -> + do_sync_data(State2); + paused -> + ar_util:cast_after(?DEVICE_LOCK_WAIT, self(), sync_data), + State2; _ -> - ar_util:cast_after(30000, self(), Cast), - {noreply, State} - end; + State2 + end, + {noreply, State3}; -handle_cast({store_chunk, ChunkArgs, Args} = Cast, - #sync_data_state{ store_id = StoreID } = State) -> - case is_disk_space_sufficient(StoreID) of - true -> - {noreply, store_chunk(ChunkArgs, Args, State)}; +handle_cast(sync_data2, State) -> + #sync_data_state{ store_id = StoreID } = State, + Status = ar_device_lock:acquire_lock(sync, StoreID, State#sync_data_state.sync_status), + State2 = State#sync_data_state{ sync_status = Status }, + State3 = case Status of + active -> + do_sync_data2(State2); + paused -> + ar_util:cast_after(?DEVICE_LOCK_WAIT, self(), sync_data2), + State2; _ -> - ar_util:cast_after(30000, self(), Cast), - {noreply, State} - 
end; + State2 + end, + {noreply, State3}; %% Schedule syncing of the unsynced intervals. Choose a peer for each of the intervals. %% There are two message payloads: @@ -949,9 +927,6 @@ handle_cast(collect_peer_intervals, State) -> {noreply, State}; handle_cast({collect_peer_intervals, Start, End}, State) when Start >= End -> - #sync_data_state{ store_id = StoreID } = State, - ?LOG_DEBUG([{event, collect_peer_intervals_end}, {pid, self()}, {store_id, StoreID}, - {range_end, End}]), %% We've finished collecting intervals for the whole storage_module range. Schedule %% the collection process to restart in ?COLLECT_SYNC_INTERVALS_FREQUENCY_MS and %% clear the all_peers_intervals cache so we can start fresh and requery peers for @@ -1024,6 +999,7 @@ handle_cast({collect_peer_intervals, Start, End}, State) -> true -> ar_util:cast_after(500, self(), {collect_peer_intervals, Start, End}); false -> + %% All checks have passed, find and enqueue intervals for one %% All checks have passed, find and enqueue intervals for one %% sync bucket worth of chunks starting at offset Start ar_peer_intervals:fetch( @@ -1082,66 +1058,42 @@ handle_cast({enqueue_intervals, Intervals}, State) -> sync_intervals_queue_intervals = QIntervals2 }}; handle_cast(sync_intervals, State) -> - #sync_data_state{ sync_intervals_queue = Q, - sync_intervals_queue_intervals = QIntervals, store_id = StoreID } = State, - IsQueueEmpty = - case gb_sets:is_empty(Q) of - true -> - ar_util:cast_after(500, self(), sync_intervals), - true; - false -> - false - end, - IsDiskSpaceSufficient = - case IsQueueEmpty of - true -> - false; - false -> - case is_disk_space_sufficient(StoreID) of - false -> - ar_util:cast_after(30000, self(), sync_intervals), - false; - true -> - true - end - end, - IsChunkCacheFull = - case IsDiskSpaceSufficient of - false -> - true; - true -> - case is_chunk_cache_full() of - true -> - ar_util:cast_after(1000, self(), sync_intervals), - true; - false -> - false - end - end, - AreSyncWorkersBusy = - case IsChunkCacheFull of - true -> - true; - false -> - case ar_data_sync_worker_master:ready_for_work() of - false -> - ar_util:cast_after(200, self(), sync_intervals), - true; - true -> - false - end - end, - case AreSyncWorkersBusy of + #sync_data_state{ store_id = StoreID } = State, + Status = ar_device_lock:acquire_lock(sync, StoreID, State#sync_data_state.sync_status), + State2 = State#sync_data_state{ sync_status = Status }, + State3 = case Status of + active -> + do_sync_intervals(State2); + paused -> + ar_util:cast_after(?DEVICE_LOCK_WAIT, self(), sync_intervals), + State2; + _ -> + State2 + end, + {noreply, State3}; + +handle_cast({invalidate_bad_data_record, Args}, State) -> + invalidate_bad_data_record(Args), + {noreply, State}; + +handle_cast({pack_and_store_chunk, Args} = Cast, + #sync_data_state{ store_id = StoreID } = State) -> + case is_disk_space_sufficient(StoreID) of true -> - {noreply, State}; - false -> - gen_server:cast(self(), sync_intervals), - {{Start, End, Peer}, Q2} = gb_sets:take_smallest(Q), - I2 = ar_intervals:delete(QIntervals, End, Start), - gen_server:cast(ar_data_sync_worker_master, - {sync_range, {Start, End, Peer, StoreID}}), - {noreply, State#sync_data_state{ sync_intervals_queue = Q2, - sync_intervals_queue_intervals = I2 }} + pack_and_store_chunk(Args, State); + _ -> + ar_util:cast_after(30000, self(), Cast), + {noreply, State} + end; + +handle_cast({store_chunk, ChunkArgs, Args} = Cast, + #sync_data_state{ store_id = StoreID } = State) -> + case is_disk_space_sufficient(StoreID) of 
+ true -> + {noreply, store_chunk(ChunkArgs, Args, State)}; + _ -> + ar_util:cast_after(30000, self(), Cast), + {noreply, State} end; handle_cast({store_fetched_chunk, Peer, Byte, Proof} = Cast, State) -> @@ -1362,23 +1314,6 @@ handle_cast(store_sync_state, State) -> handle_cast({remove_recently_processed_disk_pool_offset, Offset, ChunkDataKey}, State) -> {noreply, remove_recently_processed_disk_pool_offset(Offset, ChunkDataKey, State)}; -handle_cast({request_default_unpacked_packing, Cursor, RightBound}, State) -> - case ar_sync_record:get_next_synced_interval(Cursor, RightBound, unpacked, ar_data_sync, - "default") of - not_found -> - ok; - {End, Start} when End - Start < ?DATA_CHUNK_SIZE, - End =< ?STRICT_DATA_SPLIT_THRESHOLD -> - gen_server:cast(ar_data_sync_default, {request_default_unpacked_packing, End, - RightBound}); - {End, Start} -> - gen_server:cast(ar_data_sync_default, {read_range, {Start, End, "default", - "default", true}}), - gen_server:cast(ar_data_sync_default, {request_default_unpacked_packing, End, - RightBound}) - end, - {noreply, State}; - handle_cast(Cast, State) -> ?LOG_WARNING([{event, unhandled_cast}, {module, ?MODULE}, {cast, Cast}]), {noreply, State}. @@ -1391,12 +1326,6 @@ handle_call(Request, _From, State) -> ?LOG_WARNING([{event, unhandled_call}, {module, ?MODULE}, {request, Request}]), {reply, ok, State}. -handle_info({event, node_state, {initialized, _B}}, - #sync_data_state{ store_id = "default" } = State) -> - {noreply, State}; -handle_info({event, node_state, {initialized, _B}}, State) -> - {noreply, may_be_start_syncing(State)}; - handle_info({event, node_state, {search_space_upper_bound, Bound}}, State) -> {noreply, State#sync_data_state{ disk_pool_threshold = Bound }}; @@ -1429,9 +1358,6 @@ handle_info({chunk, {packed, Offset, ChunkArgs}}, State) -> handle_info({chunk, _}, State) -> {noreply, State}; -handle_info({ar_data_sync_worker_master_read_range_task_complete, _Ref}, State) -> - {noreply, State}; - handle_info({event, disksup, {remaining_disk_space, StoreID, false, Percentage, _Bytes}}, #sync_data_state{ store_id = StoreID } = State) -> case Percentage < 0.01 of @@ -1535,10 +1461,147 @@ terminate(Reason, #sync_data_state{ store_id = StoreID } = State) -> log_chunk_error(Event, ExtraLogData) -> ?LOG_ERROR([{event, Event}, {tags, [solution_proofs]} | ExtraLogData]). -log_chunk_error(miner, Event, ExtraLogData) -> - log_chunk_error(Event, [{request_origin, miner} | ExtraLogData]); -log_chunk_error(_RequestOrigin, _, _) -> - ok. +log_chunk_error(http, _, _) -> + ok; +log_chunk_error(tx_data, _, _) -> + ok; +log_chunk_error(RequestOrigin, Event, ExtraLogData) -> + log_chunk_error(Event, [{request_origin, RequestOrigin} | ExtraLogData]). 
+ +do_sync_intervals(State) -> + #sync_data_state{ sync_intervals_queue = Q, + sync_intervals_queue_intervals = QIntervals, store_id = StoreID } = State, + IsQueueEmpty = + case gb_sets:is_empty(Q) of + true -> + ar_util:cast_after(500, self(), sync_intervals), + true; + false -> + false + end, + IsDiskSpaceSufficient = + case IsQueueEmpty of + true -> + false; + false -> + case is_disk_space_sufficient(StoreID) of + false -> + ar_util:cast_after(30000, self(), sync_intervals), + false; + true -> + true + end + end, + IsChunkCacheFull = + case IsDiskSpaceSufficient of + false -> + true; + true -> + case is_chunk_cache_full() of + true -> + ar_util:cast_after(1000, self(), sync_intervals), + true; + false -> + false + end + end, + AreSyncWorkersBusy = + case IsChunkCacheFull of + true -> + true; + false -> + case ar_data_sync_worker_master:ready_for_work() of + false -> + ar_util:cast_after(200, self(), sync_intervals), + true; + true -> + false + end + end, + case AreSyncWorkersBusy of + true -> + State; + false -> + gen_server:cast(self(), sync_intervals), + {{Start, End, Peer}, Q2} = gb_sets:take_smallest(Q), + I2 = ar_intervals:delete(QIntervals, End, Start), + gen_server:cast(ar_data_sync_worker_master, + {sync_range, {Start, End, Peer, StoreID}}), + State#sync_data_state{ sync_intervals_queue = Q2, + sync_intervals_queue_intervals = I2 } + end. + +do_sync_data(State) -> + #sync_data_state{ store_id = StoreID, range_start = RangeStart, range_end = RangeEnd, + disk_pool_threshold = DiskPoolThreshold } = State, + %% See if any of StoreID's unsynced intervals can be found in the "default" + %% storage_module + Intervals = get_unsynced_intervals_from_other_storage_modules( + StoreID, "default", RangeStart, min(RangeEnd, DiskPoolThreshold)), + gen_server:cast(self(), sync_data2), + %% Find all storage_modules that might include the target chunks (e.g. neighboring + %% storage_modules with an overlap, or unpacked copies used for packing, etc...) + OtherStorageModules = [ar_storage_module:id(Module) + || Module <- ar_storage_module:get_all(RangeStart, RangeEnd), + ar_storage_module:id(Module) /= StoreID], + ?LOG_INFO([{event, sync_data}, {store_id, StoreID}, {range_start, RangeStart}, + {range_end, RangeEnd}, {disk_pool_threshold, DiskPoolThreshold}, + {default_intervals, length(Intervals)}, + {other_storage_modules, length(OtherStorageModules)}]), + State#sync_data_state{ + unsynced_intervals_from_other_storage_modules = Intervals, + other_storage_modules_with_unsynced_intervals = OtherStorageModules + }. 
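The sync_data, sync_data2 and sync_intervals handlers above (and the repack casts in ar_chunk_storage earlier in this patch) all gate their work behind the same ar_device_lock:acquire_lock/3 call: run when the returned status is active, re-queue the identical cast after ?DEVICE_LOCK_WAIT when paused, and leave the work unscheduled otherwise. A generic skeleton of that pattern, with do_work/2 as a placeholder for the real handler body:

%% Sketch of the device-lock gating used by the handlers above. LockType is one
%% of the lock kinds appearing in this patch (sync, repack, prepare); do_work/2
%% is a placeholder. ?DEVICE_LOCK_WAIT is the retry interval defined near the
%% top of ar_data_sync.erl in this patch.
device_gated(WorkMsg, LockType, StoreID, PrevStatus, State) ->
    Status = ar_device_lock:acquire_lock(LockType, StoreID, PrevStatus),
    case Status of
        active ->
            {Status, do_work(WorkMsg, State)};
        paused ->
            %% Another storage module currently owns the device; retry later.
            ar_util:cast_after(?DEVICE_LOCK_WAIT, self(), WorkMsg),
            {Status, State};
        _ ->
            %% e.g. off: drop the work.
            {Status, State}
    end.

do_work(_WorkMsg, State) ->
    State.

The caller stores the returned status back into its state, as the three handlers above do, so the next acquire_lock/3 call can report a status transition.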
+ +%% @doc No unsynced overlap intervals, proceed with syncing +do_sync_data2(#sync_data_state{ + unsynced_intervals_from_other_storage_modules = [], + other_storage_modules_with_unsynced_intervals = [] } = State) -> + #sync_data_state{ store_id = StoreID, + range_start = RangeStart, range_end = RangeEnd } = State, + ?LOG_INFO([{event, sync_data_complete}, {store_id, StoreID}, {range_start, RangeStart}, + {range_end, RangeEnd}]), + ar_util:cast_after(2000, self(), collect_peer_intervals), + State; +%% @doc Check to see if a neighboring storage_module may have already synced one of our +%% unsynced intervals +do_sync_data2(#sync_data_state{ + store_id = StoreID, range_start = RangeStart, range_end = RangeEnd, + unsynced_intervals_from_other_storage_modules = [], + other_storage_modules_with_unsynced_intervals = [OtherStoreID | OtherStoreIDs] + } = State) -> + Intervals = + case ar_storage_module:get_packing(OtherStoreID) of + {replica_2_9, _} when ?BLOCK_2_9_SYNCING -> + %% Do not unpack the 2.9 data by default, finding unpacked data + %% may be cheaper. + []; + _ -> + get_unsynced_intervals_from_other_storage_modules(StoreID, OtherStoreID, + RangeStart, RangeEnd) + end, + % ?LOG_DEBUG([{event, sync_data2}, {store_id, StoreID}, {intervals, Intervals}]), + gen_server:cast(self(), sync_data2), + State#sync_data_state{ + unsynced_intervals_from_other_storage_modules = Intervals, + other_storage_modules_with_unsynced_intervals = OtherStoreIDs + }; +%% @doc Read an unsynced interval from the disk of a neighboring storage_module +do_sync_data2(#sync_data_state{ + store_id = StoreID, + unsynced_intervals_from_other_storage_modules = + [{OtherStoreID, {Start, End}} | Intervals] + } = State) -> + State2 = + case ar_chunk_copy:read_range(Start, End, OtherStoreID, StoreID) of + true -> + State#sync_data_state{ + unsynced_intervals_from_other_storage_modules = Intervals }; + false -> + State + end, + ar_util:cast_after(50, self(), sync_data2), + State2. remove_expired_disk_pool_data_roots() -> Now = os:system_time(microsecond), @@ -1643,8 +1706,8 @@ get_chunk(Offset, SeekOffset, Pack, Packing, StoredPacking, StoreID, RequestOrig {store_id, StoreID}, {expected_chunk_id, ar_util:encode(ChunkID)}, {chunk_id, ar_util:encode(ComputedChunkID)}]), - invalidate_bad_data_record({AbsoluteOffset - ChunkSize, - AbsoluteOffset, StoreID, 4}), + invalidate_bad_data_record({AbsoluteOffset, ChunkSize, + StoreID, get_chunk_invalid_id}), {error, chunk_not_found} end end @@ -1729,7 +1792,8 @@ read_chunk_with_metadata( {modules_covering_seek_offset, ModuleIDs}, {chunk_data_key, ar_util:encode(ChunkDataKey)}, {read_fun, ReadFun}]), - invalidate_bad_data_record({SeekOffset - 1, AbsoluteOffset, StoreID, 1}), + invalidate_bad_data_record({AbsoluteOffset, ChunkSize, StoreID, + failed_to_read_chunk_data_path}), {error, chunk_not_found}; {error, Error} -> log_chunk_error(failed_to_read_chunk, @@ -1763,41 +1827,70 @@ read_chunk_with_metadata( end end. -invalidate_bad_data_record({Start, End, StoreID, Case}) -> +invalidate_bad_data_record({AbsoluteEndOffset, ChunkSize, StoreID, Type}) -> [{_, T}] = ets:lookup(ar_data_sync_state, disk_pool_threshold), - case End > T of + case AbsoluteEndOffset > T of true -> - %% Do not invalidate fresh records - a reorg may be in progress. - ok; + [{_, T}] = ets:lookup(ar_data_sync_state, disk_pool_threshold), + case AbsoluteEndOffset > T of + true -> + %% Do not invalidate fresh records - a reorg may be in progress. 
+ ok; + false -> + invalidate_bad_data_record2({AbsoluteEndOffset, ChunkSize, StoreID, Type}) + end; false -> - PaddedEnd = ar_block:get_chunk_padded_offset(End), - PaddedStart = ar_block:get_chunk_padded_offset(Start), - PaddedStart2 = - case PaddedStart == PaddedEnd of - true -> - PaddedEnd - ?DATA_CHUNK_SIZE; - false -> - PaddedStart - end, - ?LOG_WARNING([{event, invalidating_bad_data_record}, {type, Case}, - {range_start, PaddedStart2}, {range_end, PaddedEnd}, - {store_id, StoreID}]), - case ar_sync_record:delete(PaddedEnd, PaddedStart2, ar_data_sync, StoreID) of + invalidate_bad_data_record2({AbsoluteEndOffset, ChunkSize, StoreID, Type}) + end. + +invalidate_bad_data_record2({AbsoluteEndOffset, ChunkSize, StoreID, Type}) -> + PaddedEndOffset = ar_block:get_chunk_padded_offset(AbsoluteEndOffset), + StartOffset = AbsoluteEndOffset - ChunkSize, + ?LOG_WARNING([{event, invalidating_bad_data_record}, {type, Type}, + {range_start, StartOffset}, {range_end, PaddedEndOffset}, + {store_id, StoreID}]), + case remove_invalid_sync_records(PaddedEndOffset, StartOffset, StoreID) of + ok -> + ar_sync_record:add(PaddedEndOffset, StartOffset, invalid_chunks, StoreID), + case delete_chunk_metadata(AbsoluteEndOffset, StoreID) of ok -> - ar_sync_record:add(PaddedEnd, PaddedStart2, invalid_chunks, StoreID), - case delete_chunk_metadata(End, StoreID) of - ok -> - ok; - Error2 -> - ?LOG_WARNING([{event, failed_to_remove_chunks_index_key}, - {absolute_end_offset, End}, - {error, io_lib:format("~p", [Error2])}]) - end; - Error -> - ?LOG_WARNING([{event, failed_to_remove_sync_record_range}, - {range_end, PaddedEnd}, {range_start, PaddedStart2}, - {error, io_lib:format("~p", [Error])}]) - end + ok; + Error2 -> + ?LOG_WARNING([{event, failed_to_remove_chunks_index_key}, + {absolute_end_offset, AbsoluteEndOffset}, + {error, io_lib:format("~p", [Error2])}]) + end; + Error -> + ?LOG_WARNING([{event, failed_to_remove_sync_record_range}, + {range_end, PaddedEndOffset}, {range_start, StartOffset}, + {error, io_lib:format("~p", [Error])}]) + end. + +remove_invalid_sync_records(PaddedEndOffset, StartOffset, StoreID) -> + Remove1 = ar_sync_record:delete(PaddedEndOffset, StartOffset, ar_data_sync, StoreID), + IsSmallChunkBeforeThreshold = PaddedEndOffset - StartOffset < ?DATA_CHUNK_SIZE, + Remove2 = + case {Remove1, IsSmallChunkBeforeThreshold} of + {ok, false} -> + ar_sync_record:delete(PaddedEndOffset, StartOffset, + ar_chunk_storage, StoreID); + _ -> + Remove1 + end, + Remove3 = + case {Remove2, IsSmallChunkBeforeThreshold} of + {ok, false} -> + ar_sync_record:delete(PaddedEndOffset, StartOffset, + ar_chunk_storage_replica_2_9_1_entropy, StoreID); + _ -> + Remove2 + end, + case {Remove3, IsSmallChunkBeforeThreshold} of + {ok, false} -> + ar_sync_record:delete(PaddedEndOffset, StartOffset, + ar_chunk_storage_replica_2_9_1_unpacked, StoreID); + _ -> + Remove3 end. 
validate_fetched_chunk(Args) -> @@ -1821,15 +1914,16 @@ validate_fetched_chunk(Args) -> false -> log_chunk_error(RequestOrigin, failed_to_validate_chunk_proofs, [{absolute_end_offset, Offset}, {store_id, StoreID}]), - StartOffset = Offset - ChunkSize, - invalidate_bad_data_record({StartOffset, Offset, StoreID, 2}), + invalidate_bad_data_record({Offset, ChunkSize, StoreID, + failed_to_validate_chunk_proofs}), false end; {_BlockStart, _BlockEnd, TXRoot2} -> log_chunk_error(stored_chunk_invalid_tx_root, [{end_offset, Offset}, {tx_root, ar_util:encode(TXRoot2)}, {stored_tx_root, ar_util:encode(TXRoot)}, {store_id, StoreID}]), - invalidate_bad_data_record({Offset - ChunkSize, Offset, StoreID, 3}), + invalidate_bad_data_record({Offset, ChunkSize, StoreID, + stored_chunk_invalid_tx_root}), false end end. @@ -2133,26 +2227,6 @@ read_data_sync_state() -> disk_pool_threshold => 0 } end. -may_be_start_syncing(#sync_data_state{ started_syncing = StartedSyncing } = State) -> - case ar_node:is_joined() of - false -> - State; - true -> - case StartedSyncing of - true -> - State; - false -> - case ar_data_sync_worker_master:is_syncing_enabled() of - true -> - gen_server:cast(self(), sync_intervals), - gen_server:cast(self(), sync_data), - State#sync_data_state{ started_syncing = true }; - false -> - State - end - end - end. - recalculate_disk_pool_size(DataRootMap, State) -> #sync_data_state{ disk_pool_chunks_index = Index } = State, DataRootMap2 = maps:map(fun(_DataRootKey, {_Size, Timestamp, TXIDSet}) -> @@ -2492,23 +2566,23 @@ store_sync_state(_State) -> %% @doc Look to StoreID to find data that TargetStoreID is missing. %% Args: -%% TargetStoreID - The ID of the storage module to sync to (this module is missing data) -%% StoreID - The ID of the storage module to sync from (this module might have the data) +%% StoreID - The ID of the storage module to sync to (this module is missing data) +%% OtherStoreID - The ID of the storage module to sync from (this module might have the data) %% RangeStart - The start offset of the range to check %% RangeEnd - The end offset of the range to check -get_unsynced_intervals_from_other_storage_modules(TargetStoreID, StoreID, RangeStart, +get_unsynced_intervals_from_other_storage_modules(StoreID, OtherStoreID, RangeStart, RangeEnd) -> - get_unsynced_intervals_from_other_storage_modules(TargetStoreID, StoreID, RangeStart, + get_unsynced_intervals_from_other_storage_modules(StoreID, OtherStoreID, RangeStart, RangeEnd, []). 
-get_unsynced_intervals_from_other_storage_modules(_TargetStoreID, _StoreID, RangeStart, +get_unsynced_intervals_from_other_storage_modules(_StoreID, _OtherStoreID, RangeStart, RangeEnd, Intervals) when RangeStart >= RangeEnd -> Intervals; -get_unsynced_intervals_from_other_storage_modules(TargetStoreID, StoreID, RangeStart, +get_unsynced_intervals_from_other_storage_modules(StoreID, OtherStoreID, RangeStart, RangeEnd, Intervals) -> FindNextMissing = case ar_sync_record:get_next_synced_interval(RangeStart, RangeEnd, ar_data_sync, - TargetStoreID) of + StoreID) of not_found -> {request, {RangeStart, RangeEnd}}; {End, Start} when Start =< RangeStart -> @@ -2518,18 +2592,18 @@ get_unsynced_intervals_from_other_storage_modules(TargetStoreID, StoreID, RangeS end, case FindNextMissing of {skip, End2} -> - get_unsynced_intervals_from_other_storage_modules(TargetStoreID, StoreID, End2, + get_unsynced_intervals_from_other_storage_modules(StoreID, OtherStoreID, End2, RangeEnd, Intervals); {request, {Cursor, RightBound}} -> case ar_sync_record:get_next_synced_interval(Cursor, RightBound, ar_data_sync, - StoreID) of + OtherStoreID) of not_found -> - get_unsynced_intervals_from_other_storage_modules(TargetStoreID, StoreID, + get_unsynced_intervals_from_other_storage_modules(StoreID, OtherStoreID, RightBound, RangeEnd, Intervals); {End2, Start2} -> Start3 = max(Start2, Cursor), - Intervals2 = [{StoreID, {Start3, End2}} | Intervals], - get_unsynced_intervals_from_other_storage_modules(TargetStoreID, StoreID, + Intervals2 = [{OtherStoreID, {Start3, End2}} | Intervals], + get_unsynced_intervals_from_other_storage_modules(StoreID, OtherStoreID, End2, RangeEnd, Intervals2) end end. @@ -2594,8 +2668,6 @@ unpack_fetched_chunk(Cast, AbsoluteOffset, ChunkArgs, Args, State) -> {noreply, State}; false -> ar_packing_server:request_unpack(AbsoluteOffset, ChunkArgs), - ?LOG_DEBUG([{event, requested_fetched_chunk_unpacking}, - {absolute_end_offset, AbsoluteOffset}]), ar_util:cast_after(600000, self(), {expire_unpack_fetched_chunk_request, {AbsoluteOffset, unpacked}}), @@ -2780,24 +2852,24 @@ write_not_blacklisted_chunk(Offset, ChunkDataKey, Chunk, ChunkSize, DataPath, Pa case ShouldStoreInChunkStorage of true -> PaddedOffset = ar_block:get_chunk_padded_offset(Offset), + StartPut = erlang:monotonic_time(), Result = ar_chunk_storage:put(PaddedOffset, Chunk, StoreID), + PutTime = erlang:convert_time_unit(erlang:monotonic_time() - StartPut, native, microsecond) / 1000.0, + case PutTime > 500 of + true -> + ?LOG_DEBUG([{event, chunk_put_duration_milliseconds}, {elapsed, PutTime}, {store_id, StoreID}, + {pid, self()}, {name, name(StoreID)}, {chunk_storage_name, ar_chunk_storage:name(StoreID)}, + {offset, Offset}, {absolute_offset, PaddedOffset}, + {chunk_size, ChunkSize}, {packing, ar_serialize:encode_packing(Packing, true)}]); + false -> + ok + end, case Result of {ok, NewPacking} -> case put_chunk_data(ChunkDataKey, StoreID, DataPath) of ok -> {ok, NewPacking}; Error -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, error_writing_to_chunk_data_db}, - {error, io_lib:format("~p", [Error])}, - {offset, Offset}, - {should_store_in_chunk_storage, ShouldStoreInChunkStorage}, - {chunk_data_key, ar_util:encode(ChunkDataKey)}, - {data_path_hash, ar_util:encode(crypto:hash(sha256, DataPath))}, - {chunk_size, ChunkSize}, - {packing, ar_serialize:encode_packing(Packing, true)}, - {store_id, StoreID} - ]), Error end; _ -> @@ -2808,17 +2880,6 @@ write_not_blacklisted_chunk(Offset, ChunkDataKey, Chunk, ChunkSize, 
DataPath, Pa ok -> {ok, Packing}; Error -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, error_writing_to_chunk_data_db}, - {error, io_lib:format("~p", [Error])}, - {offset, Offset}, - {should_store_in_chunk_storage, ShouldStoreInChunkStorage}, - {chunk_data_key, ar_util:encode(ChunkDataKey)}, - {data_path_hash, ar_util:encode(crypto:hash(sha256, DataPath))}, - {chunk_size, ChunkSize}, - {packing, ar_serialize:encode_packing(Packing, true)}, - {store_id, StoreID} - ]), Error end end. @@ -2836,7 +2897,7 @@ update_chunks_index(Args, State) -> update_chunks_index2(Args, State) -> {AbsoluteOffset, Offset, ChunkDataKey, TXRoot, DataRoot, TXPath, ChunkSize, Packing} = Args, - #sync_data_state{ chunks_index = ChunksIndex, store_id = StoreID } = State, + #sync_data_state{ store_id = StoreID } = State, Metadata = {ChunkDataKey, TXRoot, DataRoot, TXPath, Offset, ChunkSize}, case put_chunk_metadata(AbsoluteOffset, StoreID, Metadata) of ok -> @@ -2846,35 +2907,9 @@ update_chunks_index2(Args, State) -> ok -> ok; {error, Reason} -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, error_adding_sync_record}, - {error, io_lib:format("~p", [Reason])}, - {sync_record_id, ar_data_sync}, - {absolute_end_offset, AbsoluteOffset}, - {offset, Offset}, - {padded_offset, PaddedOffset}, - {start_offset, StartOffset}, - {chunk_data_key, ar_util:encode(ChunkDataKey)}, - {data_root, ar_util:encode(DataRoot)}, - {chunk_size, ChunkSize}, - {packing, ar_serialize:encode_packing(Packing, true)}, - {store_id, StoreID} - ]), {error, Reason} end; {error, Reason} -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, error_writing_to_chunks_index_db}, - {error, io_lib:format("~p", [Reason])}, - {absolute_end_offset, AbsoluteOffset}, - {offset, Offset}, - {db, ChunksIndex}, - {chunk_data_key, ar_util:encode(ChunkDataKey)}, - {data_root, ar_util:encode(DataRoot)}, - {chunk_size, ChunkSize}, - {packing, ar_serialize:encode_packing(Packing, true)}, - {store_id, StoreID} - ]), {error, Reason} end. @@ -2991,11 +3026,10 @@ process_store_chunk_queue(#sync_data_state{ store_chunk_queue_len = StartLen } = process_store_chunk_queue(State, StartLen). process_store_chunk_queue(#sync_data_state{ store_chunk_queue_len = 0 } = State, StartLen) -> - log_stored_chunks(State, StartLen), State; process_store_chunk_queue(State, StartLen) -> #sync_data_state{ store_chunk_queue = Q, store_chunk_queue_len = Len, - store_chunk_queue_threshold = Threshold } = State, + store_chunk_queue_threshold = Threshold, store_id = StoreID } = State, Timestamp = element(2, gb_sets:smallest(Q)), Now = os:system_time(millisecond), Threshold2 = @@ -3014,15 +3048,16 @@ process_store_chunk_queue(State, StartLen) -> orelse Now - Timestamp > ?STORE_CHUNK_QUEUE_FLUSH_TIME_THRESHOLD of true -> {{_Offset, _Timestamp, _Ref, ChunkArgs, Args}, Q2} = gb_sets:take_smallest(Q), + store_chunk2(ChunkArgs, Args, State), + decrement_chunk_cache_size(), State2 = State#sync_data_state{ store_chunk_queue = Q2, store_chunk_queue_len = Len - 1, store_chunk_queue_threshold = min(Threshold2 + 1, ?STORE_CHUNK_QUEUE_FLUSH_SIZE_THRESHOLD) }, - process_store_chunk_queue(State2); + process_store_chunk_queue(State2, StartLen); false -> - log_stored_chunks(State, StartLen), State end. 
@@ -3055,18 +3090,6 @@ store_chunk2(ChunkArgs, Args, State) -> end, case CleanRecord of {error, Reason} -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, error_deleting_sync_record}, - {error, io_lib:format("~p", [Reason])}, - {padded_offset, PaddedOffset}, - {start_offset, StartOffset}, - {should_store_in_chunk_storage, ShouldStoreInChunkStorage}, - {data_root, DataRoot}, - {data_path_hash, ar_util:encode(DataPathHash)}, - {chunk_size, ChunkSize}, - {packing, ar_serialize:encode_packing(Packing, true)}, - {store_id, StoreID} - ]), log_failed_to_store_chunk(Reason, AbsoluteOffset, Offset, DataRoot, DataPathHash, StoreID), {error, Reason}; @@ -3104,16 +3127,6 @@ store_chunk2(ChunkArgs, Args, State) -> end end. -log_stored_chunks(State, StartLen) -> - #sync_data_state{ store_chunk_queue_len = EndLen, store_id = StoreID } = State, - StoredCount = StartLen - EndLen, - case StoredCount > 0 of - true -> - ?LOG_DEBUG([{event, stored_chunks}, {count, StoredCount}, {store_id, StoreID}]); - false -> - ok - end. - log_failed_to_store_chunk(already_stored, AbsoluteOffset, Offset, DataRoot, DataPathHash, StoreID) -> ?LOG_INFO([{event, chunk_already_stored}, diff --git a/apps/arweave/src/ar_data_sync_sup.erl b/apps/arweave/src/ar_data_sync_sup.erl index f75f5374b..76b310d32 100644 --- a/apps/arweave/src/ar_data_sync_sup.erl +++ b/apps/arweave/src/ar_data_sync_sup.erl @@ -6,9 +6,6 @@ -export([init/1]). --include_lib("arweave/include/ar_sup.hrl"). --include_lib("arweave/include/ar_config.hrl"). - %%%=================================================================== %%% Public interface. %%%=================================================================== @@ -21,43 +18,8 @@ start_link() -> %% =================================================================== init([]) -> - {ok, Config} = application:get_env(arweave, config), - SyncWorkers = case ar_data_sync_worker_master:is_syncing_enabled() of - true -> - Workers = lists:map( - fun(Number) -> - Name = list_to_atom("ar_data_sync_worker_" ++ integer_to_list(Number)), - ?CHILD_WITH_ARGS(ar_data_sync_worker, worker, Name, [Name]) - end, - lists:seq(1, Config#config.sync_jobs) - ), - SyncWorkerNames = [element(1, El) || El <- Workers], - SyncWorkerMaster = ?CHILD_WITH_ARGS( - ar_data_sync_worker_master, worker, ar_data_sync_worker_master, - [SyncWorkerNames]), - Workers ++ [SyncWorkerMaster]; - false -> - [] - end, - StorageModuleWorkers = lists:map( - fun(StorageModule) -> - StoreID = ar_storage_module:id(StorageModule), - StoreLabel = ar_storage_module:label(StorageModule), - Name = list_to_atom("ar_data_sync_" ++ StoreLabel), - ?CHILD_WITH_ARGS(ar_data_sync, worker, Name, [Name, {StoreID, none}]) - end, - Config#config.storage_modules - ), - DefaultStorageModuleWorker = ?CHILD_WITH_ARGS(ar_data_sync, worker, - ar_data_sync_default, [ar_data_sync_default, {"default", none}]), - RepackInPlaceWorkers = lists:map( - fun({StorageModule, TargetPacking}) -> - StoreID = ar_storage_module:id(StorageModule), - Name = ar_data_sync:name(StoreID), - ?CHILD_WITH_ARGS(ar_data_sync, worker, Name, [Name, {StoreID, TargetPacking}]) - end, - Config#config.repack_in_place_storage_modules - ), - Children = SyncWorkers ++ StorageModuleWorkers ++ [DefaultStorageModuleWorker] - ++ RepackInPlaceWorkers, + Children = + ar_data_sync_worker_master:register_workers() ++ + ar_chunk_copy:register_workers() ++ + ar_data_sync:register_workers(), {ok, {{one_for_one, 5, 10}, Children}}. 
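The supervisor now simply concatenates whatever child specs each register_workers/0 returns (an empty list when a feature is disabled). ar_chunk_copy:register_workers/0 and ar_data_sync:register_workers/0 are not shown in this excerpt; the sketch below illustrates the assumed contract using plain OTP child-spec maps instead of the project's ?CHILD_WITH_ARGS macro, and the module and worker names in it are hypothetical.

-module(example_feature).          %% hypothetical module
-export([register_workers/0]).

register_workers() ->
    case application:get_env(arweave, example_feature_enabled, false) of
        true ->
            Name = example_feature_worker,   %% hypothetical worker name
            [#{id => Name,
               start => {example_feature_worker, start_link, [Name]},
               type => worker,
               shutdown => 5000}];
        false ->
            %% Returning [] keeps the supervisor's Children concatenation trivial.
            []
    end.

ar_data_sync_worker_master:register_workers/0, added further down in this patch, follows the same shape using ?CHILD_WITH_ARGS.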
diff --git a/apps/arweave/src/ar_data_sync_worker.erl b/apps/arweave/src/ar_data_sync_worker.erl index ea409d9d7..280fb26ba 100644 --- a/apps/arweave/src/ar_data_sync_worker.erl +++ b/apps/arweave/src/ar_data_sync_worker.erl @@ -52,7 +52,7 @@ handle_cast({read_range, Args}, State) -> recast -> ok; ReadResult -> - gen_server:cast(ar_data_sync_worker_master, + gen_server:cast(ar_chunk_copy, {task_completed, {read_range, {State#state.name, ReadResult, Args}}}) end, {noreply, State}; @@ -85,10 +85,10 @@ terminate(Reason, _State) -> %%% Private functions. %%%=================================================================== -read_range({Start, End, _OriginStoreID, _TargetStoreID, _SkipSmall, _Caller, _Ref}) +read_range({Start, End, _OriginStoreID, _TargetStoreID}) when Start >= End -> ok; -read_range({Start, End, _OriginStoreID, TargetStoreID, _SkipSmall, _Caller, _Ref} = Args) -> +read_range({Start, End, _OriginStoreID, TargetStoreID} = Args) -> case ar_data_sync:is_chunk_cache_full() of false -> case ar_data_sync:is_disk_space_sufficient(TargetStoreID) of @@ -109,11 +109,10 @@ read_range2(0, Args) -> ar_util:cast_after(1000, self(), {read_range, Args}), recast; read_range2(_MessagesRemaining, - {Start, End, _OriginStoreID, _TargetStoreID, _SkipSmall, _Caller, _TaskRef}) + {Start, End, _OriginStoreID, _TargetStoreID}) when Start >= End -> ok; -read_range2(MessagesRemaining, - {Start, End, OriginStoreID, TargetStoreID, SkipSmall, Caller, TaskRef}) -> +read_range2(MessagesRemaining, {Start, End, OriginStoreID, TargetStoreID}) -> CheckIsRecordedAlready = case ar_sync_record:is_recorded(Start + 1, ar_data_sync, TargetStoreID) of {true, _} -> @@ -123,8 +122,7 @@ read_range2(MessagesRemaining, ok; {_, Start2} -> read_range2(MessagesRemaining, - {Start2, End, OriginStoreID, TargetStoreID, SkipSmall, - Caller, TaskRef}) + {Start2, End, OriginStoreID, TargetStoreID}) end; _ -> false @@ -141,9 +139,10 @@ read_range2(MessagesRemaining, {true, Packing}; SyncRecordReply -> ?LOG_ERROR([{event, cannot_read_requested_range}, - {store_id, OriginStoreID}, + {origin_store_id, OriginStoreID}, {missing_start_offset, Start + 1}, - {reading_for_store_id, TargetStoreID}, + {end_offset, End}, + {target_store_id, TargetStoreID}, {sync_record_reply, io_lib:format("~p", [SyncRecordReply])}]) end end, @@ -165,7 +164,7 @@ read_range2(MessagesRemaining, ?OFFSET_KEY_BITSIZE - ?OFFSET_KEY_PREFIX_BITSIZE)), Start3 = ((Start div PrefixSpaceSize) + 2) * PrefixSpaceSize, read_range2(MessagesRemaining, - {Start3, End, OriginStoreID, TargetStoreID, SkipSmall, Caller, TaskRef}); + {Start3, End, OriginStoreID, TargetStoreID}); {_, {error, Reason}} -> ?LOG_ERROR([{event, failed_to_query_chunk_metadata}, {offset, Start + 1}, {reason, io_lib:format("~p", [Reason])}]); @@ -173,34 +172,20 @@ read_range2(MessagesRemaining, ok; {Packing3, {ok, _Key, {AbsoluteOffset, ChunkDataKey, TXRoot, DataRoot, TXPath, RelativeOffset, ChunkSize}}} -> - Skip = SkipSmall andalso AbsoluteOffset =< ?STRICT_DATA_SPLIT_THRESHOLD - andalso ChunkSize < ?DATA_CHUNK_SIZE, - ReadChunk = - case Skip of - true -> - skip; - false -> - ar_data_sync:read_chunk(AbsoluteOffset, ChunkDataKey, OriginStoreID) - end, + ReadChunk = ar_data_sync:read_chunk(AbsoluteOffset, ChunkDataKey, OriginStoreID), case ReadChunk of - skip -> - read_range2(MessagesRemaining, - {Start + ChunkSize, End, OriginStoreID, TargetStoreID, SkipSmall, - Caller, TaskRef}); not_found -> ar_data_sync:invalidate_bad_data_record( - Start, AbsoluteOffset, OriginStoreID, 1), + AbsoluteOffset, ChunkSize, 
OriginStoreID, read_range_chunk_not_found), read_range2(MessagesRemaining-1, - {Start + ChunkSize, End, OriginStoreID, TargetStoreID, SkipSmall, - Caller, TaskRef}); + {Start + ChunkSize, End, OriginStoreID, TargetStoreID}); {error, Error} -> ?LOG_ERROR([{event, failed_to_read_chunk}, {absolute_end_offset, AbsoluteOffset}, {chunk_data_key, ar_util:encode(ChunkDataKey)}, {reason, io_lib:format("~p", [Error])}]), read_range2(MessagesRemaining, - {Start + ChunkSize, End, OriginStoreID, TargetStoreID, SkipSmall, - Caller, TaskRef}); + {Start + ChunkSize, End, OriginStoreID, TargetStoreID}); {ok, {Chunk, DataPath}} -> case ar_sync_record:is_recorded(AbsoluteOffset, ar_data_sync, OriginStoreID) of @@ -219,21 +204,18 @@ read_range2(MessagesRemaining, gen_server:cast(ar_data_sync:name(TargetStoreID), {pack_and_store_chunk, Args}), read_range2(MessagesRemaining-1, - {Start + ChunkSize, End, OriginStoreID, TargetStoreID, - SkipSmall, Caller, TaskRef}); + {Start + ChunkSize, End, OriginStoreID, TargetStoreID}); {true, _DifferentPacking} -> %% Unlucky timing - the chunk should have been repacked %% in the meantime. read_range2(MessagesRemaining, - {Start, End, OriginStoreID, TargetStoreID, SkipSmall, - Caller, TaskRef}); + {Start, End, OriginStoreID, TargetStoreID}); Reply -> ?LOG_ERROR([{event, chunk_record_not_found}, {absolute_end_offset, AbsoluteOffset}, {ar_sync_record_reply, io_lib:format("~p", [Reply])}]), read_range2(MessagesRemaining, - {Start + ChunkSize, End, OriginStoreID, TargetStoreID, - SkipSmall, Caller, TaskRef}) + {Start + ChunkSize, End, OriginStoreID, TargetStoreID}) end end end. @@ -286,8 +268,7 @@ sync_range({Start, End, Peer, TargetStoreID, RetryCount} = Args, State) -> %% chunks will be then requested later. Start3 = ar_block:get_chunk_padded_offset( Start2 + byte_size(Chunk)) + 1, - Label = ar_storage_module:label_by_id(TargetStoreID), - gen_server:cast(list_to_atom("ar_data_sync_" ++ Label), + gen_server:cast(ar_data_sync:name(TargetStoreID), {store_fetched_chunk, Peer, Start2 - 1, Proof}), ar_data_sync:increment_chunk_cache_size(), sync_range({Start3, End, Peer, TargetStoreID, RetryCount}, State); diff --git a/apps/arweave/src/ar_data_sync_worker_master.erl b/apps/arweave/src/ar_data_sync_worker_master.erl index b57ee2121..fc61d8a37 100644 --- a/apps/arweave/src/ar_data_sync_worker_master.erl +++ b/apps/arweave/src/ar_data_sync_worker_master.erl @@ -4,11 +4,12 @@ -behaviour(gen_server). --export([start_link/1, is_syncing_enabled/0, ready_for_work/0, read_range/7]). +-export([start_link/1, register_workers/0, is_syncing_enabled/0, ready_for_work/0]). -export([init/1, handle_cast/2, handle_call/3, handle_info/2, terminate/2]). -include_lib("arweave/include/ar.hrl"). +-include_lib("arweave/include/ar_sup.hrl"). -include_lib("arweave/include/ar_consensus.hrl"). -include_lib("arweave/include/ar_config.hrl"). -include_lib("arweave/include/ar_data_sync.hrl"). @@ -16,7 +17,6 @@ -include_lib("eunit/include/eunit.hrl"). -define(REBALANCE_FREQUENCY_MS, 10*1000). --define(READ_RANGE_CHUNKS, 10). -define(MIN_MAX_ACTIVE, 8). -define(MIN_PEER_QUEUE, 20). @@ -47,6 +47,32 @@ start_link(Workers) -> gen_server:start_link({local, ?MODULE}, ?MODULE, Workers, []). +register_workers() -> + case is_syncing_enabled() of + true -> + {Workers, WorkerNames} = register_sync_workers(), + WorkerMaster = ?CHILD_WITH_ARGS( + ar_data_sync_worker_master, worker, ar_data_sync_worker_master, + [WorkerNames]), + Workers ++ [WorkerMaster]; + false -> + [] + end. 
+ + +register_sync_workers() -> + {ok, Config} = application:get_env(arweave, config), + {Workers, WorkerNames} = lists:foldl( + fun(Number, {AccWorkers, AccWorkerNames}) -> + Name = list_to_atom("ar_data_sync_worker_" ++ integer_to_list(Number)), + Worker = ?CHILD_WITH_ARGS(ar_data_sync_worker, worker, Name, [Name]), + {[Worker | AccWorkers], [Name | AccWorkerNames]} + end, + {[], []}, + lists:seq(1, Config#config.sync_jobs) + ), + {Workers, WorkerNames}. + %% @doc Returns true if syncing is enabled (i.e. sync_jobs > 0). is_syncing_enabled() -> {ok, Config} = application:get_env(arweave, config), @@ -62,15 +88,6 @@ ready_for_work() -> false end. -read_range(Start, End, OriginStoreID, TargetStoreID, SkipSmall, Caller, TaskRef) -> - case ar_data_sync_worker_master:ready_for_work() of - true -> - Args = {Start, End, OriginStoreID, TargetStoreID, SkipSmall, Caller, TaskRef}, - gen_server:cast(?MODULE, {read_range, Args}), - true; - false -> - false - end. %%%=================================================================== %%% Generic server callbacks. %%%=================================================================== @@ -100,33 +117,19 @@ handle_cast(process_main_queue, State) -> ar_util:cast_after(200, ?MODULE, process_main_queue), {noreply, process_main_queue(State)}; -handle_cast({read_range, _Args}, #state{ worker_count = 0 } = State) -> - {noreply, State}; -handle_cast({read_range, Args}, State) -> - {noreply, enqueue_main_task(read_range, Args, State)}; - handle_cast({sync_range, _Args}, #state{ worker_count = 0 } = State) -> {noreply, State}; handle_cast({sync_range, Args}, State) -> {noreply, enqueue_main_task(sync_range, Args, State)}; -handle_cast({task_completed, {read_range, {Worker, _, Args}}}, State) -> - {_Start, _End, _OriginStoreID, _TargetStoreID, _SkipSmall, Caller, TaskRef} = Args, - case TaskRef of - sub_task -> - ok; - _ -> - Caller ! {ar_data_sync_worker_master_read_range_task_complete, TaskRef} - end, - State2 = update_scheduled_task_count(Worker, read_range, "localhost", -1, State), - {noreply, State2}; - handle_cast({task_completed, {sync_range, {Worker, Result, Args, ElapsedNative}}}, State) -> {Start, End, Peer, _, _} = Args, DataSize = End - Start, - State2 = update_scheduled_task_count(Worker, sync_range, ar_util:format_peer(Peer), -1, State), + State2 = update_scheduled_task_count( + Worker, sync_range, ar_util:format_peer(Peer), -1, State), PeerTasks = get_peer_tasks(Peer, State2), - {PeerTasks2, State3} = complete_sync_range(PeerTasks, Result, ElapsedNative, DataSize, State2), + {PeerTasks2, State3} = complete_sync_range( + PeerTasks, Result, ElapsedNative, DataSize, State2), {PeerTasks3, State4} = process_peer_queue(PeerTasks2, State3), {noreply, set_peer_tasks(PeerTasks3, State4)}; @@ -165,8 +168,6 @@ process_main_queue(#state{ task_queue_len = 0 } = State) -> process_main_queue(State) -> {Task, Args, State2} = dequeue_main_task(State), State4 = case Task of - read_range -> - schedule_read_range(Args, State2); sync_range -> {_Start, _End, Peer, _TargetStoreID} = Args, PeerTasks = get_peer_tasks(Peer, State2), @@ -302,27 +303,7 @@ schedule_sync_range(PeerTasks, Args, State) -> PeerTasks2 = PeerTasks#peer_tasks{ active_count = PeerTasks#peer_tasks.active_count + 1 }, {PeerTasks2, State2}. 
-schedule_read_range(Args, State) -> - {Start, End, OriginStoreID, TargetStoreID, SkipSmall, Caller, TaskRef} = Args, - End2 = min(Start + (?READ_RANGE_CHUNKS * ?DATA_CHUNK_SIZE), End), - TaskRef2 = - case End2 == End of - true -> - TaskRef; - false -> - sub_task - end, - Args2 = {Start, End2, OriginStoreID, TargetStoreID, SkipSmall, Caller, TaskRef2}, - State2 = schedule_task(read_range, Args2, State), - case End2 == End of - true -> - State2; - false -> - Args3 = {End2, End, OriginStoreID, TargetStoreID, SkipSmall, Caller, TaskRef}, - push_main_task(read_range, Args3, State2) - end. - -%% @doc Schedule a task (either sync_range or read_range) to be run on a worker. +%% @doc Schedule a task to be run on a worker. schedule_task(Task, Args, State) -> {Worker, State2} = get_worker(State), gen_server:cast(Worker, {Task, Args}), @@ -475,7 +456,6 @@ cycle_workers(AverageLoad, #state{ workers = Workers, worker_loads = WorkerLoads format_peer(Task, Args) -> case Task of - read_range -> "localhost"; sync_range -> ar_util:format_peer(element(3, Args)) end. @@ -549,9 +529,6 @@ test_get_worker() -> {worker1, _} = get_worker(State6). test_format_peer() -> - ?assertEqual("localhost", - format_peer(read_range, {0, 100, 1, 2, true, self(), make_ref()})), - ?assertEqual("localhost", format_peer(read_range, undefined)), ?assertEqual("1.2.3.4:1984", format_peer(sync_range, {0, 100, {1, 2, 3, 4, 1984}, 2})). test_enqueue_main_task() -> @@ -562,27 +539,19 @@ test_enqueue_main_task() -> State0 = #state{}, Ref = make_ref(), - State1 = enqueue_main_task(read_range, - {0, 100, StoreID1, StoreID2, true, self(), Ref}, State0), - State2 = enqueue_main_task(sync_range, {0, 100, Peer1, StoreID1}, State1), - State3 = push_main_task(sync_range, {100, 200, Peer2, StoreID2}, State2), + State1 = enqueue_main_task(sync_range, {0, 100, Peer1, StoreID1}, State0), + State2 = push_main_task(sync_range, {100, 200, Peer2, StoreID2}, State1), assert_main_queue([ {sync_range, {100, 200, Peer2, StoreID2}}, - {read_range, {0, 100, StoreID1, StoreID2, true, self(), Ref}}, {sync_range, {0, 100, Peer1, StoreID1}} - ], State3), - ?assertEqual(3, State3#state.queued_task_count), + ], State2), + ?assertEqual(2, State2#state.queued_task_count), - {Task1, Args1, State4} = dequeue_main_task(State3), + {Task1, Args1, State3} = dequeue_main_task(State2), assert_task(sync_range, {100, 200, Peer2, StoreID2}, Task1, Args1), - {Task2, Args2, State5} = dequeue_main_task(State4), - assert_task(read_range, {0, 100, StoreID1, StoreID2, true, self(), Ref}, Task2, Args2), - assert_main_queue([ - {sync_range, {0, 100, Peer1, StoreID1}} - ], State5), %% queued_task_count isn't decremented until we schedule tasks - ?assertEqual(3, State5#state.queued_task_count). + ?assertEqual(2, State3#state.queued_task_count). 
test_enqueue_peer_task() -> PeerA = {1, 2, 3, 4, 1984}, @@ -617,38 +586,32 @@ test_process_main_queue() -> Peer1 = {1, 2, 3, 4, 1984}, Peer2 = {5, 6, 7, 8, 1985}, StoreID1 = ar_storage_module:id({?PARTITION_SIZE, 1, default}), - StoreID2 = ar_storage_module:id({?PARTITION_SIZE, 2, default}), State0 = #state{ workers = queue:from_list([worker1, worker2, worker3]), worker_count = 3 }, - State1 = enqueue_main_task(read_range, - {0, 100, StoreID1, StoreID2, true, self(), make_ref()}, State0), - State2 = enqueue_main_task(sync_range, {0, 100, Peer1, StoreID1}, State1), - State3 = enqueue_main_task(sync_range, {100, 200, Peer1, StoreID1}, State2), - State4 = enqueue_main_task(sync_range, {200, 300, Peer1, StoreID1}, State3), - State5 = enqueue_main_task(sync_range, {300, 400, Peer1, StoreID1}, State4), - State6 = enqueue_main_task(sync_range, {400, 500, Peer1, StoreID1}, State5), - State7 = enqueue_main_task(sync_range, {500, 600, Peer1, StoreID1}, State6), - State8 = enqueue_main_task(sync_range, {600, 700, Peer1, StoreID1}, State7), - State9 = enqueue_main_task(sync_range, {700, 800, Peer1, StoreID1}, State8), + State1 = enqueue_main_task(sync_range, {0, 100, Peer1, StoreID1}, State0), + State2 = enqueue_main_task(sync_range, {100, 200, Peer1, StoreID1}, State1), + State3 = enqueue_main_task(sync_range, {200, 300, Peer1, StoreID1}, State2), + State4 = enqueue_main_task(sync_range, {300, 400, Peer1, StoreID1}, State3), + State5 = enqueue_main_task(sync_range, {400, 500, Peer1, StoreID1}, State4), + State6 = enqueue_main_task(sync_range, {500, 600, Peer1, StoreID1}, State5), + State7 = enqueue_main_task(sync_range, {600, 700, Peer1, StoreID1}, State6), + State8 = enqueue_main_task(sync_range, {700, 800, Peer1, StoreID1}, State7), %% 9th task queued for Peer1 won't be scheduled - State10 = enqueue_main_task(sync_range, {800, 900, Peer1, StoreID1}, State9), - State11 = enqueue_main_task(sync_range, {900, 1000, Peer2, StoreID1}, State10), - State12 = enqueue_main_task(sync_range, {1000, 1100, Peer2, StoreID1}, State11), - %% Will get split into 2 tasks when processed - State13 = enqueue_main_task( - read_range, {100, 20 * 262144, StoreID1, StoreID2, true, self(), make_ref()}, State12), - ?assertEqual(13, State13#state.queued_task_count), - ?assertEqual(0, State13#state.scheduled_task_count), - - State14 = process_main_queue(State13), - assert_main_queue([], State14), - ?assertEqual(1, State14#state.queued_task_count), - ?assertEqual(13, State14#state.scheduled_task_count), - ?assertEqual([worker2, worker3, worker1], queue:to_list(State14#state.workers)), - - PeerTasks = get_peer_tasks(Peer1, State14), + State9 = enqueue_main_task(sync_range, {800, 900, Peer1, StoreID1}, State8), + State10 = enqueue_main_task(sync_range, {900, 1000, Peer2, StoreID1}, State9), + State11 = enqueue_main_task(sync_range, {1000, 1100, Peer2, StoreID1}, State10), + ?assertEqual(11, State11#state.queued_task_count), + ?assertEqual(0, State11#state.scheduled_task_count), + + State12 = process_main_queue(State11), + assert_main_queue([], State12), + ?assertEqual(1, State12#state.queued_task_count), + ?assertEqual(10, State12#state.scheduled_task_count), + ?assertEqual([worker2, worker3, worker1], queue:to_list(State12#state.workers)), + + PeerTasks = get_peer_tasks(Peer1, State12), assert_peer_tasks( [{sync_range, {800, 900, Peer1, StoreID1}}], 8, 8, PeerTasks). 
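For reference, a read_range task now travels as a plain {Start, End, OriginStoreID, TargetStoreID} tuple and its completion is reported to ar_chunk_copy rather than to the worker master (see the ar_data_sync_worker changes above). ar_chunk_copy itself is not part of this excerpt, so the coordinator side below is only a hypothetical sketch of the visible protocol; note_completion/4 is a placeholder.

%% Coordinator-side sketch (hypothetical; ar_chunk_copy's real implementation
%% is not shown in this diff). It mirrors the worker protocol above.
dispatch_read_range(WorkerName, Start, End, OriginStoreID, TargetStoreID) ->
    gen_server:cast(WorkerName,
        {read_range, {Start, End, OriginStoreID, TargetStoreID}}).

%% Inside the coordinator's gen_server:
handle_cast({task_completed, {read_range, {WorkerName, ReadResult, Args}}}, State) ->
    %% ReadResult is the worker's read_range/1 result; Args is the original 4-tuple.
    {noreply, note_completion(WorkerName, ReadResult, Args, State)}.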
diff --git a/apps/arweave/src/ar_device_lock.erl b/apps/arweave/src/ar_device_lock.erl new file mode 100644 index 000000000..1a240400f --- /dev/null +++ b/apps/arweave/src/ar_device_lock.erl @@ -0,0 +1,480 @@ +-module(ar_device_lock). + +-behaviour(gen_server). + +-export([get_store_id_to_device_map/0, is_ready/0, acquire_lock/3, release_lock/2]). + +-export([start_link/0, init/1, handle_call/3, handle_info/2, handle_cast/2]). + +-include("../include/ar.hrl"). +-include("../include/ar_config.hrl"). + +-include_lib("eunit/include/eunit.hrl"). + +-record(state, { + store_id_to_device = #{}, + device_locks = #{}, + initialized = false, + num_replica_2_9_workers = 0 +}). + +-type device_mode() :: prepare | sync | repack. + +-ifdef(AR_TEST). +-define(DEVICE_LOCK_LOG_INTERVAL_MS, 10_000). %% 10 seconds +-else. +-define(DEVICE_LOCK_LOG_INTERVAL_MS, 600_000). %% 10 minutes +-endif. + +%%%=================================================================== +%%% Public interface. +%%%=================================================================== + +get_store_id_to_device_map() -> + case catch gen_server:call(?MODULE, get_state) of + {'EXIT', {Reason, {gen_server, call, _}}} -> + {error, Reason}; + State -> + State#state.store_id_to_device + end. + +is_ready() -> + case catch gen_server:call(?MODULE, get_state) of + {'EXIT', {Reason, {gen_server, call, _}}} -> + ?LOG_WARNING([{event, error_getting_device_lock_state}, + {module, ?MODULE}, {reason, Reason}]), + false; + State -> + State#state.initialized + end. + +%% @doc Helper function to wrap common logic around acquiring a device lock. +-spec acquire_lock(device_mode(), string(), atom()) -> atom(). +acquire_lock(Mode, StoreID, CurrentStatus) -> + NewStatus = case CurrentStatus of + _ when CurrentStatus == complete; CurrentStatus == off -> + % No change needed when we're done or off. + CurrentStatus; + _ -> + case catch gen_server:call(?MODULE, {acquire_lock, Mode, StoreID}) of + {'EXIT', {Reason, {gen_server, call, _}}} -> + ?LOG_WARNING([{event, error_acquiring_device_lock}, + {module, ?MODULE}, {reason, Reason}]), + CurrentStatus; + true -> + active; + false -> + paused + end + end, + + case NewStatus == CurrentStatus of + true -> + ok; + false -> + ?LOG_INFO([{event, acquire_device_lock}, {mode, Mode}, {store_id, StoreID}, + {old_status, CurrentStatus}, {new_status, NewStatus}]) + end, + NewStatus. + +release_lock(Mode, StoreID) -> + gen_server:cast(?MODULE, {release_lock, Mode, StoreID}). + +%%%=================================================================== +%%% Generic server callbacks. +%%%=================================================================== +start_link() -> + gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + +init([]) -> + gen_server:cast(self(), initialize_state), + {ok, Config} = application:get_env(arweave, config), + ?LOG_INFO([{event, starting_device_lock_server}, + {num_replica_2_9_workers, Config#config.replica_2_9_workers}]), + {ok, #state{num_replica_2_9_workers = Config#config.replica_2_9_workers}}. + +handle_call(get_state, _From, State) -> + {reply, State, State}; +handle_call({acquire_lock, Mode, StoreID}, _From, State) -> + case State#state.initialized of + false -> + % Not yet initialized. + {reply, false, State}; + _ -> + {Acquired, State2} = do_acquire_lock(Mode, StoreID, State), + {reply, Acquired, State2} + end; +handle_call(Request, _From, State) -> + ?LOG_WARNING([{event, unhandled_call}, {module, ?MODULE}, {request, Request}]), + {reply, ok, State}. 
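%% Illustrative caller-side sketch, not part of the patch: acquire_lock/3 is
%% written so a worker can re-check its device lock on every pass and treat
%% the returned atom as its new status. Only the ar_device_lock calls come
%% from this module; prepare_tick/2, do_prepare_step/1 and its done/more
%% return values are hypothetical placeholders.
prepare_tick(StoreID, Status) ->
    case ar_device_lock:acquire_lock(prepare, StoreID, Status) of
        active ->
            %% We hold the device: do one unit of work.
            case do_prepare_step(StoreID) of
                done ->
                    ar_device_lock:release_lock(prepare, StoreID),
                    complete;
                more ->
                    active
            end;
        paused ->
            %% Another store ID owns the device right now; try again later.
            paused;
        Other ->
            %% complete or off: nothing left to do for this store ID.
            Other
    end.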
+ +handle_cast(initialize_state, State) -> + State2 = case ar_node:is_joined() of + false -> + ar_util:cast_after(1000, self(), initialize_state), + State; + true -> + initialize_state(State) + end, + {noreply, State2}; +handle_cast({release_lock, Mode, StoreID}, State) -> + case State#state.initialized of + false -> + % Not yet initialized. + {noreply, State}; + _ -> + State2 = do_release_lock(Mode, StoreID, State), + ?LOG_INFO([{event, release_device_lock}, {mode, Mode}, {store_id, StoreID}]), + {noreply, State2} + end; +handle_cast(log_device_locks, State) -> + log_device_locks(State), + ar_util:cast_after(?DEVICE_LOCK_LOG_INTERVAL_MS, ?MODULE, log_device_locks), + {noreply, State}; +handle_cast(Request, State) -> + ?LOG_WARNING([{event, unhandled_cast}, {module, ?MODULE}, {request, Request}]), + {noreply, State}. + + +handle_info(Message, State) -> + ?LOG_WARNING([{event, unhandled_info}, {module, ?MODULE}, {message, Message}]), + {noreply, State}. + +%%%=================================================================== +%%% Private functions. +%%%=================================================================== + +initialize_state(State) -> + {ok, Config} = application:get_env(arweave, config), + StorageModules = Config#config.storage_modules, + RepackInPlaceModules = [element(1, El) + || El <- Config#config.repack_in_place_storage_modules], + StoreIDToDevice = lists:foldl( + fun(Module, Acc) -> + StoreID = ar_storage_module:id(Module), + Device = get_system_device(Module), + ?LOG_INFO([ + {event, storage_module_device}, {store_id, StoreID}, {device, Device}]), + maps:put(StoreID, Device, Acc) + end, + #{}, + StorageModules ++ RepackInPlaceModules + ), + State2 = State#state{ + store_id_to_device = StoreIDToDevice, + initialized = true + }, + + log_device_locks(State2), + ar_util:cast_after(?DEVICE_LOCK_LOG_INTERVAL_MS, ?MODULE, log_device_locks), + + State2. + +get_system_device(StorageModule) -> + {ok, Config} = application:get_env(arweave, config), + StoreID = ar_storage_module:id(StorageModule), + Path = ar_chunk_storage:get_chunk_storage_path(Config#config.data_dir, StoreID), + Device = ar_util:get_system_device(Path), + case Device of + "" -> StoreID; % If the command fails or returns an empty string, return StoreID + _ -> Device + end. 
+ +do_acquire_lock(Mode, StoreID, State) -> + MaxPrepareLocks = State#state.num_replica_2_9_workers, + Device = maps:get(StoreID, State#state.store_id_to_device), + DeviceLock = maps:get(Device, State#state.device_locks, sync), + PrepareLocks = count_prepare_locks(State), + {Acquired, NewDeviceLock} = case Mode of + sync -> + %% Can only aquire a sync lock if the device is in sync mode + case DeviceLock of + sync -> {true, sync}; + _ -> {false, DeviceLock} + end; + prepare -> + %% Can only acquire a prepare lock if the device is in sync mode or this + %% StoreID already has the prepare lock + case {DeviceLock, PrepareLocks} of + {sync, _} when PrepareLocks < MaxPrepareLocks -> {true, {prepare, StoreID}}; + {{prepare, StoreID}, _} -> {true, DeviceLock}; + {{prepare_and_repack, StoreID}, _} -> {true, DeviceLock}; + {{repack, StoreID}, _} when PrepareLocks < MaxPrepareLocks -> {true, {prepare_and_repack, StoreID}}; + _ -> {false, DeviceLock} + end; + repack -> + %% Can only acquire a repack lock if the device is in sync mode or this + %% StoreID already has the repack lock + case {DeviceLock, PrepareLocks} of + {sync, _} when PrepareLocks < MaxPrepareLocks -> {true, {repack, StoreID}}; + {{prepare, StoreID}, _} -> {true, {prepare_and_repack, StoreID}}; + {{prepare_and_repack, StoreID}, _} -> {true, DeviceLock}; + {{repack, StoreID}, _} -> {true, {repack, StoreID}}; + _ -> {false, DeviceLock} + end + end, + + DeviceLocks = maps:put(Device, NewDeviceLock, State#state.device_locks), + {Acquired, State#state{device_locks = DeviceLocks}}. + +do_release_lock(Mode, StoreID, State) -> + Device = maps:get(StoreID, State#state.store_id_to_device), + DeviceLock = maps:get(Device, State#state.device_locks, sync), + NewDeviceLock = case Mode of + sync -> + %% Releasing a sync lock does nothing. + DeviceLock; + prepare -> + case DeviceLock of + {prepare, StoreID} -> + %% This StoreID had a prepare lock on this device, so now we can + %% put the device back in sync mode so it's ready to be locked again + %% if needed. + sync; + {prepare_and_repack, StoreID} -> + {repack, StoreID}; + _ -> + %% We should only be able to release a prepare lock if we previously + %% held it. If we hit this branch something is wrong. + ?LOG_WARNING([{event, invalid_release_lock}, + {module, ?MODULE}, {mode, Mode}, {store_id, StoreID}, + {current_lock, DeviceLock}]), + DeviceLock + end; + repack -> + case DeviceLock of + {repack, StoreID} -> + %% This StoreID had a repack lock on this device, so now we can + %% put the device back in sync mode so it's ready to be locked again + %% if needed. + sync; + {prepare_and_repack, StoreID} -> + {prepare, StoreID}; + _ -> + %% We should only be able to release a repack lock if we previously + %% held it. If we hit this branch something is wrong. + ?LOG_WARNING([{event, invalid_release_lock}, + {module, ?MODULE}, {mode, Mode}, {store_id, StoreID}, + {current_lock, DeviceLock}]), + DeviceLock + end + end, + + DeviceLocks = maps:put(Device, NewDeviceLock, State#state.device_locks), + State#state{device_locks = DeviceLocks}. + +count_prepare_locks(State) -> + maps:fold( + fun(_Device, Lock, Acc) -> + case Lock of + {prepare, _} -> Acc + 1; + {prepare_and_repack, _} -> Acc + 1; + _ -> Acc + end + end, + 0, + State#state.device_locks + ). 
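%% Worked example of the prepare cap enforced above via count_prepare_locks/1
%% and num_replica_2_9_workers. This is an illustrative sketch, written as if
%% it sat next to the tests below in this module; the store and device names
%% are made up. With a single allowed prepare lock already held by device2,
%% a device that is still in sync mode is refused a second one.
prepare_cap_example() ->
    State = #state{
        store_id_to_device = #{
            "storage_module_a_unpacked" => "device1",
            "storage_module_b_unpacked" => "device2"
        },
        device_locks = #{"device2" => {prepare, "storage_module_b_unpacked"}},
        num_replica_2_9_workers = 1
    },
    %% count_prepare_locks/1 returns 1, which is not below the cap of 1,
    %% so the sync-mode device1 is refused and stays in sync mode.
    {false, _State2} = do_acquire_lock(prepare, "storage_module_a_unpacked", State),
    ok.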
+ +log_device_locks(State) -> + StoreIDToDevice = State#state.store_id_to_device, + DeviceLocks = State#state.device_locks, + SortedStoreIDList = lists:sort( + fun({StoreID1, Device1}, {StoreID2, Device2}) -> + case Device1 =:= Device2 of + true -> StoreID1 =< StoreID2; + false -> Device1 < Device2 + end + end, + maps:to_list(StoreIDToDevice)), + lists:foreach( + fun({StoreID, Device}) -> + DeviceLock = maps:get(Device, DeviceLocks, sync), + Status = case DeviceLock of + sync -> sync; + {prepare, StoreID} -> prepare; + {repack, StoreID} -> repack; + {prepare_and_repack, StoreID} -> prepare_and_repack; + _ -> paused + end, + ?LOG_INFO([{event, device_lock_status}, {device, Device}, {store_id, StoreID}, {status, Status}]) + end, + SortedStoreIDList + ). + +%%%=================================================================== +%%% Tests. +%%%=================================================================== +device_locks_test_() -> + [ + {timeout, 30, fun test_acquire_lock/0}, + {timeout, 30, fun test_release_lock/0} + ]. + +test_acquire_lock() -> + State = #state{ + store_id_to_device = #{ + "storage_module_0_unpacked" => "device1", + "storage_module_1_unpacked" => "device1", + "storage_module_2_unpacked" => "device2", + "storage_module_3_unpacked" => "device2", + "storage_module_4_unpacked" => "device3", + "storage_module_5_unpacked" => "device3" + }, + device_locks = #{ + "device1" => sync, + "device2" => {prepare, "storage_module_2_unpacked"}, + "device3" => {repack, "storage_module_4_unpacked"} + }, + num_replica_2_9_workers = 2 + }, + + ?assertEqual( + {true, State}, + do_acquire_lock(sync, "storage_module_0_unpacked", State)), + ?assertEqual( + {false, State}, + do_acquire_lock(sync, "storage_module_2_unpacked", State)), + ?assertEqual( + {false, State}, + do_acquire_lock(sync, "storage_module_3_unpacked", State)), + ?assertEqual( + {false, State}, + do_acquire_lock(sync, "storage_module_4_unpacked", State)), + + + ?assertEqual( + {true, State#state{device_locks = #{ + "device1" => {prepare, "storage_module_0_unpacked"}, + "device2" => {prepare, "storage_module_2_unpacked"}, + "device3" => {repack, "storage_module_4_unpacked"} + }}}, + do_acquire_lock(prepare, "storage_module_0_unpacked", State)), + ?assertEqual( + {true, State}, + do_acquire_lock(prepare, "storage_module_2_unpacked", State)), + ?assertEqual( + {false, State}, + do_acquire_lock(prepare, "storage_module_3_unpacked", State)), + ?assertEqual( + {true, State#state{device_locks = #{ + "device1" => sync, + "device2" => {prepare, "storage_module_2_unpacked"}, + "device3" => {prepare_and_repack, "storage_module_4_unpacked"} + }}}, + do_acquire_lock(prepare, "storage_module_4_unpacked", State)), + + ?assertEqual( + {true, State#state{device_locks = #{ + "device1" => {repack, "storage_module_0_unpacked"}, + "device2" => {prepare, "storage_module_2_unpacked"}, + "device3" => {repack, "storage_module_4_unpacked"} + }}}, + do_acquire_lock(repack, "storage_module_0_unpacked", State)), + ?assertEqual( + {true, State#state{device_locks = #{ + "device1" => sync, + "device2" => {prepare_and_repack, "storage_module_2_unpacked"}, + "device3" => {repack, "storage_module_4_unpacked"} + }}}, + do_acquire_lock(repack, "storage_module_2_unpacked", State)), + ?assertEqual( + {false, State}, + do_acquire_lock(repack, "storage_module_3_unpacked", State)), + ?assertEqual( + {true, State}, + do_acquire_lock(repack, "storage_module_4_unpacked", State)), + ?assertEqual( + {false, State}, + do_acquire_lock(repack, "storage_module_5_unpacked", 
State)). + +test_release_lock() -> + State = #state{ + store_id_to_device = #{ + "storage_module_0_unpacked" => "device1", + "storage_module_1_unpacked" => "device1", + "storage_module_2_unpacked" => "device2", + "storage_module_3_unpacked" => "device2", + "storage_module_4_unpacked" => "device3", + "storage_module_5_unpacked" => "device3", + "storage_module_6_unpacked" => "device4" + }, + device_locks = #{ + "device1" => sync, + "device2" => {prepare, "storage_module_2_unpacked"}, + "device3" => {repack, "storage_module_4_unpacked"}, + "device4" => {prepare_and_repack, "storage_module_6_unpacked"} + } + }, + + ?assertEqual( + State, + do_release_lock(sync, "storage_module_0_unpacked", State)), + ?assertEqual( + State, + do_release_lock(sync, "storage_module_2_unpacked", State)), + ?assertEqual( + State, + do_release_lock(sync, "storage_module_3_unpacked", State)), + ?assertEqual( + State, + do_release_lock(sync, "storage_module_4_unpacked", State)), + ?assertEqual( + State, + do_release_lock(sync, "storage_module_6_unpacked", State)), + + ?assertEqual( + State, + do_release_lock(prepare, "storage_module_0_unpacked", State)), + ?assertEqual( + State#state{device_locks = #{ + "device1" => sync, + "device2" => sync, + "device3" => {repack, "storage_module_4_unpacked"}, + "device4" => {prepare_and_repack, "storage_module_6_unpacked"} + }}, + do_release_lock(prepare, "storage_module_2_unpacked", State)), + ?assertEqual( + State, + do_release_lock(prepare, "storage_module_3_unpacked", State)), + ?assertEqual( + State, + do_release_lock(prepare, "storage_module_4_unpacked", State)), + ?assertEqual( + State#state{device_locks = #{ + "device1" => sync, + "device2" => {prepare, "storage_module_2_unpacked"}, + "device3" => {repack, "storage_module_4_unpacked"}, + "device4" => {repack, "storage_module_6_unpacked"} + }}, + do_release_lock(prepare, "storage_module_6_unpacked", State)), + + ?assertEqual( + State, + do_release_lock(repack, "storage_module_0_unpacked", State)), + ?assertEqual( + State, + do_release_lock(repack, "storage_module_2_unpacked", State)), + ?assertEqual( + State, + do_release_lock(repack, "storage_module_3_unpacked", State)), + ?assertEqual( + State#state{device_locks = #{ + "device1" => sync, + "device2" => {prepare, "storage_module_2_unpacked"}, + "device3" => sync, + "device4" => {prepare_and_repack, "storage_module_6_unpacked"} + }}, + do_release_lock(repack, "storage_module_4_unpacked", State)), + ?assertEqual( + State, + do_release_lock(repack, "storage_module_5_unpacked", State)), + ?assertEqual( + State#state{device_locks = #{ + "device1" => sync, + "device2" => {prepare, "storage_module_2_unpacked"}, + "device3" => {repack, "storage_module_4_unpacked"}, + "device4" => {prepare, "storage_module_6_unpacked"} + }}, + do_release_lock(repack, "storage_module_6_unpacked", State)). + diff --git a/apps/arweave/src/ar_entropy_storage.erl b/apps/arweave/src/ar_entropy_storage.erl index 7ce85d0e0..242705ad2 100644 --- a/apps/arweave/src/ar_entropy_storage.erl +++ b/apps/arweave/src/ar_entropy_storage.erl @@ -1,14 +1,78 @@ -module(ar_entropy_storage). --export([is_entropy_packing/1, acquire_semaphore/1, release_semaphore/1, is_recorded/2, - is_sub_chunk_recorded/3, delete_record/2, generate_entropies/3, generate_missing_entropy/2, - generate_entropy_keys/3, shift_entropy_offset/2, store_entropy/8, record_chunk/6]). +-behaviour(gen_server). 
+ +-export([name/1, is_entropy_packing/1, acquire_semaphore/1, release_semaphore/1, is_ready/1, + is_entropy_recorded/2, delete_record/2, generate_entropies/2, + generate_missing_entropy/2, generate_entropy_keys/2, store_entropy/6, record_chunk/8]). + +-export([start_link/2, init/1, handle_cast/2, handle_call/3, handle_info/2, terminate/2]). -include("../include/ar.hrl"). -include("../include/ar_consensus.hrl"). -include_lib("eunit/include/eunit.hrl"). +-record(state, { + store_id +}). + +%%%=================================================================== +%%% Public interface. +%%%=================================================================== + +%% @doc Start the server. +start_link(Name, StoreID) -> + gen_server:start_link({local, Name}, ?MODULE, StoreID, []). + +%% @doc Return the name of the server serving the given StoreID. +name(StoreID) -> + list_to_atom("ar_entropy_storage_" ++ ar_storage_module:label_by_id(StoreID)). + +init(StoreID) -> + ?LOG_INFO([{event, ar_entropy_storage_init}, {name, name(StoreID)}, {store_id, StoreID}]), + {ok, #state{ store_id = StoreID }}. + +store_entropy( + StoreID, Entropies, BucketEndOffset, RangeEnd, Keys, RewardAddr) -> + BucketEndOffset2 = reset_entropy_offset(BucketEndOffset), + gen_server:cast(name(StoreID), {store_entropy, + Entropies, BucketEndOffset2, RangeEnd, Keys, RewardAddr}). + +is_ready(StoreID) -> + case catch gen_server:call(name(StoreID), is_ready, infinity) of + {'EXIT', {Reason, {gen_server, call, _}}} -> + ?LOG_WARNING([{event, is_ready_error}, {module, ?MODULE}, + {name, name(StoreID)}, {store_id, StoreID}, {reason, Reason}]), + false; + Reply -> + Reply + end. + +handle_cast( + {store_entropy, Entropies, BucketEndOffset, RangeEnd, Keys, RewardAddr}, State) -> + do_store_entropy(Entropies, BucketEndOffset, RangeEnd, Keys, RewardAddr, State), + {noreply, State}; +handle_cast(Cast, State) -> + ?LOG_WARNING([{event, unhandled_cast}, {module, ?MODULE}, {cast, Cast}]), + {noreply, State}. + +handle_call(is_ready, _From, State) -> + {reply, true, State}; +handle_call(Call, _From, State) -> + ?LOG_WARNING([{event, unhandled_call}, {module, ?MODULE}, {call, Call}]), + {reply, {error, unhandled_call}, State}. + +terminate(Reason, State) -> + ?LOG_INFO([{event, terminate}, {module, ?MODULE}, + {reason, Reason}, {name, name(State#state.store_id)}, + {store_id, State#state.store_id}]), + ok. + +handle_info(Info, State) -> + ?LOG_WARNING([{event, unhandled_info}, {module, ?MODULE}, {info, Info}]), + {noreply, State}. + -spec is_entropy_packing(ar_chunk_storage:packing()) -> boolean(). is_entropy_packing(unpacked_padded) -> true; @@ -17,39 +81,14 @@ is_entropy_packing({replica_2_9, _}) -> is_entropy_packing(_) -> false. -%% @doc Return true if the given sub-chunk bucket contains the 2.9 entropy. -is_sub_chunk_recorded(PaddedEndOffset, SubChunkBucketStartOffset, StoreID) -> +%% @doc Return true if the 2.9 entropy with the given offset is recorded. +is_entropy_recorded(PaddedEndOffset, StoreID) -> %% Entropy indexing changed between 2.9.0 and 2.9.1. So we'll use a new %% sync_record id (ar_chunk_storage_replica_2_9_1_entropy) going forward. %% The old id (ar_chunk_storage_replica_2_9_entropy) should not be used. ID = ar_chunk_storage_replica_2_9_1_entropy, ChunkBucketStart = ar_chunk_storage:get_chunk_bucket_start(PaddedEndOffset), - SubChunkBucketStart = ChunkBucketStart + SubChunkBucketStartOffset, - ar_sync_record:is_recorded(SubChunkBucketStart + 1, ID, StoreID). 
- -%% @doc Return true if the 2.9 entropy for every sub-chunk of the chunk with the -%% given offset (> start offset, =< end offset) is recorded. -%% We check every sub-chunk because the entropy is written on the sub-chunk level. -is_recorded(PaddedEndOffset, StoreID) -> - ChunkBucketStart = ar_chunk_storage:get_chunk_bucket_start(PaddedEndOffset), - is_recorded2(ChunkBucketStart, - ChunkBucketStart + ?DATA_CHUNK_SIZE, - StoreID). - -is_recorded2(Cursor, BucketEnd, _StoreID) when Cursor >= BucketEnd -> - true; -is_recorded2(Cursor, BucketEnd, StoreID) -> - %% Entropy indexing changed between 2.9.0 and 2.9.1. So we'll use a new - %% sync_record id (ar_chunk_storage_replica_2_9_1_entropy) going forward. - %% The old id (ar_chunk_storage_replica_2_9_entropy) should not be used. - ID = ar_chunk_storage_replica_2_9_1_entropy, - case ar_sync_record:is_recorded(Cursor + 1, ID, StoreID) of - false -> - false; - true -> - SubChunkSize = ?COMPOSITE_PACKING_SUB_CHUNK_SIZE, - is_recorded2(Cursor + SubChunkSize, BucketEnd, StoreID) - end. + ar_sync_record:is_recorded(ChunkBucketStart + 1, ID, StoreID). update_sync_records(IsComplete, PaddedEndOffset, StoreID, RewardAddr) -> %% Entropy indexing changed between 2.9.0 and 2.9.1. So we'll use a new @@ -59,15 +98,13 @@ update_sync_records(IsComplete, PaddedEndOffset, StoreID, RewardAddr) -> BucketEnd = ar_chunk_storage:get_chunk_bucket_end(PaddedEndOffset), BucketStart = ar_chunk_storage:get_chunk_bucket_start(PaddedEndOffset), ar_sync_record:add_async(replica_2_9_entropy, BucketEnd, BucketStart, ID, StoreID), - prometheus_counter:inc(replica_2_9_entropy_stored, [StoreID], ?DATA_CHUNK_SIZE), + prometheus_counter:inc(replica_2_9_entropy_stored, + [ar_storage_module:label_by_id(StoreID)], ?DATA_CHUNK_SIZE), case IsComplete of true -> + Packing = {replica_2_9, RewardAddr}, StartOffset = PaddedEndOffset - ?DATA_CHUNK_SIZE, - %% update_sync_records is only called when an unpacked_padded chunks has - %% been written to disk before entropy was generated. In this case we have - %% to remove the unpacked_padded sync record before we add the replica_2_9 - %% sync record. - ar_sync_record:delete(PaddedEndOffset, StartOffset, ar_data_sync, StoreID), + prometheus_counter:inc(chunks_stored, [ar_storage_module:packing_label(Packing), ar_storage_module:label_by_id(StoreID)]), ar_sync_record:add_async(replica_2_9_entropy_with_chunk, PaddedEndOffset, StartOffset, @@ -83,8 +120,6 @@ update_sync_records(IsComplete, PaddedEndOffset, StoreID, RewardAddr) -> ok end. - - delete_record(PaddedEndOffset, StoreID) -> %% Entropy indexing changed between 2.9.0 and 2.9.1. So we'll use a new %% sync_record id (ar_chunk_storage_replica_2_9_1_entropy) going forward. @@ -94,21 +129,38 @@ delete_record(PaddedEndOffset, StoreID) -> ar_sync_record:delete(BucketStart + ?DATA_CHUNK_SIZE, BucketStart, ID, StoreID). generate_missing_entropy(PaddedEndOffset, RewardAddr) -> - Entropies = generate_entropies(RewardAddr, PaddedEndOffset, 0), - EntropyIndex = ar_replica_2_9:get_slice_index(PaddedEndOffset), - take_combined_entropy_by_index(Entropies, EntropyIndex). + Entropies = generate_entropies(RewardAddr, PaddedEndOffset), + case Entropies of + {error, Reason} -> + {error, Reason}; + _ -> + EntropyIndex = ar_replica_2_9:get_slice_index(PaddedEndOffset), + take_combined_entropy_by_index(Entropies, EntropyIndex) + end. %% @doc Returns all the entropies needed to encipher the chunk at PaddedEndOffset. 
-%% ar_packing_server:get_replica_2_9_entropy/3 will query a cached entropy, or generate it -%% if it is not cached. -generate_entropies(_RewardAddr, _PaddedEndOffset, SubChunkStart) - when SubChunkStart == ?DATA_CHUNK_SIZE -> - []; -generate_entropies(RewardAddr, PaddedEndOffset, SubChunkStart) -> +generate_entropies(RewardAddr, PaddedEndOffset) -> SubChunkSize = ?COMPOSITE_PACKING_SUB_CHUNK_SIZE, - [ar_packing_server:get_replica_2_9_entropy(RewardAddr, PaddedEndOffset, SubChunkStart) - | generate_entropies(RewardAddr, PaddedEndOffset, SubChunkStart + SubChunkSize)]. + EntropyTasks = lists:map( + fun(Offset) -> + Ref = make_ref(), + ar_packing_server:request_entropy_generation( + Ref, self(), {RewardAddr, PaddedEndOffset, Offset}), + Ref + end, + lists:seq(0, ?DATA_CHUNK_SIZE - SubChunkSize, SubChunkSize) + ), + Entropies = collect_entropies(EntropyTasks, []), + case Entropies of + {error, _Reason} -> + flush_entropy_messages(); + _ -> + ok + end, + Entropies. +generate_entropy_keys(RewardAddr, Offset) -> + generate_entropy_keys(RewardAddr, Offset, 0). generate_entropy_keys(_RewardAddr, _Offset, SubChunkStart) when SubChunkStart == ?DATA_CHUNK_SIZE -> []; @@ -117,119 +169,96 @@ generate_entropy_keys(RewardAddr, Offset, SubChunkStart) -> [ar_replica_2_9:get_entropy_key(RewardAddr, Offset, SubChunkStart) | generate_entropy_keys(RewardAddr, Offset, SubChunkStart + SubChunkSize)]. -store_entropy(_Entropies, +collect_entropies([], Acc) -> + lists:reverse(Acc); +collect_entropies([Ref | Rest], Acc) -> + receive + {entropy_generated, Ref, {error, Reason}} -> + ?LOG_ERROR([{event, failed_to_generate_replica_2_9_entropy}, {error, Reason}]), + {error, Reason}; + {entropy_generated, Ref, Entropy} -> + collect_entropies(Rest, [Entropy | Acc]) + after 60000 -> + ?LOG_ERROR([{event, entropy_generation_timeout}, {ref, Ref}]), + {error, timeout} + end. + +flush_entropy_messages() -> + ?LOG_INFO([{event, flush_entropy_messages}]), + receive + {entropy_generated, _, _} -> + flush_entropy_messages() + after 0 -> + ok + end. + +do_store_entropy(_Entropies, BucketEndOffset, - _SubChunkStartOffset, RangeEnd, _Keys, _RewardAddr, - N, - WaitN) + _State) when BucketEndOffset > RangeEnd -> %% The amount of entropy generated per partition is slightly more than the amount needed. %% So at the end of a partition we will have finished processing chunks, but still have %% some entropy left. In this case we stop the recursion early and wait for the writes %% to complete. - wait_store_entropy_processes(WaitN), - {ok, N}; -store_entropy(Entropies, + ok; +do_store_entropy(Entropies, BucketEndOffset, - SubChunkStartOffset, RangeEnd, Keys, RewardAddr, - N, - WaitN) -> + State) -> case take_and_combine_entropy_slices(Entropies) of {<<>>, []} -> %% We've finished processing all the entropies, wait for the writes to complete. 
- wait_store_entropy_processes(WaitN), - {ok, N}; + ok; {ChunkEntropy, Rest} -> + %% Sanity checks true = ar_replica_2_9:get_entropy_partition(BucketEndOffset) == ar_replica_2_9:get_entropy_partition(RangeEnd), - sanity_check_replica_2_9_entropy_keys(BucketEndOffset, - RewardAddr, - SubChunkStartOffset, - Keys), - FindModules = - case ar_storage_module:get_all_packed(BucketEndOffset, {replica_2_9, RewardAddr}) of - [] -> - ?LOG_WARNING([{event, failed_to_find_storage_modules_for_2_9_entropy}, - {padded_end_offset, BucketEndOffset}]), - not_found; - StoreIDs -> - {ok, StoreIDs} - end, - case FindModules of - not_found -> - BucketEndOffset2 = shift_entropy_offset(BucketEndOffset, 1), - store_entropy(Rest, - BucketEndOffset2, - SubChunkStartOffset, - RangeEnd, - Keys, - RewardAddr, - N, - WaitN); - {ok, StoreIDs2} -> - From = self(), - WaitN2 = lists:foldl(fun(StoreID2, WaitNAcc) -> - spawn_link(fun() -> - StartTime = erlang:monotonic_time(), - - record_entropy(ChunkEntropy, - BucketEndOffset, - StoreID2, - RewardAddr), - - EndTime = erlang:monotonic_time(), - ElapsedTime = - erlang:convert_time_unit(EndTime - StartTime, - native, - microsecond), - %% bytes per second - WriteRate = - case ElapsedTime > 0 of - true -> 1000000 * byte_size(ChunkEntropy) div ElapsedTime; - false -> 0 - end, - prometheus_gauge:set(replica_2_9_entropy_store_rate, - [StoreID2], - WriteRate), - From ! {store_entropy_sub_chunk_written, WaitNAcc + 1} - end), - WaitNAcc + 1 - end, - WaitN, - StoreIDs2 - ), - BucketEndOffset2 = shift_entropy_offset(BucketEndOffset, 1), - store_entropy(Rest, - BucketEndOffset2, - SubChunkStartOffset, - RangeEnd, - Keys, - RewardAddr, - N + length(Keys), - WaitN2) - end + sanity_check_replica_2_9_entropy_keys(BucketEndOffset, RewardAddr, Keys), + %% End sanity checks + + #state{ store_id = StoreID } = State, + record_entropy( + ChunkEntropy, + BucketEndOffset, + StoreID, + RewardAddr), + + %% Jump to the next sector covered by this entropy. + BucketEndOffset2 = shift_entropy_offset(BucketEndOffset, 1), + do_store_entropy( + Rest, + BucketEndOffset2, + RangeEnd, + Keys, + RewardAddr, + State) end. 
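%% Caller-side sketch of the new pipeline, not part of the patch: entropies
%% are generated asynchronously through ar_packing_server and collected by
%% generate_entropies/2, then handed to the per-store server, which walks the
%% partition sector by sector as in do_store_entropy/6 above. prepare_entropy/4
%% is a hypothetical wrapper and assumes BucketEndOffset is already a
%% chunk-bucket end offset; error handling is minimal.
prepare_entropy(StoreID, RewardAddr, BucketEndOffset, RangeEnd) ->
    case ar_entropy_storage:generate_entropies(RewardAddr, BucketEndOffset) of
        {error, Reason} ->
            {error, Reason};
        Entropies ->
            Keys = ar_entropy_storage:generate_entropy_keys(RewardAddr, BucketEndOffset),
            %% Casts {store_entropy, ...} to the ar_entropy_storage server for
            %% StoreID; the disk writes happen there, off the caller's process.
            ar_entropy_storage:store_entropy(
                StoreID, Entropies, BucketEndOffset, RangeEnd, Keys, RewardAddr)
    end.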
-record_chunk(PaddedEndOffset, Chunk, RewardAddr, StoreID, FileIndex, IsPrepared) -> - StartOffset = PaddedEndOffset - ?DATA_CHUNK_SIZE, +record_chunk( + PaddedEndOffset, Chunk, RewardAddr, StoreID, + StoreIDLabel, PackingLabel, FileIndex, IsPrepared) -> + %% Sanity checks + true = PaddedEndOffset == ar_block:get_chunk_padded_offset(PaddedEndOffset), + %% End sanity checks + + StartOffset = ar_chunk_storage:get_chunk_bucket_start(PaddedEndOffset), {_ChunkFileStart, Filepath, _Position, _ChunkOffset} = ar_chunk_storage:locate_chunk_on_disk(PaddedEndOffset, StoreID), acquire_semaphore(Filepath), - CheckIsStoredAlready = + CheckIsChunkStoredAlready = ar_sync_record:is_recorded(PaddedEndOffset, ar_chunk_storage, StoreID), CheckIsEntropyRecorded = - case CheckIsStoredAlready of + case CheckIsChunkStoredAlready of true -> {error, already_stored}; false -> - is_recorded(PaddedEndOffset, StoreID) + is_entropy_recorded(PaddedEndOffset, StoreID) end, ReadEntropy = case CheckIsEntropyRecorded of @@ -245,44 +274,38 @@ record_chunk(PaddedEndOffset, Chunk, RewardAddr, StoreID, FileIndex, IsPrepared) true -> ar_chunk_storage:get(StartOffset, StartOffset, StoreID) end, - case ReadEntropy of + RecordChunk = case ReadEntropy of {error, _} = Error2 -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, error_recording_chunk_to_entropy_storage}, - {error, io_lib:format("~p", [Error2])}, - {padded_offset, PaddedEndOffset}, - {start_offset, StartOffset}, - {check_is_stored_already, CheckIsStoredAlready}, - {check_is_entropy_recorded, CheckIsEntropyRecorded}, - {store_id, StoreID}, - {filepath, Filepath} - ]), - release_semaphore(Filepath), Error2; not_found -> - release_semaphore(Filepath), {error, not_prepared_yet2}; missing_entropy -> Packing = {replica_2_9, RewardAddr}, + ?LOG_WARNING([{event, missing_entropy}, {padded_end_offset, PaddedEndOffset}, + {store_id, StoreID}, {packing, ar_serialize:encode_packing(Packing, true)}]), Entropy = generate_missing_entropy(PaddedEndOffset, RewardAddr), - PackedChunk = ar_packing_server:encipher_replica_2_9_chunk(Chunk, Entropy), - Result = ar_chunk_storage:record_chunk( - PaddedEndOffset, PackedChunk, Packing, StoreID, FileIndex), - release_semaphore(Filepath), - Result; + case Entropy of + {error, Reason} -> + {error, Reason}; + _ -> + PackedChunk = ar_packing_server:encipher_replica_2_9_chunk(Chunk, Entropy), + ar_chunk_storage:record_chunk( + PaddedEndOffset, PackedChunk, Packing, StoreID, + StoreIDLabel, PackingLabel, FileIndex) + end; no_entropy_yet -> - Result = ar_chunk_storage:record_chunk( - PaddedEndOffset, Chunk, unpacked_padded, StoreID, FileIndex), - release_semaphore(Filepath), - Result; + ar_chunk_storage:record_chunk( + PaddedEndOffset, Chunk, unpacked_padded, StoreID, + StoreIDLabel, PackingLabel, FileIndex); {_EndOffset, Entropy} -> Packing = {replica_2_9, RewardAddr}, PackedChunk = ar_packing_server:encipher_replica_2_9_chunk(Chunk, Entropy), - Result = ar_chunk_storage:record_chunk( - PaddedEndOffset, PackedChunk, Packing, StoreID, FileIndex), - release_semaphore(Filepath), - Result - end. + ar_chunk_storage:record_chunk( + PaddedEndOffset, PackedChunk, Packing, StoreID, + StoreIDLabel, PackingLabel, FileIndex) + end, + release_semaphore(Filepath), + RecordChunk. 
%% @doc Return the byte (>= ChunkStartOffset, < ChunkEndOffset) %% that necessarily belongs to the chunk stored @@ -290,15 +313,26 @@ record_chunk(PaddedEndOffset, Chunk, RewardAddr, StoreID, FileIndex, IsPrepared) get_chunk_byte_from_bucket_end(BucketEndOffset) -> case BucketEndOffset >= ?STRICT_DATA_SPLIT_THRESHOLD of true -> - ?STRICT_DATA_SPLIT_THRESHOLD - + ar_util:floor_int(BucketEndOffset - ?STRICT_DATA_SPLIT_THRESHOLD, - ?DATA_CHUNK_SIZE); + RelativeBucketEndOffset = BucketEndOffset - ?STRICT_DATA_SPLIT_THRESHOLD, + case RelativeBucketEndOffset rem ?DATA_CHUNK_SIZE of + 0 -> + %% The chunk beginning at this offset is the rightmost possible + %% chunk that will be routed to this bucket. + %% The chunk ending at this offset plus one is the leftmost possible + %% chunk routed to this bucket. + BucketEndOffset - ?DATA_CHUNK_SIZE; + _ -> + ?STRICT_DATA_SPLIT_THRESHOLD + + ar_util:floor_int(RelativeBucketEndOffset, ?DATA_CHUNK_SIZE) + end; false -> BucketEndOffset - 1 end. record_entropy(ChunkEntropy, BucketEndOffset, StoreID, RewardAddr) -> + %% Sanity checks true = byte_size(ChunkEntropy) == ?DATA_CHUNK_SIZE, + %% End sanity checks Byte = get_chunk_byte_from_bucket_end(BucketEndOffset), CheckUnpackedChunkRecorded = ar_sync_record:get_interval( @@ -309,9 +343,21 @@ record_entropy(ChunkEntropy, BucketEndOffset, StoreID, RewardAddr) -> not_found -> {false, BucketEndOffset}; {_IntervalEnd, IntervalStart} -> - {true, IntervalStart + EndOffset2 = IntervalStart + ar_util:floor_int(Byte - IntervalStart, ?DATA_CHUNK_SIZE) - + ?DATA_CHUNK_SIZE} + + ?DATA_CHUNK_SIZE, + case ar_chunk_storage:get_chunk_bucket_end(EndOffset2) of + BucketEndOffset -> + {true, EndOffset2}; + _ -> + %% This chunk is from a different bucket. It may happen near the + %% strict data split threshold where there is no single byte + %% unambiguosly determining the bucket the chunk will be routed to. + ?LOG_INFO([{event, record_entropy_read_chunk_from_another_bucket}, + {bucket_end_offset, BucketEndOffset}, + {chunk_end_offset, EndOffset2}]), + {false, BucketEndOffset} + end end, {ChunkFileStart, Filepath, _Position, _ChunkOffset} = @@ -327,27 +373,12 @@ record_entropy(ChunkEntropy, BucketEndOffset, StoreID, RewardAddr) -> true -> case ar_chunk_storage:get(Byte, Byte, StoreID) of not_found -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, unpacked_padded_chunk_not_found}, - {bucket_end_offset, BucketEndOffset}, - {byte, Byte}, - {store_id, StoreID}, - {filepath, Filepath}, - {is_unpacked_chunk_recorded, IsUnpackedChunkRecorded} - ]), {error, not_found}; {error, _} = Error -> - ?LOG_DEBUG([{event, details_failed_to_store_chunk}, - {context, unpacked_padded_chunk_error}, - {error, io_lib:format("~p", [Error])}, - {bucket_end_offset, BucketEndOffset}, - {byte, Byte}, - {store_id, StoreID}, - {filepath, Filepath}, - {is_unpacked_chunk_recorded, IsUnpackedChunkRecorded} - ]), Error; {_, UnpackedChunk} -> + ar_sync_record:delete( + EndOffset, EndOffset - ?DATA_CHUNK_SIZE, ar_data_sync, StoreID), ar_packing_server:encipher_replica_2_9_chunk(UnpackedChunk, ChunkEntropy) end; false -> @@ -356,11 +387,13 @@ record_entropy(ChunkEntropy, BucketEndOffset, StoreID, RewardAddr) -> %% to make sure we pass offset validation on read. 
ChunkEntropy end, + Result = case Chunk of {error, _} = Error2 -> Error2; _ -> - case ar_chunk_storage:write_chunk(EndOffset, Chunk, #{}, StoreID) of + WriteChunkResult = ar_chunk_storage:write_chunk(EndOffset, Chunk, #{}, StoreID), + case WriteChunkResult of {ok, Filepath} -> ets:insert(chunk_storage_file_index, {{ChunkFileStart, StoreID}, Filepath}), @@ -373,8 +406,9 @@ record_entropy(ChunkEntropy, BucketEndOffset, StoreID, RewardAddr) -> case Result of {error, Reason} -> - ?LOG_ERROR([{event, failed_to_store_replica_2_9_sub_chunk_entropy}, + ?LOG_ERROR([{event, failed_to_store_replica_2_9_chunk_entropy}, {filepath, Filepath}, + {byte, Byte}, {padded_end_offset, EndOffset}, {bucket_end_offset, BucketEndOffset}, {store_id, StoreID}, @@ -385,6 +419,19 @@ record_entropy(ChunkEntropy, BucketEndOffset, StoreID, RewardAddr) -> release_semaphore(Filepath). +%% @doc If we are not at the beginning of the entropy, shift the offset to +%% the left. store_entropy will traverse the entire 2.9 partition shifting +%% the offset by sector size. It may happen some sub-chunks will be written +%% to the neighbouring storage module(s) on the left or on the right +%% since the storage module may be configured to be smaller than the +%% partition. +reset_entropy_offset(BucketEndOffset) -> + %% Sanity checks + BucketEndOffset = ar_chunk_storage:get_chunk_bucket_end(BucketEndOffset), + %% End sanity checks + + SliceIndex = ar_replica_2_9:get_slice_index(BucketEndOffset), + shift_entropy_offset(BucketEndOffset, -SliceIndex). %% @doc Take the first slice of each entropy and combine into a single binary. This binary %% can be used to encipher a single chunk. @@ -414,41 +461,35 @@ take_combined_entropy_by_index([], _Index, Acc) -> iolist_to_binary(Acc); take_combined_entropy_by_index([Entropy | Entropies], Index, Acc) -> SubChunkSize = ?COMPOSITE_PACKING_SUB_CHUNK_SIZE, - take_combined_entropy_by_index(Entropies, - Index, - [Acc, binary:part(Entropy, Index * SubChunkSize, SubChunkSize)]). - -sanity_check_replica_2_9_entropy_keys(_PaddedEndOffset, - _RewardAddr, - _SubChunkStartOffset, - []) -> + take_combined_entropy_by_index( + Entropies, + Index, + [Acc, binary:part(Entropy, Index * SubChunkSize, SubChunkSize)]). + +sanity_check_replica_2_9_entropy_keys(PaddedEndOffset, RewardAddr, Keys) -> + sanity_check_replica_2_9_entropy_keys(PaddedEndOffset, RewardAddr, 0, Keys). + +sanity_check_replica_2_9_entropy_keys( + _PaddedEndOffset, _RewardAddr, _SubChunkStartOffset, []) -> ok; -sanity_check_replica_2_9_entropy_keys(PaddedEndOffset, - RewardAddr, - SubChunkStartOffset, - [Key | Keys]) -> - Key = ar_replica_2_9:get_entropy_key(RewardAddr, PaddedEndOffset, SubChunkStartOffset), +sanity_check_replica_2_9_entropy_keys( + PaddedEndOffset, RewardAddr, SubChunkStartOffset, [Key | Keys]) -> + Key = ar_replica_2_9:get_entropy_key(RewardAddr, PaddedEndOffset, SubChunkStartOffset), SubChunkSize = ?COMPOSITE_PACKING_SUB_CHUNK_SIZE, sanity_check_replica_2_9_entropy_keys(PaddedEndOffset, - RewardAddr, - SubChunkStartOffset + SubChunkSize, - Keys). - -wait_store_entropy_processes(0) -> - ok; -wait_store_entropy_processes(N) -> - receive - {store_entropy_sub_chunk_written, N} -> - wait_store_entropy_processes(N - 1) - end. + RewardAddr, + SubChunkStartOffset + SubChunkSize, + Keys). shift_entropy_offset(Offset, SectorCount) -> SectorSize = ar_replica_2_9:get_sector_size(), - ar_chunk_storage:get_chunk_bucket_end(ar_block:get_chunk_padded_offset(Offset + SectorSize * SectorCount)). 
+ ar_chunk_storage:get_chunk_bucket_end(Offset + SectorSize * SectorCount). acquire_semaphore(Filepath) -> case ets:insert_new(ar_entropy_storage, {{semaphore, Filepath}}) of false -> + ?LOG_DEBUG([ + {event, details_store_chunk}, {section, waiting_on_semaphore}, {filepath, Filepath}]), timer:sleep(20), acquire_semaphore(Filepath); true -> @@ -457,3 +498,71 @@ acquire_semaphore(Filepath) -> release_semaphore(Filepath) -> ets:delete(ar_entropy_storage, {semaphore, Filepath}). + +%%%=================================================================== +%%% Tests. +%%%=================================================================== + + +reset_entropy_offset_test() -> + ?assertEqual(786432, ar_replica_2_9:get_sector_size()), + ?assertEqual(786432, ?STRICT_DATA_SPLIT_THRESHOLD), + + %% Slice index of 0 means no shift (all offsets at or below the strict data split + %% threshold are not padded) + assert_reset_entropy_offset(262144, 0), + assert_reset_entropy_offset(262144, 1), + assert_reset_entropy_offset(262144, ?DATA_CHUNK_SIZE - 1), + assert_reset_entropy_offset(262144, ?DATA_CHUNK_SIZE), + assert_reset_entropy_offset(262144, ?DATA_CHUNK_SIZE + 1), + + assert_reset_entropy_offset(524288, ?DATA_CHUNK_SIZE * 2), + + assert_reset_entropy_offset(786432, ?DATA_CHUNK_SIZE * 3), + + %% Slice index of 1 shift down a sector + assert_reset_entropy_offset(262144, ?DATA_CHUNK_SIZE * 3 + 1), + assert_reset_entropy_offset(262144, ?DATA_CHUNK_SIZE * 4 - 1), + assert_reset_entropy_offset(262144, ?DATA_CHUNK_SIZE * 4), + + assert_reset_entropy_offset(524288, ?DATA_CHUNK_SIZE * 4 + 1), + assert_reset_entropy_offset(524288, ?DATA_CHUNK_SIZE * 5 - 1), + assert_reset_entropy_offset(524288, ?DATA_CHUNK_SIZE * 5), + + assert_reset_entropy_offset(786432, ?DATA_CHUNK_SIZE * 5 + 1), + assert_reset_entropy_offset(786432, ?DATA_CHUNK_SIZE * 6), + + %% Slice index of 2 shift down 2 sectors + assert_reset_entropy_offset(262144, ?DATA_CHUNK_SIZE * 7), + assert_reset_entropy_offset(524288, ?DATA_CHUNK_SIZE * 8), + + %% First chunk of new partition, restart slice index at 0, so no shift + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 9, ?DATA_CHUNK_SIZE * 8 + 1), + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 9, ?DATA_CHUNK_SIZE * 9 - 1), + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 9, ?DATA_CHUNK_SIZE * 9), + + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 10, ?DATA_CHUNK_SIZE * 9 + 1), + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 10, ?DATA_CHUNK_SIZE * 10), + + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 11, ?DATA_CHUNK_SIZE * 11), + + %% Slice index of 1 shift down a sector + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 9, ?DATA_CHUNK_SIZE * 12), + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 10, ?DATA_CHUNK_SIZE * 13), + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 11, ?DATA_CHUNK_SIZE * 14), + + %% Slice index of 2 shift down 2 sectors + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 9, ?DATA_CHUNK_SIZE * 15), + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 10, ?DATA_CHUNK_SIZE * 16), + + %% First chunk of new partition, restart slice index at 0, so no shift + assert_reset_entropy_offset(?DATA_CHUNK_SIZE * 17, ?DATA_CHUNK_SIZE * 17). + +assert_reset_entropy_offset(ExpectedShiftedOffset, Offset) -> + BucketEndOffset = ar_chunk_storage:get_chunk_bucket_end(Offset), + ?assertEqual( + ExpectedShiftedOffset, + reset_entropy_offset(BucketEndOffset), + iolist_to_binary(io_lib:format("Offset: ~p, BucketEndOffset: ~p", + [Offset, BucketEndOffset])) + ). 
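The reset_entropy_offset_test/0 assertions above encode a simple piece of arithmetic: shift a bucket end offset left by one sector per slice index. The following standalone sketch reproduces those expectations under the test configuration only (256 KiB chunks, a 786432-byte sector, and the 8-chunk partition implied by the assertions); the module and function names are illustrative, and it omits the wrap-around modulo the number of sub-chunks per entropy that ar_replica_2_9:get_slice_index/1 applies.

    -module(entropy_offset_sketch).
    -export([reset/1]).

    -define(CHUNK, 262144).           %% ?DATA_CHUNK_SIZE in the test setup
    -define(SECTOR, (3 * ?CHUNK)).    %% ar_replica_2_9:get_sector_size() = 786432
    -define(PARTITION, (8 * ?CHUNK)). %% partition span implied by the test cases

    %% Shift a bucket end offset back to the first slice of its entropy:
    %% subtract one sector per slice index, where the slice index counts how
    %% many whole sectors into its partition the offset falls.
    reset(BucketEndOffset) ->
        PartitionRelativeOffset = (BucketEndOffset - 1) rem ?PARTITION,
        SliceIndex = PartitionRelativeOffset div ?SECTOR,
        BucketEndOffset - SliceIndex * ?SECTOR.

For example, reset(4 * 262144) returns 262144, matching assert_reset_entropy_offset(262144, ?DATA_CHUNK_SIZE * 4) above.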
diff --git a/apps/arweave/src/ar_events_sup.erl b/apps/arweave/src/ar_events_sup.erl index bc734ff8f..92297951f 100644 --- a/apps/arweave/src/ar_events_sup.erl +++ b/apps/arweave/src/ar_events_sup.erl @@ -49,8 +49,6 @@ init([]) -> %% Events: initialized, valid, invalid, validation_error, refuse_validation, %% computed_output. ?CHILD(ar_events, nonce_limiter, worker), - %% Events: found_solution. - ?CHILD(ar_events, miner, worker), %% Events: removed_file. ?CHILD(ar_events, chunk_storage, worker), %% Events: add_range, remove_range, global_remove_range, cut, global_cut. diff --git a/apps/arweave/src/ar_http_iface_client.erl b/apps/arweave/src/ar_http_iface_client.erl index 49895c6bc..121247017 100644 --- a/apps/arweave/src/ar_http_iface_client.erl +++ b/apps/arweave/src/ar_http_iface_client.erl @@ -10,8 +10,9 @@ get_block/3, get_tx/2, get_txs/2, get_tx_from_remote_peer/2, get_tx_data/2, get_wallet_list_chunk/2, get_wallet_list_chunk/3, get_wallet_list/2, add_peer/1, get_info/1, get_info/2, get_peers/1, - get_time/2, get_height/1, get_block_index/3, get_sync_record/1, - get_sync_record/3, get_chunk_binary/3, get_mempool/1, + get_time/2, get_height/1, get_block_index/3, + get_sync_record/1, get_sync_record/2, get_sync_record/3, + get_chunk_binary/3, get_mempool/1, get_sync_buckets/1, get_recent_hash_list/1, get_recent_hash_list_diff/2, get_reward_history/3, get_block_time_history/3, push_nonce_limiter_update/3, @@ -360,7 +361,14 @@ decode_block_index(Bin, json) -> end. get_sync_record(Peer) -> - Headers = [{<<"Content-Type">>, <<"application/etf">>}], + get_sync_record(Peer, binary). + +get_sync_record(Peer, Encoding) -> + ContentType = case Encoding of + binary -> <<"application/etf">>; + json -> <<"application/json">> + end, + Headers = [{<<"Content-Type">>, ContentType}], handle_sync_record_response(ar_http:req(#{ peer => Peer, method => get, diff --git a/apps/arweave/src/ar_http_iface_middleware.erl b/apps/arweave/src/ar_http_iface_middleware.erl index 0da74c534..82bb87c69 100644 --- a/apps/arweave/src/ar_http_iface_middleware.erl +++ b/apps/arweave/src/ar_http_iface_middleware.erl @@ -2007,6 +2007,9 @@ handle_get_chunk(OffsetBinary, Req, Encoding) -> case ar_sync_record:is_recorded(Offset, ar_data_sync) of false -> {none, {reply, {404, #{}, <<>>, Req}}}; + {true, _} -> + %% Chunk is recorded but packing is unknown. 
+ {none, {reply, {404, #{}, <<>>, Req}}}; {{true, RequestedPacking}, _StoreID} -> ok = ar_semaphore:acquire(get_chunk, infinity), {RequestedPacking, ok}; @@ -2051,13 +2054,21 @@ handle_get_chunk(OffsetBinary, Req, Encoding) -> {200, #{}, Reply, Req}; {error, chunk_not_found} -> {404, #{}, <<>>, Req}; + {error, invalid_padding} -> + {404, #{}, <<>>, Req}; {error, chunk_failed_validation} -> {404, #{}, <<>>, Req}; {error, chunk_stored_in_different_packing_only} -> {404, #{}, <<>>, Req}; {error, not_joined} -> not_joined(Req); - {error, failed_to_read_chunk} -> + {error, Error} -> + ?LOG_ERROR([{event, get_chunk_error}, {offset, Offset}, + {requested_packing, + ar_serialize:encode_packing(RequestedPacking, false)}, + {read_packing, + ar_serialize:encode_packing(ReadPacking, false)}, + {error, Error}]), {500, #{}, <<>>, Req} end end; diff --git a/apps/arweave/src/ar_kv.erl b/apps/arweave/src/ar_kv.erl index d0842ae94..19a4b350c 100644 --- a/apps/arweave/src/ar_kv.erl +++ b/apps/arweave/src/ar_kv.erl @@ -433,7 +433,6 @@ new_dbrec(CfNames, CfDescriptors, DataDirRelativePath, UserOptions) -> open(#db{db_handle = undefined, cf_descriptors = undefined, filepath = Filepath, db_options = DbOptions} = DbRec0) -> case rocksdb:open(Filepath, DbOptions) of {ok, Db} -> - ?LOG_DEBUG([{event, db_operation}, {op, open}, {name, io_lib:format("~p", [DbRec0#db.name])}]), DbRec1 = DbRec0#db{db_handle = Db}, true = ets:insert(?MODULE, DbRec1), ok; @@ -452,7 +451,6 @@ open(#db{ {ok, Db, Cfs} -> FirstDbRec = lists:foldr( fun({Cf, CfName}, _) -> - ?LOG_DEBUG([{event, db_operation}, {op, open}, {name, io_lib:format("~p", [CfName])}]), DbRec1 = DbRec0#db{name = CfName, db_handle = Db, cf_handle = Cf}, true = ets:insert(?MODULE, DbRec1), DbRec1 @@ -489,8 +487,7 @@ close(#db{db_handle = Db, name = Name}) -> try case rocksdb:close(Db) of ok -> - true = ets:match_delete(?MODULE, #db{db_handle = Db, _ = '_'}), - ?LOG_DEBUG([{event, db_operation}, {op, close}, {name, io_lib:format("~p", [Name])}]); + true = ets:match_delete(?MODULE, #db{db_handle = Db, _ = '_'}); {error, CloseError} -> ?LOG_ERROR([ {event, db_operation_failed}, {op, close}, {name, io_lib:format("~p", [Name])}, @@ -521,7 +518,6 @@ db_flush(#db{name = Name, db_handle = Db}) -> {reason, io_lib:format("~p", [FlushError])}]), {error, failed}; _ -> - ?LOG_DEBUG([{event, db_operation}, {op, db_flush}, {name, io_lib:format("~p", [Name])}]), ok end. @@ -541,7 +537,6 @@ wal_sync(#db{name = Name, db_handle = Db}) -> {reason, io_lib:format("~p", [SyncError])}]), {error, failed}; _ -> - ?LOG_DEBUG([{event, db_operation}, {op, wal_sync}, {name, io_lib:format("~p", [Name])}]), ok end. diff --git a/apps/arweave/src/ar_metrics.erl b/apps/arweave/src/ar_metrics.erl index a2b9459ad..87a61a049 100644 --- a/apps/arweave/src/ar_metrics.erl +++ b/apps/arweave/src/ar_metrics.erl @@ -2,7 +2,7 @@ -include_lib("arweave/include/ar.hrl"). --export([register/0, get_status_class/1]). +-export([register/0, get_status_class/1, record_rate_metric/4]). %%%=================================================================== %%% Public interface. @@ -446,63 +446,34 @@ register() -> "The packing label can be 'spora_2_5', 'spora_2_6', 'composite', " " or replica_2_9."} ]), - prometheus_gauge:new([ - {name, packing_latency_benchmark}, - {labels, [benchmark, type, packing]}, - {help, "The benchmark packing latency. The benchmark label indicates which " - "benchmark is being recorded - 'protocol' records the ?PACKING_LATENCY " - "value, and 'init' records the latency sampled at node startup. 
" - "The type label can be 'pack' or 'unpack'. The packing label can be " - "'spora_2_5', 'spora_2_6', 'composite', or 'replica_2_9'. " - "The 'packing_duration_milliseconds' metric " - "records the actual latency observed during node operation."} - ]), - prometheus_gauge:new([ - {name, packing_rate_benchmark}, - {labels, [benchmark]}, - {help, "The benchmark packing rate. The benchmark label indicates which " - "benchmark is being recorded - 'protocol' records the maximum rate allowed by " - "the protocol, 'configured' records the packing rate configured by the user. " - "The 'packing_duration_milliseconds' metric records the actual rate observed " - "during node operation."} - ]), - prometheus_gauge:new([ - {name, packing_schedulers}, - {help, "The number of schedulers available for packing."} - ]), prometheus_gauge:new([{name, packing_buffer_size}, {help, "The number of chunks in the packing server queue."}]), prometheus_gauge:new([{name, chunk_cache_size}, {help, "The number of chunks scheduled for downloading."}]), prometheus_counter:new([{name, chunks_stored}, - {labels, [packing]}, + {labels, [packing, store_id]}, {help, "The counter is incremented every time a chunk is written to " "chunk_storage."}]), - - prometheus_gauge:new([{name, sync_tasks}, {labels, [state, type, peer]}, {help, "The number of syncing tasks. 'state' can be 'queued' or 'scheduled'. " "'type' can be 'sync_range' or 'read_range'. 'peer' is the peer the task " "is intended for - for 'read_range' tasks this will be 'localhost'."}]), + %% --------------------------------------------------------------------------------------- %% Replica 2.9 metrics %% --------------------------------------------------------------------------------------- prometheus_counter:new([{name, replica_2_9_entropy_stored}, {labels, [store_id]}, {help, "The number of bytes of replica.2.9 entropy written to chunk storage."}]), - prometheus_gauge:new([{name, replica_2_9_entropy_store_rate}, - {labels, [store_id]}, - {help, "The rate at which replica.2.9 is written to chunk storage in MiB/s."}]), - prometheus_counter:new([{name, replica_2_9_entropy_cache_query}, - {labels, [hit_or_miss, partition]}, - {help, "The counter is incremented everytime an 8 MiB replica.2.9 entropy " - "is requested from the cache."}]), prometheus_histogram:new([ {name, replica_2_9_entropy_duration_milliseconds}, + {labels, [count]}, {buckets, [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 250, 500, 1000]}, - {help, "The time, in milliseconds, to generate 8 MiB of replica.2.9 entropy."} + {help, "The time, in milliseconds, to generate replica.2.9 entropy. The count label " + "indicates whether this is the time to generate a single 8 MiB entropy or " + "the time to generate all 32 entropies needed for full chunks."} ]), %% --------------------------------------------------------------------------------------- @@ -539,6 +510,20 @@ register() -> {labels, [type, instance, section, metric]}, {help, "Erlang VM memory allocator metrics. Only set when debug=true."}]). +record_rate_metric(StartTime, Bytes, Metric, Labels) -> + EndTime = erlang:monotonic_time(), + ElapsedTime = + erlang:convert_time_unit(EndTime - StartTime, + native, + microsecond), + %% bytes per second + Rate = + case ElapsedTime > 0 of + true -> 1_000_000 * Bytes / ElapsedTime; + false -> 0 + end, + prometheus_histogram:observe(Metric, Labels, Rate). + %% @doc Return the HTTP status class label for cowboy_requests_total and gun_requests_total %% metrics. 
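The new ar_metrics:record_rate_metric/4 helper computes bytes per second from a monotonic start time and observes the result on a histogram, so the metric passed in must be registered as a histogram. A hedged usage sketch follows; the call sites are not part of this hunk, and the wrapper name, metric name, and StoreID label below are hypothetical.

    %% Time WriteFun and record the achieved throughput in bytes per second.
    %% chunk_write_rate_bytes_per_second is an illustrative, pre-registered
    %% prometheus histogram, not one added by this patch.
    timed_write(WriteFun, Bytes, StoreID) ->
        StartTime = erlang:monotonic_time(),
        ok = WriteFun(),
        ar_metrics:record_rate_metric(StartTime, Bytes,
            chunk_write_rate_bytes_per_second, [StoreID]).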
diff --git a/apps/arweave/src/ar_mining_io.erl b/apps/arweave/src/ar_mining_io.erl index 14f6b34a6..6d7746e9e 100644 --- a/apps/arweave/src/ar_mining_io.erl +++ b/apps/arweave/src/ar_mining_io.erl @@ -94,7 +94,8 @@ garbage_collect() -> %%%=================================================================== init(Mode) -> - {ok, start_io_threads(#state{ mode = Mode })}. + gen_server:cast(self(), initialize_state), + {ok, #state{ mode = Mode }}. handle_call({set_largest_seen_upper_bound, PartitionUpperBound}, _From, State) -> #state{ partition_upper_bound = CurrentUpperBound } = State, @@ -125,6 +126,22 @@ handle_call(Request, _From, State) -> ?LOG_WARNING([{event, unhandled_call}, {module, ?MODULE}, {request, Request}]), {reply, ok, State}. +handle_cast(initialize_state, State) -> + State3 = case ar_device_lock:is_ready() of + false -> + ar_util:cast_after(1000, self(), initialize_state), + State; + true -> + case start_io_threads(State) of + {error, _} -> + ar_util:cast_after(1000, self(), initialize_state), + State; + State2 -> + State2 + end + end, + {noreply, State3}; + handle_cast(garbage_collect, State) -> erlang:garbage_collect(self(), [{async, {ar_mining_io, self(), erlang:monotonic_time()}}]), @@ -175,51 +192,41 @@ terminate(_Reason, _State) -> %%% Private functions. %%%=================================================================== -get_system_device(StorageModule) -> - {ok, Config} = application:get_env(arweave, config), - StoreID = ar_storage_module:id(StorageModule), - Path = ar_chunk_storage:get_chunk_storage_path(Config#config.data_dir, StoreID), - Command = "df -P " ++ Path ++ " | awk 'NR==2 {print $1}'", - Device = os:cmd(Command), - TrimmedDevice = string:trim(Device), - case TrimmedDevice of - "" -> StoreID; % If the command fails or returns an empty string, return StoreID - _ -> TrimmedDevice - end. 
- start_io_threads(State) -> #state{ mode = Mode } = State, % Step 1: Group StoreIDs by their system device - DeviceToStoreIDs = map_device_to_store_ids(), - - % Step 2: Start IO threads for each device and populate map indices - maps:fold( - fun(Device, StoreIDs, StateAcc) -> - #state{ io_threads = Threads, io_thread_monitor_refs = Refs, - store_id_to_device = StoreIDToDevice, - partition_to_store_ids = PartitionToStoreIDs } = StateAcc, - - Thread = start_io_thread(Mode, StoreIDs), - ThreadRef = monitor(process, Thread), - - StoreIDToDevice2 = lists:foldl( - fun(StoreID, Acc) -> - maps:put(StoreID, Device, Acc) + case ar_device_lock:get_store_id_to_device_map() of + {error, Reason} -> + ?LOG_ERROR([{event, error_initializing_state}, {module, ?MODULE}, + {reason, io_lib:format("~p", [Reason])}]), + {error, Reason}; + StoreIDToDevice -> + DeviceToStoreIDs = ar_util:invert_map(StoreIDToDevice), + % Step 2: Start IO threads for each device and populate map indices + State2 = maps:fold( + fun(Device, StoreIDs, StateAcc) -> + #state{ io_threads = Threads, io_thread_monitor_refs = Refs, + partition_to_store_ids = PartitionToStoreIDs } = StateAcc, + + StoreIDs2 = sets:to_list(StoreIDs), + + Thread = start_io_thread(Mode, StoreIDs2), + ThreadRef = monitor(process, Thread), + + PartitionToStoreIDs2 = map_partition_to_store_ids(StoreIDs2, PartitionToStoreIDs), + StateAcc#state{ + io_threads = maps:put(Device, Thread, Threads), + io_thread_monitor_refs = maps:put(ThreadRef, Device, Refs), + partition_to_store_ids = PartitionToStoreIDs2 + } end, - StoreIDToDevice, StoreIDs), - - PartitionToStoreIDs2 = map_partition_to_store_ids(StoreIDs, PartitionToStoreIDs), - StateAcc#state{ - io_threads = maps:put(Device, Thread, Threads), - io_thread_monitor_refs = maps:put(ThreadRef, Device, Refs), - store_id_to_device = StoreIDToDevice2, - partition_to_store_ids = PartitionToStoreIDs2 - } - end, - State, - DeviceToStoreIDs - ). + State, + DeviceToStoreIDs + ), + + State2#state{ store_id_to_device = StoreIDToDevice } + end. start_io_thread(Mode, StoreIDs) -> Now = os:system_time(millisecond), @@ -233,41 +240,23 @@ start_io_thread(Mode, StoreIDs) -> map_partition_to_store_ids([], PartitionToStoreIDs) -> PartitionToStoreIDs; map_partition_to_store_ids([StoreID | StoreIDs], PartitionToStoreIDs) -> - StorageModule = ar_storage_module:get_by_id(StoreID), - {Start, End} = ar_storage_module:module_range(StorageModule, 0), - Partitions = get_store_id_partitions({Start, End}, []), - PartitionToStoreIDs2 = lists:foldl( - fun(Partition, Acc) -> - maps:update_with(Partition, - fun(PartitionStoreIDs) -> [StoreID | PartitionStoreIDs] end, - [StoreID], Acc) - end, - PartitionToStoreIDs, Partitions), - map_partition_to_store_ids(StoreIDs, PartitionToStoreIDs2). - -map_device_to_store_ids() -> - {ok, Config} = application:get_env(arweave, config), - lists:foldl( - fun(Module, Acc) -> - StoreID = ar_storage_module:id(Module), - Device = get_system_device(Module), - maps:update_with(Device, fun(StoreIDs) -> [StoreID | StoreIDs] end, [StoreID], Acc) - end, - #{}, - Config#config.storage_modules - ). - -get_store_ids_for_device(Device, #state{store_id_to_device = StoreIDToDevice}) -> - maps:fold( - fun(StoreID, MappedDevice, Acc) -> - case MappedDevice == Device of - true -> [StoreID | Acc]; - false -> Acc - end - end, - [], - StoreIDToDevice - ). + case ar_storage_module:get_by_id(StoreID) of + not_found -> + %% Occasionally happens in tests. 
+ ?LOG_ERROR([{event, mining_storage_module_not_found}, {store_id, StoreID}]), + map_partition_to_store_ids(StoreIDs, PartitionToStoreIDs); + StorageModule -> + {Start, End} = ar_storage_module:module_range(StorageModule, 0), + Partitions = get_store_id_partitions({Start, End}, []), + PartitionToStoreIDs2 = lists:foldl( + fun(Partition, Acc) -> + maps:update_with(Partition, + fun(PartitionStoreIDs) -> [StoreID | PartitionStoreIDs] end, + [StoreID], Acc) + end, + PartitionToStoreIDs, Partitions), + map_partition_to_store_ids(StoreIDs, PartitionToStoreIDs2) + end. get_store_id_partitions({Start, End}, Partitions) when Start >= End -> Partitions; @@ -287,15 +276,17 @@ open_files(StoreIDs) -> end, StoreIDs). -handle_io_thread_down(Ref, Reason, - #state{ mode = Mode, io_threads = Threads, io_thread_monitor_refs = Refs } = State) -> +handle_io_thread_down(Ref, Reason, State) -> + #state{ mode = Mode, io_threads = Threads, io_thread_monitor_refs = Refs, + store_id_to_device = StoreIDToDevice } = State, ?LOG_WARNING([{event, mining_io_thread_down}, {reason, io_lib:format("~p", [Reason])}]), Device = maps:get(Ref, Refs), Refs2 = maps:remove(Ref, Refs), Threads2 = maps:remove(Device, Threads), - StoreIDs = get_store_ids_for_device(Device, State), - Thread = start_io_thread(Mode, StoreIDs), + DeviceToStoreIDs = ar_util:invert_map(StoreIDToDevice), + StoreIDs = maps:get(Device, DeviceToStoreIDs, sets:new()), + Thread = start_io_thread(Mode, sets:to_list(StoreIDs)), ThreadRef = monitor(process, Thread), State#state{ io_threads = maps:put(Device, Thread, Threads2), io_thread_monitor_refs = maps:put(ThreadRef, Device, Refs2) }. diff --git a/apps/arweave/src/ar_mining_server.erl b/apps/arweave/src/ar_mining_server.erl index 1394af3cc..2f470f222 100644 --- a/apps/arweave/src/ar_mining_server.erl +++ b/apps/arweave/src/ar_mining_server.erl @@ -106,7 +106,7 @@ is_one_chunk_solution(Solution) -> Ret :: ok. -ifdef(AR_TEST). -log_prepare_solution_failure(Solution, stale_step_number, AdditionalLogData) -> +log_prepare_solution_failure(_Solution, stale_step_number, _AdditionalLogData) -> ok; log_prepare_solution_failure(Solution, FailureReason, AdditionalLogData) -> log_prepare_solution_failure2(Solution, FailureReason, AdditionalLogData). @@ -527,22 +527,18 @@ distribute_output(Candidate, State) -> distribute_output(ar_mining_io:get_partitions(), Candidate, State). distribute_output([], _Candidate, _State) -> - ?LOG_DEBUG([{event, distribute_output_done}]), ok; distribute_output([{_Partition, _MiningAddress, PackingDifficulty} | _Partitions], _Candidate, #state{ allow_composite_packing = false }) when PackingDifficulty >= 1 -> %% Do not mine with the composite packing until some time after the fork 2.8. - ?LOG_DEBUG([{event, distribute_output_skipping_composite_packing}]), ok; distribute_output([{_Partition, _MiningAddress, PackingDifficulty} | _Partitions], _Candidate, #state{ allow_replica_2_9_mining = false }) when PackingDifficulty == ?REPLICA_2_9_PACKING_DIFFICULTY -> %% Do not mine with replica_2_9 until some time after the fork 2.9. 
- ?LOG_DEBUG([{event, distribute_output_skipping_replica_2_9_mining}]), ok; distribute_output([{Partition, MiningAddress, PackingDifficulty} | Partitions], Candidate, State) -> - ?LOG_DEBUG([{event, distribute_output}, {partition, Partition}]), case get_worker({Partition, PackingDifficulty}, State) of not_found -> ?LOG_ERROR([{event, worker_not_found}, {partition, Partition}]), @@ -693,7 +689,9 @@ prepare_solution(steps, Candidate, Solution) -> {start_step_number, PrevStepNumber}, {next_step_number, StepNumber}, {next_seed, ar_util:safe_encode(PrevNextSeed)}, - {next_vdf_difficulty, PrevNextVDFDifficulty}], + {next_vdf_difficulty, PrevNextVDFDifficulty}, + {h1, ar_util:safe_encode(Candidate#mining_candidate.h1)}, + {h2, ar_util:safe_encode(Candidate#mining_candidate.h2)}], ?LOG_INFO([{event, found_solution_but_failed_to_find_checkpoints} | LogData]), may_be_leave_it_to_exit_peer( @@ -950,7 +948,7 @@ post_solution(not_set, Solution, State) -> ar:console("WARNING: the solution we found is invalid. Check logs for more " "details~n"); {true, PoACache, PoA2Cache} -> - ar_events:send(miner, {found_solution, miner, Solution, PoACache, PoA2Cache}) + ar_node_worker:found_solution(miner, Solution, PoACache, PoA2Cache) end; post_solution(ExitPeer, Solution, #state{ is_pool_client = true }) -> case ar_http_iface_client:post_partial_solution(ExitPeer, Solution) of @@ -1077,6 +1075,8 @@ read_poa(RecallByte, ChunkOrSubChunk, Packing, Nonce) -> true -> {ok, PoA#poa{ chunk = ChunkOrSubChunk }}; false -> + dump_invalid_solution_data({sub_chunk_mismatch, RecallByte, + ChunkOrSubChunk, PoA, Packing, PoAReply, Nonce}), {error, sub_chunk_mismatch}; Error2 -> Error2 @@ -1086,6 +1086,8 @@ read_poa(RecallByte, ChunkOrSubChunk, Packing, Nonce) -> true -> {ok, PoA#poa{ chunk = ChunkOrSubChunk }}; false -> + dump_invalid_solution_data({sub_chunk_mismatch, RecallByte, + ChunkOrSubChunk, PoA, Packing, PoAReply, Nonce}), {error, sub_chunk_mismatch}; Error2 -> Error2 @@ -1094,12 +1096,20 @@ read_poa(RecallByte, ChunkOrSubChunk, Packing, Nonce) -> {ok, PoA}; {_ChunkOrSubChunk, {ok, #poa{ chunk = ChunkOrSubChunk } = PoA}, _Packing} -> {ok, PoA}; - {_ChunkOrSubChunk, {ok, #poa{}}, _Packing} -> + {_ChunkOrSubChunk, {ok, #poa{} = PoA}, _Packing} -> + dump_invalid_solution_data({chunk_mismatch, RecallByte, + ChunkOrSubChunk, PoA, Packing, PoAReply, Nonce}), {error, chunk_mismatch}; {_ChunkOrSubChunk, Error, _Packing} -> Error end. +dump_invalid_solution_data(Data) -> + {ok, Config} = application:get_env(arweave, config), + ID = binary_to_list(ar_util:encode(crypto:strong_rand_bytes(16))), + File = filename:join(Config#config.data_dir, "invalid_solution_data_dump_" ++ ID), + file:write_file(File, term_to_binary(Data)). 
+ get_sub_chunk(Chunk, 0, _Nonce) -> Chunk; get_sub_chunk(Chunk, PackingDifficulty, Nonce) -> diff --git a/apps/arweave/src/ar_mining_worker.erl b/apps/arweave/src/ar_mining_worker.erl index b5476980e..9c42513dd 100644 --- a/apps/arweave/src/ar_mining_worker.erl +++ b/apps/arweave/src/ar_mining_worker.erl @@ -461,6 +461,7 @@ handle_task({computed_h1, Candidate, _ExtraArgs}, State) -> {step, Candidate#mining_candidate.step_number}, {worker, State2#state.name}, {h1, ar_util:encode(H1)}, + {p1, Candidate#mining_candidate.partition_number}, {difficulty, get_difficulty(State2, Candidate)}]), ar_mining_stats:h1_solution(), %% Decrement 1 for chunk1: @@ -524,6 +525,8 @@ handle_task({computed_h2, Candidate, _ExtraArgs}, State) -> {worker, State#state.name}, {step, Candidate#mining_candidate.step_number}, {h2, ar_util:encode(H2)}, + {p1, Candidate#mining_candidate.partition_number}, + {p2, Candidate#mining_candidate.partition_number2}, {difficulty, get_difficulty(State2, Candidate)}, {partial_difficulty, get_partial_difficulty(State2, Candidate)}]), ar_mining_stats:h2_solution(); @@ -532,6 +535,8 @@ handle_task({computed_h2, Candidate, _ExtraArgs}, State) -> {worker, State2#state.name}, {step, Candidate#mining_candidate.step_number}, {h2, ar_util:encode(H2)}, + {p1, Candidate#mining_candidate.partition_number}, + {p2, Candidate#mining_candidate.partition_number2}, {partial_difficulty, get_partial_difficulty(State2, Candidate)}]) end, case {PassesDiffChecks, Peer} of diff --git a/apps/arweave/src/ar_node_worker.erl b/apps/arweave/src/ar_node_worker.erl index 21d2890f3..f129d7ef8 100644 --- a/apps/arweave/src/ar_node_worker.erl +++ b/apps/arweave/src/ar_node_worker.erl @@ -9,7 +9,7 @@ -module(ar_node_worker). -export([start_link/0, calculate_delay/1, is_mempool_or_block_cache_tx/1, - tx_id_prefix/1]). + tx_id_prefix/1, found_solution/4]). -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). -export([set_reward_addr/1]). @@ -79,6 +79,9 @@ is_mempool_or_block_cache_tx(TXID) -> set_reward_addr(Addr) -> gen_server:call(?MODULE, {set_reward_addr, Addr}). +found_solution(Source, Solution, PoACache, PoA2Cache) -> + gen_server:cast(?MODULE, {found_solution, Source, Solution, PoACache, PoA2Cache}). + %%%=================================================================== %%% Generic server callbacks. %%%=================================================================== @@ -86,7 +89,7 @@ set_reward_addr(Addr) -> init([]) -> %% Trap exit to avoid corrupting any open files on quit. process_flag(trap_exit, true), - [ok, ok, ok, ok, ok] = ar_events:subscribe([tx, block, nonce_limiter, miner, node_state]), + [ok, ok, ok, ok] = ar_events:subscribe([tx, block, nonce_limiter, node_state]), %% Read persisted mempool. ar_mempool:load_from_disk(), %% Join the network. @@ -318,6 +321,16 @@ calculate_delay(Bytes) -> handle_call({set_reward_addr, Addr}, _From, State) -> {reply, ok, State#{ reward_addr => Addr }}. 
+ +handle_cast({found_solution, miner, _Solution, _PoACache, _PoA2Cache}, + #{ automine := false, miner_2_6 := undefined } = State) -> + {noreply, State}; +handle_cast({found_solution, Source, Solution, PoACache, PoA2Cache}, State) -> + [{_, PrevH}] = ets:lookup(node_state, current), + PrevB = ar_block_cache:get(block_cache, PrevH), + handle_found_solution({Source, Solution, PoACache, PoA2Cache}, PrevB, State); + + handle_cast(process_task_queue, #{ task_queue := TaskQueue } = State) -> RunTask = case gb_sets:is_empty(TaskQueue) of @@ -475,17 +488,6 @@ handle_info({event, nonce_limiter, {refuse_validation, H}}, State) -> handle_info({event, nonce_limiter, _}, State) -> {noreply, State}; -handle_info({event, miner, {found_solution, miner, _Solution, _PoACache, _PoA2Cache}}, - #{ automine := false, miner_2_6 := undefined } = State) -> - {noreply, State}; -handle_info({event, miner, {found_solution, Source, Solution, PoACache, PoA2Cache}}, State) -> - [{_, PrevH}] = ets:lookup(node_state, current), - PrevB = ar_block_cache:get(block_cache, PrevH), - handle_found_solution({Source, Solution, PoACache, PoA2Cache}, PrevB, State); - -handle_info({event, miner, _}, State) -> - {noreply, State}; - handle_info({tx_ready_for_mining, TX}, State) -> ar_mempool:add_tx(TX, ready_for_mining), ar_events:send(tx, {ready_for_mining, TX}), @@ -1872,6 +1874,7 @@ handle_found_solution(Args, PrevB, State) -> packing_difficulty = PackingDifficulty, replica_format = ReplicaFormat } = Solution, + ?LOG_INFO([{event, handle_found_solution}, {solution, ar_util:encode(SolutionH)}]), MerkleRebaseThreshold = ?MERKLE_REBASE_SUPPORT_THRESHOLD, #block{ indep_hash = PrevH, timestamp = PrevTimestamp, @@ -1952,7 +1955,14 @@ handle_found_solution(Args, PrevB, State) -> false -> ar_events:send(solution, {stale, #{ source => Source }}), ar_mining_server:log_prepare_solution_failure(Solution, - vdf_seed_data_does_not_match_current_block, []), + vdf_seed_data_does_not_match_current_block, [ + {interval_number, IntervalNumber}, + {prev_interval_number, PrevIntervalNumber}, + {nonce_limiter_next_seed, ar_util:encode(NonceLimiterNextSeed)}, + {prev_nonce_limiter_next_seed, ar_util:encode(PrevNextSeed)}, + {nonce_limiter_next_vdf_difficulty, NonceLimiterNextVDFDifficulty}, + {prev_nonce_limiter_next_vdf_difficulty, PrevNextVDFDifficulty} + ]), {false, seed_data}; true -> true diff --git a/apps/arweave/src/ar_packing_server.erl b/apps/arweave/src/ar_packing_server.erl index 6d2e784c2..7b774fdc4 100644 --- a/apps/arweave/src/ar_packing_server.erl +++ b/apps/arweave/src/ar_packing_server.erl @@ -7,8 +7,8 @@ pack/4, unpack/5, repack/6, unpack_sub_chunk/5, is_buffer_full/0, record_buffer_size_metric/0, pad_chunk/1, unpad_chunk/3, unpad_chunk/4, - encipher_replica_2_9_chunk/2, get_replica_2_9_entropy/3, - pack_replica_2_9_chunk/3]). + encipher_replica_2_9_chunk/2, generate_replica_2_9_entropy/3, + pack_replica_2_9_chunk/3, request_entropy_generation/3]). -export([init/1, handle_cast/2, handle_call/3, handle_info/2, terminate/2]). @@ -21,9 +21,6 @@ -include_lib("eunit/include/eunit.hrl"). -%% The packing latency as it is chosen for the protocol. --define(PACKING_LATENCY_MS, 60). - -record(state, { workers, num_workers @@ -54,6 +51,9 @@ request_repack(Ref, Args) -> request_repack(Ref, ReplyTo, Args) -> gen_server:cast(?MODULE, {repack_request, ReplyTo, Ref, Args}). +request_entropy_generation(Ref, ReplyTo, Args) -> + gen_server:cast(?MODULE, {generate_entropy, ReplyTo, Ref, Args}). + %% @doc Pack the chunk for mining. 
Packing ensures every mined chunk of data is globally %% unique and cannot be easily inferred during mining from any metadata stored in RAM. pack(Packing, ChunkOffset, TXRoot, Chunk) -> @@ -183,6 +183,9 @@ unpad_chunk(Unpacked, ChunkSize, PackedSize) -> _ -> case is_zero(Padding) of false -> + ?LOG_WARNING([{event, unpad_chunk_error}, {packed_size, PackedSize}, + {chunk_size, ChunkSize}, {padding, binary_part(Padding, 0, 64)}, + {unpacked, binary_part(Unpacked, 0, 64)}]), error; true -> binary:part(Unpacked, 0, ChunkSize) @@ -220,42 +223,25 @@ get_randomx_state_for_h0(PackingDifficulty, PackingState) -> encipher_replica_2_9_chunk(Chunk, Entropy) -> iolist_to_binary(encipher_replica_2_9_sub_chunks(Chunk, Entropy)). -%% @doc Generate or take from the cache the 2.9 entropy. If new entropy is generated, -%% cache it. --spec get_replica_2_9_entropy( +%% @doc Generate the 2.9 entropy. +-spec generate_replica_2_9_entropy( RewardAddr :: binary(), AbsoluteEndOffset :: non_neg_integer(), SubChunkStartOffset :: non_neg_integer() ) -> binary(). -get_replica_2_9_entropy(RewardAddr, AbsoluteEndOffset, SubChunkStartOffset) -> - Partition = ar_node:get_partition_number(AbsoluteEndOffset), - +generate_replica_2_9_entropy(RewardAddr, AbsoluteEndOffset, SubChunkStartOffset) -> Key = ar_replica_2_9:get_entropy_key(RewardAddr, AbsoluteEndOffset, SubChunkStartOffset), PackingState = get_packing_state(), RandomXState = get_randomx_state_by_packing({replica_2_9, RewardAddr}, PackingState), - case ar_shared_entropy_cache:get(Key) of - not_found -> - prometheus_counter:inc(replica_2_9_entropy_cache_query, [miss, Partition]), - - {ok, Config} = application:get_env(arweave, config), - MaxCacheSize = Config#config.replica_2_9_entropy_cache_size, - ar_shared_entropy_cache:allocate_space(?REPLICA_2_9_ENTROPY_SIZE, MaxCacheSize), - Entropy = prometheus_histogram:observe_duration( - replica_2_9_entropy_duration_milliseconds, [], - fun() -> - ar_mine_randomx:randomx_generate_replica_2_9_entropy(RandomXState, Key) - end), - %% Primarily needed for testing where the entropy generated exceeds the entropy - %% needed for tests. - Entropy2 = binary_part(Entropy, 0, ?REPLICA_2_9_ENTROPY_SIZE), - ar_shared_entropy_cache:put(Key, Entropy2, ?REPLICA_2_9_ENTROPY_SIZE), - Entropy2; - {ok, Entropy} -> - prometheus_counter:inc(replica_2_9_entropy_cache_query, [hit, Partition]), - - Entropy - end. + Entropy = prometheus_histogram:observe_duration( + replica_2_9_entropy_duration_milliseconds, [1], + fun() -> + ar_mine_randomx:randomx_generate_replica_2_9_entropy(RandomXState, Key) + end), + %% Primarily needed for testing where the entropy generated exceeds the entropy + %% needed for tests. + binary_part(Entropy, 0, ?REPLICA_2_9_ENTROPY_SIZE). %% @doc Pad (to ?DATA_CHUNK_SIZE) and pack the chunk according to the 2.9 replication format. %% Return the chunk and the combined entropy used on that chunk. 
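generate_replica_2_9_entropy/3 above (the renamed get_replica_2_9_entropy/3) now always generates fresh entropy, since the shared cache is dropped elsewhere in this change set, and truncates the RandomX output to ?REPLICA_2_9_ENTROPY_SIZE. A minimal caller sketch, assuming ar_consensus.hrl is included; the wrapper name is illustrative.

    %% Generate the 2.9 entropy for the first sub-chunk of the chunk ending at
    %% AbsoluteEndOffset and sanity check its size against the spec above.
    entropy_for(RewardAddr, AbsoluteEndOffset) ->
        Entropy = ar_packing_server:generate_replica_2_9_entropy(
            RewardAddr, AbsoluteEndOffset, 0),
        true = byte_size(Entropy) == ?REPLICA_2_9_ENTROPY_SIZE,
        Entropy.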
@@ -290,42 +276,12 @@ init([]) -> H1String = io_lib:format("~.3f", [H1 / 1000]), ar:console("Hashing benchmark~nH0: ~s ms~nH1/H2: ~s ms~n", [H0String, H1String]), ?LOG_INFO([{event, hash_benchmark}, {h0_ms, H0String}, {h1_ms, H1String}]), - Schedulers = erlang:system_info(dirty_cpu_schedulers_online), - {ActualRatePack2_6, ActualRatePackComposite} = get_packing_latency(PackingState), - PackingLatency = ActualRatePackComposite, - MaxRate = Schedulers * 1000 / PackingLatency, - TheoreticalMaxRate = Schedulers * 1000 / (?PACKING_LATENCY_MS), - {PackingRate, SchedulersRequired} = - case Config#config.packing_rate of - undefined -> - ChosenRate = max(1, ceil(2 * MaxRate / 3)), - ChosenRate2 = ar_util:ceil_int(ChosenRate, 10), - log_packing_rate(ChosenRate2, MaxRate), - SchedulersRequired2 = ceil(ChosenRate2 / (1000 / (?PACKING_LATENCY_MS))), - {ChosenRate2, SchedulersRequired2}; - ConfiguredRate -> - SchedulersRequired2 = ceil(ConfiguredRate / (1000 / PackingLatency)), - case SchedulersRequired2 > Schedulers of - true -> - log_insufficient_core_count(Schedulers, ConfiguredRate, MaxRate); - false -> - log_packing_rate(ConfiguredRate, MaxRate) - end, - {ConfiguredRate, SchedulersRequired2} - end, - - record_packing_benchmarks(TheoreticalMaxRate, PackingRate, Schedulers, - ActualRatePack2_6, ActualRatePackComposite), - SpawnSchedulers = min(SchedulersRequired, Schedulers), - ar:console("~nStarting ~B packing threads.~n", [SpawnSchedulers]), - %% Since the total rate of spawned processes might exceed the desired rate, - %% artificially throttle processes uniformly. - ThrottleDelay = calculate_throttle_delay(SpawnSchedulers, PackingRate), + NumWorkers = Config#config.packing_workers, + ar:console("~nStarting ~B packing threads.~n", [NumWorkers]), + ?LOG_INFO([{event, starting_packing_threads}, {num_threads, NumWorkers}]), Workers = queue:from_list( - [spawn_link(fun() -> worker(ThrottleDelay, PackingState) end) - || _ <- lists:seq(1, SpawnSchedulers)]), + [spawn_link(fun() -> worker(PackingState) end) || _ <- lists:seq(1, NumWorkers)]), ets:insert(?MODULE, {buffer_size, 0}), - {ok, Config} = application:get_env(arweave, config), MaxSize = case Config#config.packing_cache_size_limit of undefined -> @@ -341,7 +297,7 @@ init([]) -> ets:insert(?MODULE, {buffer_size_limit, MaxSize}), timer:apply_interval(200, ?MODULE, record_buffer_size_metric, []), {ok, #state{ - workers = Workers, num_workers = SpawnSchedulers }}. + workers = Workers, num_workers = NumWorkers }}. handle_call(Request, _From, State) -> ?LOG_WARNING([{event, unhandled_call}, {module, ?MODULE}, {request, Request}]), @@ -384,6 +340,11 @@ handle_cast({repack_request, From, Ref, Args}, State) -> }, {noreply, State#state{ workers = queue:in(Worker, Workers2) }} end; +handle_cast({generate_entropy, From, Ref, Args}, State) -> + #state{ workers = Workers } = State, + {{value, Worker}, Workers2} = queue:out(Workers), + Worker ! {generate_entropy, Ref, From, Args}, + {noreply, State#state{ workers = queue:in(Worker, Workers2) }}; handle_cast(Cast, State) -> ?LOG_WARNING([{event, unhandled_cast}, {module, ?MODULE}, {cast, Cast}]), {noreply, State}. @@ -418,53 +379,21 @@ get_randomx_state_by_packing({spora_2_6, _}, {RandomXState, _, _}) -> get_randomx_state_by_packing(spora_2_5, {RandomXState, _, _}) -> RandomXState. -log_insufficient_core_count(Schedulers, PackingRate, Max) -> - ar:console("~nThe number of cores on your machine (~B) is not sufficient for " - "packing ~B chunks per second. 
Estimated maximum rate: ~.2f chunks/s.~n", - [Schedulers, PackingRate, Max]), - ?LOG_WARNING([{event, insufficient_core_count_to_sustain_desired_packing_rate}, - {cores, Schedulers}, {packing_rate, PackingRate}]). - -log_packing_rate(PackingRate, Max) -> - ar:console("~nThe node is configured to pack around ~B chunks per second. " - "To increase the packing rate, start with `packing_rate [number]`. " - "Estimated maximum rate: ~.2f chunks/s.~n", - [PackingRate, Max]). - -calculate_throttle_delay(0, _PackingRate) -> - 0; -calculate_throttle_delay(_SpawnSchedulers, 0) -> - 0; -calculate_throttle_delay(SpawnSchedulers, PackingRate) -> - Load = PackingRate / (SpawnSchedulers * (1000 / (?PACKING_LATENCY_MS))), - case Load >= 1 of - true -> - 0; - false -> - trunc((1 - Load) * (?PACKING_LATENCY_MS)) - end. - -worker(ThrottleDelay, PackingState) -> +worker(PackingState) -> receive {unpack, Ref, From, Args} -> {Packing, Chunk, AbsoluteOffset, TXRoot, ChunkSize} = Args, case unpack(Packing, AbsoluteOffset, TXRoot, Chunk, ChunkSize, PackingState, internal) of - {ok, U, AlreadyUnpacked} -> + {ok, U, _AlreadyUnpacked} -> From ! {chunk, {unpacked, Ref, {Packing, U, AbsoluteOffset, TXRoot, - ChunkSize}}}, - case AlreadyUnpacked of - already_unpacked -> - ok; - _ -> - timer:sleep(ThrottleDelay) - end; + ChunkSize}}}; {error, invalid_packed_size} -> - ?LOG_WARNING([{event, got_packed_chunk_of_invalid_size}]); + ?LOG_WARNING([{event, got_unpacked_chunk_of_invalid_size}]); {error, invalid_chunk_size} -> - ?LOG_WARNING([{event, got_packed_chunk_with_invalid_chunk_size}]); + ?LOG_WARNING([{event, got_unpacked_chunk_with_invalid_chunk_size}]); {error, invalid_padding} -> - ?LOG_WARNING([{event, got_packed_chunk_with_invalid_padding}, + ?LOG_WARNING([{event, got_unpacked_chunk_with_invalid_padding}, {absolute_end_offset, AbsoluteOffset}]); {exception, Error} -> ?LOG_ERROR([{event, failed_to_unpack_chunk}, @@ -472,19 +401,13 @@ worker(ThrottleDelay, PackingState) -> {error, io_lib:format("~p", [Error])}]) end, decrement_buffer_size(), - worker(ThrottleDelay, PackingState); + worker(PackingState); {pack, Ref, From, Args} -> {Packing, Chunk, AbsoluteOffset, TXRoot, ChunkSize} = Args, case pack(Packing, AbsoluteOffset, TXRoot, Chunk, PackingState, internal) of - {ok, Packed, AlreadyPacked} -> + {ok, Packed, _AlreadyPacked} -> From ! {chunk, {packed, Ref, {Packing, Packed, AbsoluteOffset, TXRoot, - ChunkSize}}}, - case AlreadyPacked of - already_packed -> - ok; - _ -> - timer:sleep(ThrottleDelay) - end; + ChunkSize}}}; {error, invalid_unpacked_size} -> ?LOG_WARNING([{event, got_unpacked_chunk_of_invalid_size}]); {exception, Error} -> @@ -493,23 +416,14 @@ worker(ThrottleDelay, PackingState) -> {error, io_lib:format("~p", [Error])}]) end, decrement_buffer_size(), - worker(ThrottleDelay, PackingState); + worker(PackingState); {repack, Ref, From, Args} -> {RequestedPacking, Packing, Chunk, AbsoluteOffset, TXRoot, ChunkSize} = Args, case repack(RequestedPacking, Packing, AbsoluteOffset, TXRoot, Chunk, ChunkSize, PackingState, internal) of {ok, Packed, _RepackInput} -> From ! {chunk, {packed, Ref, - {RequestedPacking, Packed, AbsoluteOffset, TXRoot, ChunkSize}}}, - case RequestedPacking == Packing of - true -> - %% When RequestdPacking and Packing are the same - %% the repack does no work and just returns - %% the original chunk. In this case we don't need a throttle. 
- ok; - _ -> - timer:sleep(ThrottleDelay) - end; + {RequestedPacking, Packed, AbsoluteOffset, TXRoot, ChunkSize}}}; {error, invalid_packed_size} -> ?LOG_WARNING([{event, got_packed_chunk_of_invalid_size}]); {error, invalid_chunk_size} -> @@ -525,7 +439,11 @@ worker(ThrottleDelay, PackingState) -> {error, io_lib:format("~p", [Error])}]) end, decrement_buffer_size(), - worker(ThrottleDelay, PackingState) + worker(PackingState); + {generate_entropy, Ref, From, {RewardAddr, PaddedEndOffset, SubChunkStart}} -> + Entropy = ar_packing_server:generate_replica_2_9_entropy(RewardAddr, PaddedEndOffset, SubChunkStart), + From ! {entropy_generated, Ref, Entropy}, + worker(PackingState) end. chunk_key(spora_2_5, ChunkOffset, TXRoot) -> @@ -576,7 +494,7 @@ pack({replica_2_9, RewardAddr} = Packing, AbsoluteEndOffset, _TXRoot, Chunk, Pac SubChunks = get_sub_chunks(PaddedChunk), case pack_replica_2_9_sub_chunks(RewardAddr, AbsoluteEndOffset, RandomXState, SubChunks) of - {ok, Packed} -> + {ok, Packed, _Entropy} -> {ok, Packed, was_not_already_packed}; Error -> Error @@ -616,7 +534,7 @@ pack_replica_2_9_sub_chunks(_RewardAddr, _AbsoluteEndOffset, _RandomXState, pack_replica_2_9_sub_chunks(RewardAddr, AbsoluteEndOffset, RandomXState, SubChunkStartOffset, [SubChunk | SubChunks], PackedSubChunks, EntropyParts) -> EntropySubChunkIndex = ar_replica_2_9:get_slice_index(AbsoluteEndOffset), - Entropy = get_replica_2_9_entropy(RewardAddr, AbsoluteEndOffset, SubChunkStartOffset), + Entropy = generate_replica_2_9_entropy(RewardAddr, AbsoluteEndOffset, SubChunkStartOffset), case prometheus_histogram:observe_duration(packing_duration_milliseconds, [pack_sub_chunk, replica_2_9, internal], fun() -> ar_mine_randomx:randomx_encrypt_replica_2_9_sub_chunk({RandomXState, @@ -634,15 +552,15 @@ pack_replica_2_9_sub_chunks(RewardAddr, AbsoluteEndOffset, RandomXState, end. unpack_replica_2_9_sub_chunks(RewardAddr, AbsoluteEndOffset, RandomXState, SubChunks) -> - unpack_replica_2_9_sub_chunks(RewardAddr, AbsoluteEndOffset, RandomXState, 0, SubChunks, []). + unpack_replica_2_9_sub_chunks( + RewardAddr, AbsoluteEndOffset, RandomXState, 0, SubChunks, []). unpack_replica_2_9_sub_chunks(_RewardAddr, _AbsoluteEndOffset, _RandomXState, _SubChunkStartOffset, [], UnpackedSubChunks) -> {ok, iolist_to_binary(lists:reverse(UnpackedSubChunks))}; unpack_replica_2_9_sub_chunks(RewardAddr, AbsoluteEndOffset, RandomXState, SubChunkStartOffset, [SubChunk | SubChunks], UnpackedSubChunks) -> - Key = ar_replica_2_9:get_entropy_key(RewardAddr, - AbsoluteEndOffset, SubChunkStartOffset), + Key = ar_replica_2_9:get_entropy_key(RewardAddr, AbsoluteEndOffset, SubChunkStartOffset), EntropySubChunkIndex = ar_replica_2_9:get_slice_index(AbsoluteEndOffset), case prometheus_histogram:observe_duration(packing_duration_milliseconds, [unpack_sub_chunk, replica_2_9, internal], fun() -> @@ -866,49 +784,6 @@ record_buffer_size_metric() -> ok end. -get_packing_latency(PackingState) -> - Chunk = crypto:strong_rand_bytes(?DATA_CHUNK_SIZE), - Key = crypto:hash(sha256, crypto:strong_rand_bytes(256)), - Addr = crypto:strong_rand_bytes(32), - Spora2_6Packing = {spora_2_6, Addr}, - CompositePacking = {composite, Addr, 1}, - Spora2_6RandomXState = get_randomx_state_by_packing(Spora2_6Packing, PackingState), - CompositeRandomXState = get_randomx_state_by_packing(CompositePacking, PackingState), - %% Run each randomx routine Repetitions times and return the minimum runtime. 
We use - %% minimum rather than average since it more closely approximates the fastest that this - %% machine can do the calculation. - Repetitions = 5, - {minimum_run_time(ar_mine_randomx, randomx_encrypt_chunk, - [Spora2_6Packing, Spora2_6RandomXState, Key, Chunk], Repetitions), - minimum_run_time(ar_mine_randomx, randomx_encrypt_chunk, - [CompositePacking, CompositeRandomXState, Key, Chunk], Repetitions)}. - -record_packing_benchmarks(TheoreticalMaxRate, ChosenRate, Schedulers, - ActualRatePack2_6, ActualRatePackComposite) -> - prometheus_gauge:set(packing_latency_benchmark, - [protocol, pack, spora_2_6], ?PACKING_LATENCY_MS), - prometheus_gauge:set(packing_latency_benchmark, - [protocol, unpack, spora_2_6], ?PACKING_LATENCY_MS), - prometheus_gauge:set(packing_rate_benchmark, - [protocol], TheoreticalMaxRate), - prometheus_gauge:set(packing_rate_benchmark, - [configured], ChosenRate), - prometheus_gauge:set(packing_schedulers, - Schedulers), - prometheus_gauge:set(packing_latency_benchmark, - [init, pack, spora_2_6], ActualRatePack2_6), - prometheus_gauge:set(packing_latency_benchmark, - [init, pack, composite], ActualRatePackComposite). - -minimum_run_time(Module, Function, Args, Repetitions) -> - minimum_run_time(Module, Function, Args, Repetitions, infinity). -minimum_run_time(_Module, _Function, _Args, 0, MinTime) -> - %% round microseconds to the nearest millisecond - max(1, (MinTime + 500) div 1000); -minimum_run_time(Module, Function, Args, Repetitions, MinTime) -> - {RunTime, _} = timer:tc(Module, Function, Args), - minimum_run_time(Module, Function, Args, Repetitions-1, erlang:min(MinTime, RunTime)). - %% @doc Walk up the stack trace to the parent of the current function. E.g. %% example() -> %% get_caller(). @@ -1029,18 +904,3 @@ pack_test() -> Cases )), ?assertEqual(length(PackedList), sets:size(sets:from_list(PackedList))). - -calculate_throttle_delay_test() -> - %% 1000 / ?PACKING_LATENCY_MS = 16.666666 - ?assertEqual(0, calculate_throttle_delay(1, 17), - "PackingRate > SpawnSchedulers capacity -> no throttle"), - ?assertEqual(0, calculate_throttle_delay(8, 1000), - "PackingRate > SpawnSchedulers capacity -> no throttle"), - ?assertEqual(2, calculate_throttle_delay(1, 16), - "PackingRate < SpawnSchedulers capacity -> throttle"), - ?assertEqual(15, calculate_throttle_delay(8, 100), - "PackingRate < SpawnSchedulers capacity -> throttle"), - ?assertEqual(0, calculate_throttle_delay(0, 100), - "0 schedulers -> no throttle"), - ?assertEqual(0, calculate_throttle_delay(8, 0), - "no packing -> no throttle"). diff --git a/apps/arweave/src/ar_pool.erl b/apps/arweave/src/ar_pool.erl index aa744692a..11819e1f2 100644 --- a/apps/arweave/src/ar_pool.erl +++ b/apps/arweave/src/ar_pool.erl @@ -585,8 +585,7 @@ process_partial_solution_vdf(Solution, Ref, PoACache, PoA2Cache) -> %% ar_node_worker will fetch the required steps based on the prev block. steps = not_found }, - ar_events:send(miner, {found_solution, {pool, Ref}, - Solution2, PoACache, PoA2Cache}), + ar_node_worker:found_solution({pool, Ref}, Solution2, PoACache, PoA2Cache), noreply; _ -> %% {Output, Seed, PartitionUpperBound} mismatch (pattern matching against @@ -996,7 +995,8 @@ process_solution_test_() -> 1 end end}, - {ar_events, send, fun(_Type, _Payload) -> ok end}], + {ar_events, send, fun(_Type, _Payload) -> ok end}, + {ar_node_worker, found_solution, fun(_, _, _, _) -> ok end}], fun test_process_solution/0 ). 
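The asynchronous entropy path added to ar_packing_server above (request_entropy_generation/3 casts to a worker, which replies to the caller with {entropy_generated, Ref, Entropy}) can be exercised roughly as below. A hedged sketch: the wrapper name and the 5000 ms timeout are illustrative choices, not part of the patch.

    %% Ask a packing worker for 2.9 entropy and wait for the reply message
    %% produced by the worker clause added above.
    request_entropy(RewardAddr, PaddedEndOffset, SubChunkStart) ->
        Ref = make_ref(),
        ar_packing_server:request_entropy_generation(
            Ref, self(), {RewardAddr, PaddedEndOffset, SubChunkStart}),
        receive
            {entropy_generated, Ref, Entropy} ->
                {ok, Entropy}
        after 5000 ->
            {error, timeout}
        end.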
diff --git a/apps/arweave/src/ar_repack.erl b/apps/arweave/src/ar_repack.erl index 63939728b..145b301cd 100644 --- a/apps/arweave/src/ar_repack.erl +++ b/apps/arweave/src/ar_repack.erl @@ -216,15 +216,6 @@ send_chunk_for_repacking(AbsoluteOffset, ChunkMeta, Args) -> PaddedOffset = ar_block:get_chunk_padded_offset(AbsoluteOffset), {ChunkDataKey, TXRoot, DataRoot, TXPath, RelativeOffset, ChunkSize} = ChunkMeta, - ?LOG_DEBUG([{event, send_chunk_for_repacking}, - {tags, [repack_in_place]}, - {pid, self()}, - {storage_module, StoreID}, - {offset, AbsoluteOffset}, - {padded_offset, PaddedOffset}, - {start_offset, ar_chunk_storage:get_chunk_bucket_start(AbsoluteOffset)}, - {chunk_size, ChunkSize}, - {required_packing, ar_serialize:encode_packing(RequiredPacking, true)}]), case ar_sync_record:is_recorded(PaddedOffset, ar_data_sync, StoreID) of {true, unpacked_padded} -> %% unpacked_padded is a special internal packing used @@ -331,46 +322,35 @@ chunk_repacked(ChunkArgs, Args, StoreID, FileIndex, IsPrepared, RewardAddr) -> Error -> Error end, - ?LOG_DEBUG([{event, chunk_repacked}, - {padded_end_offset, PaddedEndOffset}, - {chunk_size, ChunkSize}, - {packing, ar_serialize:encode_packing(Packing, true)}, - {remove_from_sync_record_result, RemoveFromSyncRecordResult}, - {remove_from_sync_record_result2, RemoveFromSyncRecordResult2}, - {is_storage_supported, IsStorageSupported}]), + case {RemoveFromSyncRecordResult2, IsStorageSupported} of {ok, false} -> gen_server:cast(ar_data_sync:name(StoreID), {store_chunk, ChunkArgs, Args}), {ok, FileIndex}; {ok, true} -> - case ar_chunk_storage:store_chunk(PaddedEndOffset, Chunk, Packing, - StoreID, FileIndex, IsPrepared, RewardAddr) of + StoreResults = ar_chunk_storage:store_chunk(PaddedEndOffset, Chunk, Packing, + StoreID, FileIndex, IsPrepared, RewardAddr), + case StoreResults of {ok, FileIndex2, NewPacking} -> - ?LOG_DEBUG([{event, ar_chunk_storage_packed}, {e, PaddedEndOffset}, - {s, StartOffset}, {id, ar_data_sync}, {store_id, StoreID}, - {old_packing, ar_serialize:encode_packing(Packing, true)}, - {new_packing, ar_serialize:encode_packing(NewPacking, true)}]), ar_sync_record:add_async(repacked_chunk, PaddedEndOffset, StartOffset, NewPacking, ar_data_sync, StoreID), {ok, FileIndex2}; Error3 -> - PackingStr = ar_serialize:encode_packing(Packing, true), ?LOG_ERROR([{event, failed_to_store_repacked_chunk}, {tags, [repack_in_place]}, {storage_module, StoreID}, {padded_end_offset, PaddedEndOffset}, - {packing, PackingStr}, + {requested_packing, ar_serialize:encode_packing(Packing, true)}, {error, io_lib:format("~p", [Error3])}]), {ok, FileIndex} end; {Error4, _} -> - PackingStr = ar_serialize:encode_packing(Packing, true), ?LOG_ERROR([{event, failed_to_store_repacked_chunk}, {tags, [repack_in_place]}, {storage_module, StoreID}, {padded_end_offset, PaddedEndOffset}, - {packing, PackingStr}, + {requested_packing, ar_serialize:encode_packing(Packing, true)}, {error, io_lib:format("~p", [Error4])}]), {ok, FileIndex} end. diff --git a/apps/arweave/src/ar_replica_2_9.erl b/apps/arweave/src/ar_replica_2_9.erl index b1fe91784..a823a5d33 100644 --- a/apps/arweave/src/ar_replica_2_9.erl +++ b/apps/arweave/src/ar_replica_2_9.erl @@ -1,7 +1,7 @@ -module(ar_replica_2_9). -export([get_entropy_partition/1, get_entropy_key/3, get_sector_size/0, - get_slice_index/1]). + get_slice_index/1, get_partition_offset/1]). -include_lib("arweave/include/ar.hrl"). -include_lib("arweave/include/ar_consensus.hrl"). @@ -130,9 +130,9 @@ get_sector_size() -> ) -> non_neg_integer(). 
get_slice_index(AbsoluteChunkEndOffset) -> PartitionRelativeOffset = get_partition_offset(AbsoluteChunkEndOffset), - SubChunkCount = ?REPLICA_2_9_ENTROPY_SIZE div ?COMPOSITE_PACKING_SUB_CHUNK_SIZE, + SubChunksPerEntropy = ?REPLICA_2_9_ENTROPY_SIZE div ?COMPOSITE_PACKING_SUB_CHUNK_SIZE, SectorSize = get_sector_size(), - (PartitionRelativeOffset div SectorSize) rem SubChunkCount. + (PartitionRelativeOffset div SectorSize) rem SubChunksPerEntropy. %%%=================================================================== diff --git a/apps/arweave/src/ar_serialize.erl b/apps/arweave/src/ar_serialize.erl index 91c579b18..fb3ec8f44 100644 --- a/apps/arweave/src/ar_serialize.erl +++ b/apps/arweave/src/ar_serialize.erl @@ -2227,6 +2227,8 @@ partition_to_json_struct(Bucket, BucketSize, Addr, PackingDifficulty) -> encode_packing(undefined, false) -> "undefined"; +encode_packing(none, false) -> + "none"; encode_packing(any, false) -> "any"; encode_packing({spora_2_6, Addr}, _Strict) -> diff --git a/apps/arweave/src/ar_shared_entropy_cache.erl b/apps/arweave/src/ar_shared_entropy_cache.erl deleted file mode 100644 index 65b45b579..000000000 --- a/apps/arweave/src/ar_shared_entropy_cache.erl +++ /dev/null @@ -1,191 +0,0 @@ -%%% @doc The module for managing the cache of entropies for -%%% the 2.9 replication scheme. --module(ar_shared_entropy_cache). - --behaviour(gen_server). - --export([start_link/0]). --export([init/1, handle_cast/2, handle_call/3, handle_info/2, terminate/2]). - --export([get/1, allocate_space/2, put/3]). - --include_lib("arweave/include/ar.hrl"). - --include_lib("eunit/include/eunit.hrl"). - --record(state, {}). - -%%%=================================================================== -%%% Public interface. -%%%=================================================================== - -%% @doc Return the stored value, if any, for the given Key. --spec get(Key :: string()) -> {ok, term()} | not_found. -get(Key) -> - get(Key, replica_2_9_entropy_cache). - -%% @doc Make sure the cache has enough space to store Size worth of elements such that -%% the total size does not exceed MaxSize. In other words, if you want to store new -%% elements with the total size Size, call allocate_space(Size, MaxSize) then -%% call put/3 to store new elements. --spec allocate_space( - Size :: non_neg_integer(), - MaxSize :: non_neg_integer() -) -> ok. -allocate_space(Size, MaxSize) -> - gen_server:cast(?MODULE, {allocate_space, Size, MaxSize}). - -%% @doc Store the given Value in the cache. Associate it with the given Size and -%% increase the total cache size accordingly. --spec put( - Key :: string(), - Value :: term(), - Size :: non_neg_integer() -) -> ok. -put(Key, Value, Size) -> - gen_server:cast(?MODULE, {put, Key, Value, Size}). - -%% @doc Start the server. -start_link() -> - gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). - -%%%=================================================================== -%%% Generic server callbacks. -%%%=================================================================== - -init([]) -> - {ok, #state{}}. - -handle_call(Request, _From, State) -> - ?LOG_WARNING([{event, unhandled_call}, {module, ?MODULE}, {request, Request}]), - {reply, ok, State}. 
- -handle_cast({allocate_space, Size, MaxSize}, State) -> - Table = replica_2_9_entropy_cache, - OrderedKeyTable = replica_2_9_entropy_cache_ordered_keys, - allocate_space(Size, MaxSize, Table, OrderedKeyTable), - {noreply, State}; - -handle_cast({put, Key, Value, Size}, State) -> - Table = replica_2_9_entropy_cache, - OrderedKeyTable = replica_2_9_entropy_cache_ordered_keys, - put(Key, Value, Size, Table, OrderedKeyTable), - {noreply, State}; - -handle_cast(Cast, State) -> - ?LOG_WARNING([{event, unhandled_cast}, {module, ?MODULE}, {cast, Cast}]), - {noreply, State}. - -handle_info(_Message, State) -> - {noreply, State}. - -terminate(Reason, _State) -> - ?LOG_INFO([{event, terminate}, {module, ?MODULE}, - {reason, io_lib:format("~p", [Reason])}]), - ok. - -%%%=================================================================== -%%% Private functions. -%%%=================================================================== - -get(Key, Table) -> - case ets:lookup(Table, {key, Key}) of - [] -> - not_found; - [{_, Value}] -> - ?LOG_DEBUG([{event, entropy_cache_hit}, {key, ar_util:safe_encode(Key)}]), - %% Track the number of used keys per entropy to estimate the efficiency - %% of the cache. - ets:update_counter(Table, {fetched_key_count, Key}, 1, - {{fetched_key_count, Key}, 0}), - {ok, Value} - end. - -allocate_space(Size, MaxSize, Table, OrderedKeyTable) -> - TotalSize = - case ets:lookup(Table, total_size) of - [] -> - 0; - [{_, Value}] -> - Value - end, - case TotalSize + Size > MaxSize of - true -> - case ets:first(OrderedKeyTable) of - '$end_of_table' -> - ok; - {_Timestamp, Key, ElementSize} = EarliestKey -> - ets:delete(Table, {key, Key}), - ets:update_counter(Table, total_size, -ElementSize, {total_size, 0}), - ets:delete(OrderedKeyTable, EarliestKey), - FetchedKeyCount = get_fetched_key_count(Table, Key), - ?LOG_DEBUG([{event, release_replica_2_9_entropy}, - {fetched_key_count, FetchedKeyCount}]), - ets:delete(Table, {fetched_key_count, Key}), - allocate_space(Size, MaxSize, Table, OrderedKeyTable) - end; - false -> - ok - end. - -get_fetched_key_count(Table, Key) -> - case ets:lookup(Table, {fetched_key_count, Key}) of - [] -> - 0; - [{_, Count}] -> - Count - end. - -put(Key, Value, Size, Table, OrderedKeyTable) -> - ets:insert(Table, {{key, Key}, Value}), - Timestamp = os:system_time(microsecond), - ets:insert(OrderedKeyTable, {{Timestamp, Key, Size}}), - ets:update_counter(Table, total_size, Size, {total_size, 0}). - -%%%=================================================================== -%%% Tests. 
-%%%=================================================================== - -cache_test() -> - Table = 'test_entropy_cache_table', - OrderedKeyTable = 'test_entropy_cache_ordered_key_table', - ets:new(Table, [set, public, named_table]), - ets:new(OrderedKeyTable, [ordered_set, public, named_table]), - ?assertEqual(0, get_fetched_key_count(Table, some_key)), - ?assertEqual(not_found, get(some_key, Table)), - ?assertEqual(0, get_fetched_key_count(Table, some_key)), - allocate_space(64, 128, Table, OrderedKeyTable), - put(some_key, some_value, 64, Table, OrderedKeyTable), - ?assertEqual({ok, some_value}, get(some_key, Table)), - ?assertEqual(1, get_fetched_key_count(Table, some_key)), - ?assertEqual({ok, some_value}, get(some_key, Table)), - ?assertEqual(2, get_fetched_key_count(Table, some_key)), - allocate_space(64, 128, Table, OrderedKeyTable), - ?assertEqual({ok, some_value}, get(some_key, Table)), - ?assertEqual(3, get_fetched_key_count(Table, some_key)), - allocate_space(64, 128, Table, OrderedKeyTable), - ?assertEqual({ok, some_value}, get(some_key, Table)), - ?assertEqual(4, get_fetched_key_count(Table, some_key)), - allocate_space(128, 128, Table, OrderedKeyTable), - %% We requested an allocation of > MaxSize so the old key needs to be removed. - ?assertEqual(not_found, get(some_key, Table)), - ?assertEqual(0, get_fetched_key_count(Table, some_key)), - %% The put itself does not clean up the cache. - put(some_key, some_value, 64, Table, OrderedKeyTable), - put(some_other_key, some_other_value, 64, Table, OrderedKeyTable), - put(yet_another_key, yet_another_value, 64, Table, OrderedKeyTable), - ?assertEqual(0, get_fetched_key_count(Table, some_key)), - ?assertEqual({ok, some_value}, get(some_key, Table)), - ?assertEqual({ok, some_other_value}, get(some_other_key, Table)), - ?assertEqual({ok, yet_another_value}, get(yet_another_key, Table)), - ?assertEqual(1, get_fetched_key_count(Table, some_key)), - ?assertEqual(1, get_fetched_key_count(Table, some_other_key)), - ?assertEqual(1, get_fetched_key_count(Table, yet_another_key)), - %% Basically, we are simply reducing the cache 192 -> 128. - allocate_space(0, 128, Table, OrderedKeyTable), - ?assertEqual(not_found, get(some_key, Table)), - ?assertEqual({ok, some_other_value}, get(some_other_key, Table)), - ?assertEqual({ok, yet_another_value}, get(yet_another_key, Table)), - allocate_space(64, 128, Table, OrderedKeyTable), - ?assertEqual(not_found, get(some_other_key, Table)), - ?assertEqual({ok, yet_another_value}, get(yet_another_key, Table)). diff --git a/apps/arweave/src/ar_storage_module.erl b/apps/arweave/src/ar_storage_module.erl index cdcc40a81..3d32b0c78 100644 --- a/apps/arweave/src/ar_storage_module.erl +++ b/apps/arweave/src/ar_storage_module.erl @@ -3,10 +3,8 @@ -export([id/1, label/1, address_label/2, module_address/1, module_packing_difficulty/1, packing_label/1, label_by_id/1, get_by_id/1, get_range/1, module_range/1, module_range/2, get_packing/1, get_size/1, - get/2, get_strict/2, get_all/1, get_all/2, get_all_packed/2, - has_any/1, has_range/2, get_cover/3]). - --export([get_unique_sorted_intervals/1]). + get/2, get_strict/2, get_all/1, get_all/2, get_all_packed/3, get_all_module_ranges/0, + has_any/1, has_range/2, get_cover/3, get_overlap/1]). -include("../include/ar.hrl"). -include("../include/ar_consensus.hrl"). @@ -25,13 +23,12 @@ -ifdef(AR_TEST). -define(REPLICA_2_9_OVERLAP, 262144). -else. --define(REPLICA_2_9_OVERLAP, (262144 * 20)). +-define(REPLICA_2_9_OVERLAP, (262144 * 10)). -endif. 
-type storage_module() :: {integer(), integer(), {atom(), binary()}} | {integer(), integer(), {atom(), binary(), integer()}}. - %%%=================================================================== %%% Public interface. %%%=================================================================== @@ -158,6 +155,17 @@ get_by_id(ID, [Module | Modules]) -> get_by_id(ID, Modules) end. +get_all_module_ranges() -> + {ok, Config} = application:get_env(arweave, config), + RepackInPlaceModulesStoreIDs = [ + {{BucketSize, Bucket, TargetPacking}, ar_storage_module:id(Module)} + || {{BucketSize, Bucket, _Packing} = Module, TargetPacking} <- Config#config.repack_in_place_storage_modules], + ModuleStoreIDs = [{Module, ar_storage_module:id(Module)} + || Module <- Config#config.storage_modules], + + [{module_range(Module), Packing, StoreID} || {{_, _, Packing} = Module, StoreID} <- + ModuleStoreIDs ++ RepackInPlaceModulesStoreIDs]. + %% @doc Return {StartOffset, EndOffset} the given module is responsible for. get_range("default") -> {0, infinity}; @@ -214,13 +222,7 @@ get(Offset, Packing) -> %% Return not_found if none is found. If a module is configured with in-place repacking, %% pick the target packing (the one we are repacking to.) get_strict(Offset, Packing) -> - {ok, Config} = application:get_env(arweave, config), - RepackInPlaceModulesStoreIDs = [ - {{BucketSize, Bucket, TargetPacking}, ar_storage_module:id(Module)} - || {{BucketSize, Bucket, _Packing} = Module, TargetPacking} <- Config#config.repack_in_place_storage_modules], - ModuleStoreIDs = [{Module, ar_storage_module:id(Module)} - || Module <- Config#config.storage_modules], - get_strict(Offset, Packing, ModuleStoreIDs ++ RepackInPlaceModulesStoreIDs). + get_strict(Offset, Packing, get_all_module_ranges()). %% @doc Return the list of all configured storage modules covering the given Offset. get_all(Offset) -> @@ -231,13 +233,7 @@ get_all(Offset) -> %% covering the given Offset and Packing. If a module is configured with %% in-place repacking, pick the target packing (the one we are repacking to.) get_all_packed(Offset, Packing) -> - {ok, Config} = application:get_env(arweave, config), - RepackInPlaceModulesStoreIDs = [ - {{BucketSize, Bucket, TargetPacking}, ar_storage_module:id(Module)} - || {{BucketSize, Bucket, _Packing} = Module, TargetPacking} <- Config#config.repack_in_place_storage_modules], - ModuleStoreIDs = [{Module, ar_storage_module:id(Module)} - || Module <- Config#config.storage_modules], - get_all_packed(Offset, Packing, ModuleStoreIDs ++ RepackInPlaceModulesStoreIDs). + get_all_packed(Offset, Packing, get_all_module_ranges()). %% @doc Return the list of configured storage modules whose ranges intersect %% the given interval. @@ -319,13 +315,13 @@ get(Offset, Packing, [{BucketSize, Bucket, Packing2} | StorageModules], StorageM get(_Offset, _Packing, [], StorageModule) -> StorageModule. -get_strict(Offset, Packing, [{{BucketSize, Bucket, Packing2}, StoreID} | StorageModules]) -> - case Offset =< BucketSize * Bucket - orelse Offset > BucketSize * (Bucket + 1) + get_overlap(Packing2) of +get_strict(Offset, Packing, + [{{RangeStart, RangeEnd}, ModulePacking, StoreID} | StorageModules]) -> + case Offset =< RangeStart orelse Offset > RangeEnd of true -> get_strict(Offset, Packing, StorageModules); false -> - case Packing == Packing2 of + case Packing == ModulePacking of true -> {ok, StoreID}; false -> @@ -352,9 +348,8 @@ get_all(_Offset, [], FoundModules) -> FoundModules. 
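For illustration, a caller-side sketch of consuming get_all_module_ranges/0: each entry is {{RangeStart, RangeEnd}, Packing, StoreID}, and membership uses the same Offset > RangeStart, Offset =< RangeEnd test as the new get_strict/3 clause above. The find_store_id/2 name is illustrative only.

find_store_id(Offset, Packing) ->
	Ranges = ar_storage_module:get_all_module_ranges(),
	case [StoreID || {{RangeStart, RangeEnd}, ModulePacking, StoreID} <- Ranges,
			Offset > RangeStart, Offset =< RangeEnd, ModulePacking == Packing] of
		[StoreID | _] ->
			{ok, StoreID};
		[] ->
			not_found
	end.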
get_all_packed(Offset, Packing, - [{{BucketSize, Bucket, Packing}, StoreID} | StorageModules]) -> - case Offset =< BucketSize * Bucket - orelse Offset > BucketSize * (Bucket + 1) + get_overlap(Packing) of + [{{RangeStart, RangeEnd}, Packing, StoreID} | StorageModules]) -> + case Offset =< RangeStart orelse Offset > RangeEnd of true -> get_all_packed(Offset, Packing, StorageModules); false -> diff --git a/apps/arweave/src/ar_storage_sup.erl b/apps/arweave/src/ar_storage_sup.erl index 0df01b1e9..4ca0dd196 100644 --- a/apps/arweave/src/ar_storage_sup.erl +++ b/apps/arweave/src/ar_storage_sup.erl @@ -23,4 +23,7 @@ start_link() -> init([]) -> ets:new(ar_storage, [set, public, named_table, {read_concurrency, true}]), ets:new(ar_storage_module, [set, public, named_table]), - {ok, {{one_for_one, 5, 10}, [?CHILD(ar_storage, worker)]}}. + {ok, {{one_for_one, 5, 10}, [ + ?CHILD(ar_storage, worker), + ?CHILD(ar_device_lock, worker) + ]}}. diff --git a/apps/arweave/src/ar_sup.erl b/apps/arweave/src/ar_sup.erl index fb13295a2..e53508a20 100644 --- a/apps/arweave/src/ar_sup.erl +++ b/apps/arweave/src/ar_sup.erl @@ -77,7 +77,6 @@ init([]) -> ?CHILD(ar_watchdog, worker), ?CHILD(ar_tx_blacklist, worker), ?CHILD_SUP(ar_bridge_sup, supervisor), - ?CHILD(ar_shared_entropy_cache, worker), ?CHILD(ar_packing_server, worker), ?CHILD_SUP(ar_sync_record_sup, supervisor), ?CHILD(ar_data_discovery, worker), diff --git a/apps/arweave/src/ar_sync_record.erl b/apps/arweave/src/ar_sync_record.erl index 53803430c..60be89212 100644 --- a/apps/arweave/src/ar_sync_record.erl +++ b/apps/arweave/src/ar_sync_record.erl @@ -2,7 +2,7 @@ -behaviour(gen_server). --export([start_link/2, get/2, get/3, add/4, add/5, add_async/5, add_async/6, delete/4, cut/3, +-export([start_link/2, get/2, get/3, add/4, add/5, add_async/5, add_async/6, delete/4, delete_async/5, cut/3, is_recorded/2, is_recorded/3, is_recorded/4, is_recorded_any/3, get_next_synced_interval/4, get_next_synced_interval/5, get_next_unsynced_interval/4, @@ -66,7 +66,7 @@ start_link(Name, StoreID) -> %% @doc Return the set of intervals. get(ID, StoreID) -> - GenServerID = list_to_atom("ar_sync_record_" ++ ar_storage_module:label_by_id(StoreID)), + GenServerID = name(StoreID), case catch gen_server:call(GenServerID, {get, ID}, 20000) of {'EXIT', {timeout, {gen_server, call, _}}} -> {error, timeout}; @@ -76,7 +76,7 @@ get(ID, StoreID) -> %% @doc Return the set of intervals. get(ID, Packing, StoreID) -> - GenServerID = list_to_atom("ar_sync_record_" ++ ar_storage_module:label_by_id(StoreID)), + GenServerID = name(StoreID), case catch gen_server:call(GenServerID, {get, Packing, ID}, 20000) of {'EXIT', {timeout, {gen_server, call, _}}} -> {error, timeout}; @@ -87,7 +87,7 @@ get(ID, Packing, StoreID) -> %% @doc Add the given interval to the record with the %% given ID. Store the changes on disk before returning ok. add(End, Start, ID, StoreID) -> - GenServerID = list_to_atom("ar_sync_record_" ++ ar_storage_module:label_by_id(StoreID)), + GenServerID = name(StoreID), case catch gen_server:call(GenServerID, {add, End, Start, ID}, 120000) of {'EXIT', {timeout, {gen_server, call, _}}} -> {error, timeout}; @@ -99,7 +99,7 @@ add(End, Start, ID, StoreID) -> %% given ID and Packing. Store the changes on disk before %% returning ok. 
add(End, Start, Packing, ID, StoreID) -> - GenServerID = list_to_atom("ar_sync_record_" ++ ar_storage_module:label_by_id(StoreID)), + GenServerID = name(StoreID), case catch gen_server:call(GenServerID, {add, End, Start, Packing, ID}, 120000) of {'EXIT', {timeout, {gen_server, call, _}}} -> {error, timeout}; @@ -109,20 +109,20 @@ add(End, Start, Packing, ID, StoreID) -> %% @doc Special case of add/4. add_async(Event, End, Start, ID, StoreID) -> - GenServerID = list_to_atom("ar_sync_record_" ++ ar_storage_module:label_by_id(StoreID)), + GenServerID = name(StoreID), gen_server:cast(GenServerID, {add_async, Event, End, Start, ID}). %% @doc Special case of add/5 for repacked chunks. When repacking the ar_sync_record add %% happens at the end so we don't need to block on it to complete. add_async(Event, End, Start, Packing, ID, StoreID) -> - GenServerID = list_to_atom("ar_sync_record_" ++ ar_storage_module:label_by_id(StoreID)), + GenServerID = name(StoreID), gen_server:cast(GenServerID, {add_async, Event, End, Start, Packing, ID}). %% @doc Remove the given interval from the record %% with the given ID. Store the changes on disk before %% returning ok. delete(End, Start, ID, StoreID) -> - GenServerID = list_to_atom("ar_sync_record_" ++ ar_storage_module:label_by_id(StoreID)), + GenServerID = name(StoreID), case catch gen_server:call(GenServerID, {delete, End, Start, ID}, 120000) of {'EXIT', {timeout, {gen_server, call, _}}} -> {error, timeout}; @@ -130,11 +130,15 @@ delete(End, Start, ID, StoreID) -> Reply end. +delete_async(Event, End, Start, ID, StoreID) -> + GenServerID = name(StoreID), + gen_server:cast(GenServerID, {delete_async, Event, End, Start, ID}). + %% @doc Remove everything strictly above the given %% Offset from the record. Store the changes on disk %% before returning ok. 
cut(Offset, ID, StoreID) -> - GenServerID = list_to_atom("ar_sync_record_" ++ ar_storage_module:label_by_id(StoreID)), + GenServerID = name(StoreID), case catch gen_server:call(GenServerID, {cut, Offset, ID}, 120000) of {'EXIT', {timeout, {gen_server, call, _}}} -> {error, timeout}; @@ -309,43 +313,8 @@ handle_call({add, End, Start, Packing, ID}, _From, State) -> {reply, Reply, State2}; handle_call({delete, End, Start, ID}, _From, State) -> - #state{ sync_record_by_id = SyncRecordByID, sync_record_by_id_type = SyncRecordByIDType, - state_db = StateDB, store_id = StoreID } = State, - SyncRecord = maps:get(ID, SyncRecordByID, ar_intervals:new()), - SyncRecord2 = ar_intervals:delete(SyncRecord, End, Start), - SyncRecordByID2 = maps:put(ID, SyncRecord2, SyncRecordByID), - TID = get_or_create_type_tid({ID, StoreID}), - ar_ets_intervals:delete(TID, End, Start), - SyncRecordByIDType2 = - maps:map( - fun - ({ID2, _}, ByType) when ID2 == ID -> - ar_intervals:delete(ByType, End, Start); - (_, ByType) -> - ByType - end, - SyncRecordByIDType - ), - ets:foldl( - fun - ({{ID2, _, SID}, TypeTID}, _) when ID2 == ID, SID == StoreID -> - ar_ets_intervals:delete(TypeTID, End, Start); - (_, _) -> - ok - end, - ok, - sync_records - ), - State2 = State#state{ sync_record_by_id = SyncRecordByID2, - sync_record_by_id_type = SyncRecordByIDType2 }, - {Reply, State3} = update_write_ahead_log({delete, {End, Start, ID}}, StateDB, State2), - case Reply of - ok -> - emit_remove_range(Start, End, StoreID); - _ -> - ok - end, - {reply, Reply, State3}; + {Reply, State2} = delete2(End, Start, ID, State), + {reply, Reply, State2}; handle_call({cut, Offset, ID}, _From, State) -> #state{ sync_record_by_id = SyncRecordByID, sync_record_by_id_type = SyncRecordByIDType, @@ -400,12 +369,10 @@ handle_cast({add_async, Event, End, Start, ID}, State) -> {Reply, State2} = add2(End, Start, ID, State), case Reply of ok -> - ?LOG_DEBUG([{event, Event}, - {status, success}, - {sync_record_id, ID}, - {offset, End}]); + ok; Error -> ?LOG_ERROR([{event, Event}, + {operation, add_async}, {status, failed}, {sync_record_id, ID}, {offset, End}, @@ -417,13 +384,10 @@ handle_cast({add_async, Event, End, Start, Packing, ID}, State) -> {Reply, State2} = add2(End, Start, Packing, ID, State), case Reply of ok -> - ?LOG_DEBUG([{event, Event}, - {status, success}, - {sync_record_id, ID}, - {offset, End}, - {packing, ar_serialize:encode_packing(Packing, true)}]); + ok; Error -> ?LOG_ERROR([{event, Event}, + {operation, add_async}, {status, failed}, {sync_record_id, ID}, {offset, End}, @@ -432,6 +396,22 @@ handle_cast({add_async, Event, End, Start, Packing, ID}, State) -> end, {noreply, State2}; +handle_cast({delete_async, Event, End, Start, ID}, State) -> + {Reply, State2} = delete2(End, Start, ID, State), + case Reply of + ok -> + ok; + Error -> + ?LOG_ERROR([{event, Event}, + {operation, delete_async}, + {status, failed}, + {sync_record_id, ID}, + {offset, End}, + {error, io_lib:format("~p", [Error])}, + {module, ?MODULE}]) + end, + {noreply, State2}; + handle_cast(Cast, State) -> ?LOG_WARNING([{event, unhandled_cast}, {module, ?MODULE}, {cast, Cast}]), {noreply, State}. @@ -448,6 +428,9 @@ terminate(Reason, State) -> %%% Private functions. %%%=================================================================== +name(StoreID) -> + list_to_atom("ar_sync_record_" ++ ar_storage_module:label_by_id(StoreID)). 
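A small usage sketch of the new asynchronous deletion path (the event atom and function name here are illustrative): unlike delete/4, the caller is not blocked while the interval is removed and the write-ahead log is updated; failures surface only through the delete_async handler's error log above.

%% Illustrative only: drop the Start..End interval from the ar_data_sync record
%% of the given storage module without waiting for the result.
remove_range_async(Start, End, StoreID) ->
	ar_sync_record:delete_async(example_remove_range, End, Start, ar_data_sync, StoreID).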
+ add2(End, Start, ID, State) -> #state{ sync_record_by_id = SyncRecordByID, state_db = StateDB, store_id = StoreID } = State, @@ -490,6 +473,45 @@ add2(End, Start, Packing, ID, State) -> end, {Reply, State3}. +delete2(End, Start, ID, State) -> + #state{ sync_record_by_id = SyncRecordByID, sync_record_by_id_type = SyncRecordByIDType, + state_db = StateDB, store_id = StoreID } = State, + SyncRecord = maps:get(ID, SyncRecordByID, ar_intervals:new()), + SyncRecord2 = ar_intervals:delete(SyncRecord, End, Start), + SyncRecordByID2 = maps:put(ID, SyncRecord2, SyncRecordByID), + TID = get_or_create_type_tid({ID, StoreID}), + ar_ets_intervals:delete(TID, End, Start), + SyncRecordByIDType2 = + maps:map( + fun + ({ID2, _}, ByType) when ID2 == ID -> + ar_intervals:delete(ByType, End, Start); + (_, ByType) -> + ByType + end, + SyncRecordByIDType + ), + ets:foldl( + fun + ({{ID2, _, SID}, TypeTID}, _) when ID2 == ID, SID == StoreID -> + ar_ets_intervals:delete(TypeTID, End, Start); + (_, _) -> + ok + end, + ok, + sync_records + ), + State2 = State#state{ sync_record_by_id = SyncRecordByID2, + sync_record_by_id_type = SyncRecordByIDType2 }, + {Reply, State3} = update_write_ahead_log({delete, {End, Start, ID}}, StateDB, State2), + case Reply of + ok -> + emit_remove_range(Start, End, StoreID); + _ -> + ok + end, + {Reply, State3}. + is_recorded_any_by_type(Offset, ID, [StorageModule | StorageModules]) -> StoreID = ar_storage_module:id(StorageModule), {_, _, Packing} = StorageModule, @@ -626,8 +648,10 @@ replay_write_ahead_log(SyncRecordByID, SyncRecordByIDType, N, WAL, StateDB, Stor end end. -emit_add_range(Start, End, ID, StoreID) -> - ar_events:send(sync_record, {add_range, Start, End, ID, StoreID}). +emit_add_range(Start, End, ar_data_sync, StoreID) -> + ar_events:send(sync_record, {add_range, Start, End, ar_data_sync, StoreID}); +emit_add_range(_Start, _End, _ID, _StoreID) -> + ok. emit_remove_range(Start, End, StoreID) -> ar_events:send(sync_record, {remove_range, Start, End, StoreID}). diff --git a/apps/arweave/src/ar_util.erl b/apps/arweave/src/ar_util.erl index af694ac97..062db130a 100644 --- a/apps/arweave/src/ar_util.erl +++ b/apps/arweave/src/ar_util.erl @@ -3,17 +3,17 @@ -export([bool_to_int/1, int_to_bool/1, ceil_int/2, floor_int/2, between/3, integer_to_binary/1, binary_to_integer/1, pick_random/1, pick_random/2, encode/1, decode/1, safe_encode/1, safe_decode/1, timestamp_to_seconds/1, + invert_map/1, parse_peer/1, peer_to_str/1, parse_port/1, safe_parse_peer/1, format_peer/1, unique/1, count/2, genesis_wallets/0, pmap/2, pfilter/2, do_until/3, block_index_entry_from_block/1, bytes_to_mb_string/1, cast_after/3, encode_list_indices/1, parse_list_indices/1, take_every_nth/2, safe_divide/2, terminal_clear/0, print_stacktrace/0, shuffle_list/1, - assert_file_exists_and_readable/1]). + assert_file_exists_and_readable/1, get_system_device/1]). -include_lib("arweave/include/ar.hrl"). -include_lib("eunit/include/eunit.hrl"). --include_lib("kernel/include/file.hrl"). bool_to_int(true) -> 1; bool_to_int(_) -> 0. @@ -87,6 +87,20 @@ safe_decode(E) -> timestamp_to_seconds({MegaSecs, Secs, _MicroSecs}) -> MegaSecs * 1000000 + Secs. +%% @doc Convert a map from Key => Value, to Value => set(Keys) +-spec invert_map(map()) -> map(). +invert_map(Map) -> + maps:fold( + fun(Key, Value, Acc) -> + CurrentSet = maps:get(Value, Acc, sets:new()), + UpdatedSet = sets:add_element(Key, CurrentSet), + maps:put(Value, UpdatedSet, Acc) + end, + #{}, + Map + ). 
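A quick, hypothetical example of invert_map/1 in use (the atoms are made up): keys that map to the same value are collected into one set.

invert_map_example() ->
	Inverted = ar_util:invert_map(#{store_a => dev1, store_b => dev1, store_c => dev2}),
	true = sets:is_element(store_a, maps:get(dev1, Inverted)),
	true = sets:is_element(store_b, maps:get(dev1, Inverted)),
	[store_c] = sets:to_list(maps:get(dev2, Inverted)),
	ok.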
+ + %% @doc Parse a string representing a remote host into our internal format. parse_peer("") -> throw(empty_peer_string); parse_peer(BitStr) when is_bitstring(BitStr) -> @@ -358,6 +372,12 @@ terminal_clear() -> end ). +-spec get_system_device(string()) -> string(). +get_system_device(Path) -> + Command = "df -P " ++ Path ++ " | awk 'NR==2 {print $1}'", + Device = os:cmd(Command), + string:trim(Device). + print_stacktrace() -> try throw(dummy) %% In OTP21+ try/catch is the recommended way to get the stacktrace diff --git a/apps/arweave/src/ar_verify_chunks.erl b/apps/arweave/src/ar_verify_chunks.erl index 7f10a731c..58c3dfd51 100644 --- a/apps/arweave/src/ar_verify_chunks.erl +++ b/apps/arweave/src/ar_verify_chunks.erl @@ -165,7 +165,7 @@ invalidate_chunk(Type, Offset, ChunkSize, State) -> invalidate_chunk(Type, Offset, ChunkSize, Logs, State) -> #state{ store_id = StoreID } = State, - ar_data_sync:invalidate_bad_data_record(Offset - ChunkSize, Offset, StoreID, 5), + ar_data_sync:invalidate_bad_data_record(Offset, ChunkSize, StoreID, Type), log_error(Type, Offset, ChunkSize, Logs, State). log_error(Type, Offset, ChunkSize, Logs, State) -> diff --git a/apps/arweave/test/ar_config_tests.erl b/apps/arweave/test/ar_config_tests.erl index a745b7c1e..9259654fb 100644 --- a/apps/arweave/test/ar_config_tests.erl +++ b/apps/arweave/test/ar_config_tests.erl @@ -69,6 +69,8 @@ test_parse_config() -> tx_validators = 3, post_tx_timeout = 50, max_emitters = 4, + replica_2_9_workers = 16, + packing_workers = 25, tx_propagation_parallelization = undefined, sync_jobs = 10, header_sync_jobs = 1, @@ -117,7 +119,6 @@ test_parse_config() -> gateway_arql := 3, get_sync_record := 10 }, - packing_rate = 20, max_nonce_limiter_validation_thread_count = 2, max_nonce_limiter_last_step_validation_thread_count = 3, nonce_limiter_server_trusted_peers = ["127.0.0.1", "2.3.4.5", "6.7.8.9:1982"], @@ -156,12 +157,12 @@ test_validate_repack_in_place() -> ar_config:validate_config(#config{ storage_modules = [{?PARTITION_SIZE, 0, {spora_2_6, Addr1}}], repack_in_place_storage_modules = [ - {{?PARTITION_SIZE, 1, {spora_2_6, Addr1}}, {spora_2_6, Addr2}}]})), + {{?PARTITION_SIZE, 1, {spora_2_6, Addr1}}, {replica_2_9, Addr2}}]})), ?assertEqual(false, ar_config:validate_config(#config{ storage_modules = [{?PARTITION_SIZE, 0, {spora_2_6, Addr1}}], repack_in_place_storage_modules = [ - {{?PARTITION_SIZE, 0, {spora_2_6, Addr1}}, {spora_2_6, Addr2}}]})), + {{?PARTITION_SIZE, 0, {spora_2_6, Addr1}}, {replica_2_9, Addr2}}]})), %% Repacking in place *from* replica_2_9 to any format is not currently supported. ?assertEqual(false, ar_config:validate_config(#config{ @@ -182,8 +183,39 @@ test_validate_repack_in_place() -> ar_config:validate_config(#config{ storage_modules = [], repack_in_place_storage_modules = [ - {{?PARTITION_SIZE, 0, {replica_2_9, Addr2}}, {composite, Addr1, 1}}]})). - + {{?PARTITION_SIZE, 0, {replica_2_9, Addr2}}, {composite, Addr1, 1}}]})), + %% Only repacking in place *to* replica_2_9 is supported. 
+ ?assertEqual(true, + ar_config:validate_config(#config{ + storage_modules = [], + repack_in_place_storage_modules = [ + {{?PARTITION_SIZE, 0, unpacked}, {replica_2_9, Addr2}}]})), + ?assertEqual(true, + ar_config:validate_config(#config{ + storage_modules = [], + repack_in_place_storage_modules = [ + {{?PARTITION_SIZE, 0, {spora_2_6, Addr1}}, {replica_2_9, Addr2}}]})), + ?assertEqual(true, + ar_config:validate_config(#config{ + storage_modules = [], + repack_in_place_storage_modules = [ + {{?PARTITION_SIZE, 0, {composite, Addr1, 1}}, {replica_2_9, Addr2}}]})), + ?assertEqual(false, + ar_config:validate_config(#config{ + storage_modules = [], + repack_in_place_storage_modules = [ + {{?PARTITION_SIZE, 0, unpacked}, {spora_2_6, Addr2}}]})), + ?assertEqual(false, + ar_config:validate_config(#config{ + storage_modules = [], + repack_in_place_storage_modules = [ + {{?PARTITION_SIZE, 0, {spora_2_6, Addr1}}, {composite, Addr1, 1}}]})), + ?assertEqual(false, + ar_config:validate_config(#config{ + storage_modules = [], + repack_in_place_storage_modules = [ + {{?PARTITION_SIZE, 0, {composite, Addr1, 1}}, {spora_2_6, Addr2}}]})). + diff --git a/apps/arweave/test/ar_config_tests_config_fixture.json b/apps/arweave/test/ar_config_tests_config_fixture.json index b97f9680e..7ad38aa08 100644 --- a/apps/arweave/test/ar_config_tests_config_fixture.json +++ b/apps/arweave/test/ar_config_tests_config_fixture.json @@ -100,6 +100,8 @@ "gateway_arql": 3 }, "packing_rate": 20, + "replica_2_9_workers": 16, + "packing_workers": 25, "max_nonce_limiter_validation_thread_count": 2, "max_nonce_limiter_last_step_validation_thread_count": 3, "vdf_server_trusted_peer": "127.0.0.1", diff --git a/apps/arweave/test/ar_data_sync_tests.erl b/apps/arweave/test/ar_data_sync_tests.erl index 9ff51e434..aff36c65d 100644 --- a/apps/arweave/test/ar_data_sync_tests.erl +++ b/apps/arweave/test/ar_data_sync_tests.erl @@ -2,12 +2,25 @@ -include_lib("eunit/include/eunit.hrl"). --include_lib("arweave/include/ar.hrl"). --include_lib("arweave/include/ar_consensus.hrl"). --include_lib("arweave/include/ar_config.hrl"). +-include("../include/ar.hrl"). +-include("../include/ar_consensus.hrl"). +-include("../include/ar_config.hrl"). -import(ar_test_node, [assert_wait_until_height/2, test_with_mocked_functions/2]). +recovers_from_corruption_test_() -> + {timeout, 140, fun test_recovers_from_corruption/0}. + +test_recovers_from_corruption() -> + ar_test_data_sync:setup_nodes(), + {ok, Config} = application:get_env(arweave, config), + StoreID = ar_storage_module:id(hd(ar_storage_module:get_all(262144 * 3))), + ?debugFmt("Corrupting ~s...", [StoreID]), + [ar_chunk_storage:write_chunk(PaddedEndOffset, << 0:(262144*8) >>, #{}, StoreID) + || PaddedEndOffset <- lists:seq(262144, 262144 * 3, 262144)], + ar_test_node:mine(), + ar_test_node:assert_wait_until_height(main, 1). + syncs_data_test_() -> {timeout, 240, fun test_syncs_data/0}. diff --git a/apps/arweave/test/ar_post_block_tests.erl b/apps/arweave/test/ar_post_block_tests.erl index 5f5c7212d..58f9f90cd 100644 --- a/apps/arweave/test/ar_post_block_tests.erl +++ b/apps/arweave/test/ar_post_block_tests.erl @@ -559,6 +559,15 @@ test_reject_block_invalid_double_signing_proof(KeyType) -> post_block(B6, valid), post_block(B7, valid), post_block(B7_2, valid), + %% Wait until the node records conflicting proofs. 
+ true = ar_util:do_until( + fun() -> + map_size(maps:get(double_signing_proofs, + sys:get_state(ar_node_worker), #{})) > 0 + end, + 200, + 5000 + ), ar_test_node:connect_to_peer(peer1), ar_test_node:mine(), BI3 = assert_wait_until_height(peer1, 3), diff --git a/apps/arweave/test/ar_test_node.erl b/apps/arweave/test/ar_test_node.erl index 1ecee8839..fd05c0899 100644 --- a/apps/arweave/test/ar_test_node.erl +++ b/apps/arweave/test/ar_test_node.erl @@ -139,7 +139,7 @@ try_boot_peer(TestType, Node, Retries) -> Cmd = io_lib:format( "erl +S ~B:~B -pa ~s -config config/sys.config -noshell " ++ "-name ~s -setcookie ~s -run ar main debug port ~p " ++ - "data_dir .tmp/data_~s_~s no_auto_join packing_rate 20 " ++ + "data_dir .tmp/data_~s_~s no_auto_join " ++ "> ~s-~s.out 2>&1 &", [Schedulers, Schedulers, string:join(Paths, " "), NodeName, Cookie, Port, atom_to_list(TestType), NodeName, Node, get_node_namespace()]), @@ -226,7 +226,6 @@ update_config(Config) -> auto_join = Config#config.auto_join, mining_addr = Config#config.mining_addr, sync_jobs = Config#config.sync_jobs, - packing_rate = Config#config.packing_rate, disk_pool_jobs = Config#config.disk_pool_jobs, header_sync_jobs = Config#config.header_sync_jobs, enable = Config#config.enable ++ BaseConfig#config.enable, @@ -322,7 +321,6 @@ base_cm_config(Peers) -> auto_join = true, mining_addr = RewardAddr, sync_jobs = 2, - packing_rate = 20, disk_pool_jobs = 2, header_sync_jobs = 2, enable = [search_in_rocksdb_when_mining, serve_tx_data_without_limits, @@ -518,7 +516,7 @@ remote_call(Node, Module, Function, Args, Timeout) -> ), case Result of {error, timeout} -> - ?debugFmt("Timed out (~pms) waiting for the rpc reply; module: ~p, function: ~p, " + ?LOG_ERROR("Timed out (~pms) waiting for the rpc reply; module: ~p, function: ~p, " "args: ~p, node: ~p.~n", [Timeout, Module, Function, Args, Node]); _ -> ok @@ -598,7 +596,6 @@ start(B0, RewardAddr, Config, StorageModules) -> storage_modules = StorageModules, disk_space_check_frequency = 1000, sync_jobs = 2, - packing_rate = 20, disk_pool_jobs = 2, header_sync_jobs = 2, enable = [search_in_rocksdb_when_mining, serve_tx_data_without_limits, @@ -896,8 +893,11 @@ wait_until_syncs_genesis_data() -> {ok, Config} = application:get_env(arweave, config), B = ar_node:get_current_block(), WeaveSize = B#block.weave_size, + ?LOG_INFO([{event, wait_until_syncs_genesis_data}, {status, initial_sync_started}, + {weave_size, WeaveSize}]), [wait_until_syncs_data(N * Size, (N + 1) * Size, WeaveSize, any) || {Size, N, _Packing} <- Config#config.storage_modules], + ?LOG_INFO([{event, wait_until_syncs_genesis_data}, {status, initial_sync_complete}]), %% Once the data is stored in the disk pool, make the storage modules %% copy the missing data over from each other. This procedure is executed on startup %% but the disk pool did not have any data at the time. @@ -905,6 +905,7 @@ wait_until_syncs_genesis_data() -> sync_data) || Module <- Config#config.storage_modules], [wait_until_syncs_data(N * Size, (N + 1) * Size, WeaveSize, Packing) || {Size, N, Packing} <- Config#config.storage_modules], + ?LOG_INFO([{event, wait_until_syncs_genesis_data}, {status, cross_module_sync_complete}]), ok. wait_until_height(Node, TargetHeight) ->