From d17819b9d053027657497b9e4692d2bbfaeed48d Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Wed, 8 Nov 2023 16:03:17 +0100 Subject: [PATCH] Retry to early add_table_copy add_table copy/3 could abort with system_limit as reason if the "added" node was starting, i.e. alive but not merged schema yet. Now abort with node_not_running, which should restart the transaction automaticly after a short sleep. Also improve debug (verbose) printouts and saved coredump info. --- lib/mnesia/src/mnesia_lib.erl | 2 +- lib/mnesia/src/mnesia_loader.erl | 3 +++ lib/mnesia/src/mnesia_schema.erl | 3 ++- lib/mnesia/src/mnesia_tm.erl | 30 ++++++++++++++++++++++++++---- 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/lib/mnesia/src/mnesia_lib.erl b/lib/mnesia/src/mnesia_lib.erl index 4dce5cefc0af..b1ca9ebf351a 100644 --- a/lib/mnesia/src/mnesia_lib.erl +++ b/lib/mnesia/src/mnesia_lib.erl @@ -1065,7 +1065,7 @@ save2(DbgInfo) -> Key = {'$$$_report', current_pos}, P = case ?ets_lookup_element(mnesia_gvar, Key, 2) of - 30 -> -1; + 100 -> -1; I -> I end, set({'$$$_report', current_pos}, P+1), diff --git a/lib/mnesia/src/mnesia_loader.erl b/lib/mnesia/src/mnesia_loader.erl index b16d00da6116..1a65dac2220a 100644 --- a/lib/mnesia/src/mnesia_loader.erl +++ b/lib/mnesia/src/mnesia_loader.erl @@ -226,6 +226,7 @@ do_get_network_copy(Tab, Reason, Ns, Storage, Cs) -> dbg_out("Table ~tp copied from ~p to ~p~n", [Tab, Node, node()]), {loaded, ok}; Err = {error, _} when element(1, Reason) == dumper -> + verbose("Copy failed: ~tp ~p~n", [Tab, Err]), {not_loaded,Err}; restart -> try_net_load_table(Tab, Reason, Tail ++ [Node], Cs); @@ -339,6 +340,7 @@ start_receiver(Tab,Storage,Cs,SenderPid,TabSize,DetsData,{dumper,{add_table_copy Init = table_init_fun(SenderPid, Storage), case do_init_table(Tab,Storage,Cs,SenderPid,TabSize,DetsData,self(), Init) of Err = {error, _} -> + verbose("Init table failed: ~tp ~p~n", [Tab, Err]), SenderPid ! {copier_done, node()}, Err; Else -> @@ -363,6 +365,7 @@ wait_on_load_complete(Pid) -> {Pid, Res} -> Res; {'EXIT', Pid, Reason} -> + verbose("Loader crashed : ~tp ~p~n", [Pid, Reason]), error(Reason); Else -> Pid ! Else, diff --git a/lib/mnesia/src/mnesia_schema.erl b/lib/mnesia/src/mnesia_schema.erl index e0f561937e43..27a7b118b558 100644 --- a/lib/mnesia/src/mnesia_schema.erl +++ b/lib/mnesia/src/mnesia_schema.erl @@ -2466,13 +2466,14 @@ prepare_op(Tid, {op, add_table_copy, Storage, Node, TabDef}, _WaitFor) -> _ -> ok end, - mnesia_lib:verbose("~w:~w Adding table~n",[?MODULE,?LINE]), case mnesia_controller:get_network_copy(Tid, Tab, Cs) of {loaded, ok} -> %% Tables are created by mnesia_loader get_network code insert_cstruct(Tid, Cs, true), {true, optional}; + {not_loaded, {not_active, schema, Node}} -> + mnesia:abort({node_not_running, Node}); {not_loaded, ErrReason} -> Reason = {system_limit, Tab, {Node, ErrReason}}, mnesia:abort(Reason) diff --git a/lib/mnesia/src/mnesia_tm.erl b/lib/mnesia/src/mnesia_tm.erl index 5a070cf0cde5..3a080e1d3260 100644 --- a/lib/mnesia/src/mnesia_tm.erl +++ b/lib/mnesia/src/mnesia_tm.erl @@ -892,27 +892,26 @@ restart(Mod, Tid, Ts, Fun, Args, Factor0, Retries0, Type, Why) -> return_abort(Fun, Args, Why), Factor = 1, SleepTime = mnesia_lib:random_time(Factor, Tid#tid.counter), - dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]), + log_restart("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]), timer:sleep(SleepTime), execute_outer(Mod, Fun, Args, Factor, Retries, Type); {node_not_running, _N} -> %% Avoids hanging in receive_release_tid_ack return_abort(Fun, Args, Why), Factor = 1, SleepTime = mnesia_lib:random_time(Factor, Tid#tid.counter), - dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]), + log_restart("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]), timer:sleep(SleepTime), execute_outer(Mod, Fun, Args, Factor, Retries, Type); _ -> SleepTime = mnesia_lib:random_time(Factor0, Tid#tid.counter), dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]), - + if Factor0 /= 10 -> ignore; true -> %% Our serial may be much larger than other nodes ditto AllNodes = val({current, db_nodes}), - verbose("Sync serial ~p~n", [Tid]), rpc:abcast(AllNodes, ?MODULE, {sync_trans_serial, Tid}) end, intercept_friends(Tid, Ts), @@ -931,6 +930,24 @@ restart(Mod, Tid, Ts, Fun, Args, Factor0, Retries0, Type, Why) -> end end. +log_restart(F,A) -> + case get(transaction_client) of + undefined -> + dbg_out(F,A); + _ -> + case get(transaction_count) of + undefined -> + put(transaction_count, 1), + verbose(F,A); + N when (N rem 10) == 0 -> + put(transaction_count, N+1), + verbose(F,A); + N -> + put(transaction_count, N+1), + dbg_out(F,A) + end + end. + get_restarted(Tid) -> case Res = rec() of {restarted, Tid} -> @@ -2086,6 +2103,7 @@ new_cr_format(#commit{ext=Snmp}=Cr) -> Cr#commit{ext=[{snmp,Snmp}]}. rec_all([Node | Tail], Tid, Res, Pids) -> + put({?MODULE, ?FUNCTION_NAME}, {Node, Tail}), receive {?MODULE, Node, {vote_yes, Tid}} -> rec_all(Tail, Tid, Res, Pids); @@ -2104,8 +2122,12 @@ rec_all([Node | Tail], Tid, Res, Pids) -> Abort = {do_abort, {bad_commit, Node}}, ?SAFE({?MODULE, Node} ! {Tid, Abort}), rec_all(Tail, Tid, Abort, Pids) + after 15000 -> + mnesia_lib:verbose("~p: trans ~p waiting ~p~n", [self(), Tid, Node]), + rec_all([Node | Tail], Tid, Res, Pids) end; rec_all([], _Tid, Res, Pids) -> + erase({?MODULE, ?FUNCTION_NAME}), {Res, Pids}. get_transactions() ->