Skip to content

Commit

Permalink
Retry to early add_table_copy
Browse files Browse the repository at this point in the history
add_table copy/3 could abort with system_limit as reason
if the "added" node was starting, i.e. alive but not merged
schema yet.

Now abort with node_not_running, which should restart the transaction
automaticly after a short sleep.

Also improve debug (verbose) printouts and saved coredump info.
  • Loading branch information
dgud committed Nov 8, 2023
1 parent 5e5963b commit d17819b
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 6 deletions.
2 changes: 1 addition & 1 deletion lib/mnesia/src/mnesia_lib.erl
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,7 @@ save2(DbgInfo) ->
Key = {'$$$_report', current_pos},
P =
case ?ets_lookup_element(mnesia_gvar, Key, 2) of
30 -> -1;
100 -> -1;
I -> I
end,
set({'$$$_report', current_pos}, P+1),
Expand Down
3 changes: 3 additions & 0 deletions lib/mnesia/src/mnesia_loader.erl
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ do_get_network_copy(Tab, Reason, Ns, Storage, Cs) ->
dbg_out("Table ~tp copied from ~p to ~p~n", [Tab, Node, node()]),
{loaded, ok};
Err = {error, _} when element(1, Reason) == dumper ->
verbose("Copy failed: ~tp ~p~n", [Tab, Err]),
{not_loaded,Err};
restart ->
try_net_load_table(Tab, Reason, Tail ++ [Node], Cs);
Expand Down Expand Up @@ -339,6 +340,7 @@ start_receiver(Tab,Storage,Cs,SenderPid,TabSize,DetsData,{dumper,{add_table_copy
Init = table_init_fun(SenderPid, Storage),
case do_init_table(Tab,Storage,Cs,SenderPid,TabSize,DetsData,self(), Init) of
Err = {error, _} ->
verbose("Init table failed: ~tp ~p~n", [Tab, Err]),
SenderPid ! {copier_done, node()},
Err;
Else ->
Expand All @@ -363,6 +365,7 @@ wait_on_load_complete(Pid) ->
{Pid, Res} ->
Res;
{'EXIT', Pid, Reason} ->
verbose("Loader crashed : ~tp ~p~n", [Pid, Reason]),
error(Reason);
Else ->
Pid ! Else,
Expand Down
3 changes: 2 additions & 1 deletion lib/mnesia/src/mnesia_schema.erl
Original file line number Diff line number Diff line change
Expand Up @@ -2466,13 +2466,14 @@ prepare_op(Tid, {op, add_table_copy, Storage, Node, TabDef}, _WaitFor) ->
_ ->
ok
end,
mnesia_lib:verbose("~w:~w Adding table~n",[?MODULE,?LINE]),

case mnesia_controller:get_network_copy(Tid, Tab, Cs) of
{loaded, ok} ->
%% Tables are created by mnesia_loader get_network code
insert_cstruct(Tid, Cs, true),
{true, optional};
{not_loaded, {not_active, schema, Node}} ->
mnesia:abort({node_not_running, Node});
{not_loaded, ErrReason} ->
Reason = {system_limit, Tab, {Node, ErrReason}},
mnesia:abort(Reason)
Expand Down
30 changes: 26 additions & 4 deletions lib/mnesia/src/mnesia_tm.erl
Original file line number Diff line number Diff line change
Expand Up @@ -892,27 +892,26 @@ restart(Mod, Tid, Ts, Fun, Args, Factor0, Retries0, Type, Why) ->
return_abort(Fun, Args, Why),
Factor = 1,
SleepTime = mnesia_lib:random_time(Factor, Tid#tid.counter),
dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
log_restart("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
timer:sleep(SleepTime),
execute_outer(Mod, Fun, Args, Factor, Retries, Type);
{node_not_running, _N} -> %% Avoids hanging in receive_release_tid_ack
return_abort(Fun, Args, Why),
Factor = 1,
SleepTime = mnesia_lib:random_time(Factor, Tid#tid.counter),
dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
log_restart("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),
timer:sleep(SleepTime),
execute_outer(Mod, Fun, Args, Factor, Retries, Type);
_ ->
SleepTime = mnesia_lib:random_time(Factor0, Tid#tid.counter),
dbg_out("Restarting transaction ~w: in ~wms ~w~n", [Tid, SleepTime, Why]),

if
Factor0 /= 10 ->
ignore;
true ->
%% Our serial may be much larger than other nodes ditto
AllNodes = val({current, db_nodes}),
verbose("Sync serial ~p~n", [Tid]),
rpc:abcast(AllNodes, ?MODULE, {sync_trans_serial, Tid})
end,
intercept_friends(Tid, Ts),
Expand All @@ -931,6 +930,24 @@ restart(Mod, Tid, Ts, Fun, Args, Factor0, Retries0, Type, Why) ->
end
end.

log_restart(F,A) ->
case get(transaction_client) of
undefined ->
dbg_out(F,A);
_ ->
case get(transaction_count) of
undefined ->
put(transaction_count, 1),
verbose(F,A);
N when (N rem 10) == 0 ->
put(transaction_count, N+1),
verbose(F,A);
N ->
put(transaction_count, N+1),
dbg_out(F,A)
end
end.

get_restarted(Tid) ->
case Res = rec() of
{restarted, Tid} ->
Expand Down Expand Up @@ -2086,6 +2103,7 @@ new_cr_format(#commit{ext=Snmp}=Cr) ->
Cr#commit{ext=[{snmp,Snmp}]}.

rec_all([Node | Tail], Tid, Res, Pids) ->
put({?MODULE, ?FUNCTION_NAME}, {Node, Tail}),
receive
{?MODULE, Node, {vote_yes, Tid}} ->
rec_all(Tail, Tid, Res, Pids);
Expand All @@ -2104,8 +2122,12 @@ rec_all([Node | Tail], Tid, Res, Pids) ->
Abort = {do_abort, {bad_commit, Node}},
?SAFE({?MODULE, Node} ! {Tid, Abort}),
rec_all(Tail, Tid, Abort, Pids)
after 15000 ->
mnesia_lib:verbose("~p: trans ~p waiting ~p~n", [self(), Tid, Node]),
rec_all([Node | Tail], Tid, Res, Pids)
end;
rec_all([], _Tid, Res, Pids) ->
erase({?MODULE, ?FUNCTION_NAME}),
{Res, Pids}.

get_transactions() ->
Expand Down

0 comments on commit d17819b

Please sign in to comment.