Skip to content

Commit

Permalink
fix(gorgone): correctly handle gorgone pullwss module shutdown (#1530)
Browse files Browse the repository at this point in the history
The ZMQ socket object needs to be closed and undefined at the end of the module's life, otherwise a C stack trace appears in the logs.
The SIGTERM handler was not being called by Perl; Mojo::IOLoop::Signal allows the signal handler to be called correctly when quitting.

Refs:MON-34540
  • Loading branch information
kduret authored Jul 17, 2024
1 parent 4f70b28 commit 2516e38
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 32 deletions.
61 changes: 37 additions & 24 deletions gorgone/gorgone/modules/core/pullwss/class.pm
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ use gorgone::standard::library;
use gorgone::standard::constants qw(:all);
use gorgone::standard::misc;
use Mojo::UserAgent;
use Mojo::IOLoop::Signal;
use IO::Socket::SSL;
use IO::Handle;
use JSON::XS;
Expand All @@ -44,18 +45,19 @@ sub new {

$connector->{ping_timer} = -1;
$connector->{connected} = 0;
$connector->{stop} = 0;

$connector->set_signal_handlers();
return $connector;
}

sub set_signal_handlers {
my $self = shift;
# see https://metacpan.org/pod/EV#PERL-SIGNALS
# EV and Mojo::IOLoop don't seem to work in this module for setting a signal handler.
Mojo::IOLoop::Signal->on(TERM => sub { $self->handle_TERM() });
Mojo::IOLoop::Signal->on(HUP => sub { $self->handle_HUP() });

$SIG{TERM} = \&class_handle_TERM;
$handlers{TERM}->{$self} = sub { $self->handle_TERM() };
$SIG{HUP} = \&class_handle_HUP;
$handlers{HUP}->{$self} = sub { $self->handle_HUP() };
}

sub handle_HUP {
Expand Down Expand Up @@ -83,23 +85,32 @@ sub handle_TERM {
);

if ($self->{connected} == 1) {
$self->{tx}->send({text => $message });
$self->{tx}->on(drain => sub { Mojo::IOLoop->stop_gracefully(); });
} else {
Mojo::IOLoop->stop_gracefully();
# If the websocket is still connected, we send a message to the other end so it knows we are shutting down,
# and we tell Mojo to stop once it has no other messages to process.
$self->{logger}->writeLogDebug("[pullwss] sending UNREGISTERNODES message to central before quiting as we are still connected to them.");
$self->{tx}->send( {text => $message });

$self->{tx}->on(drain => sub {
$self->{logger}->writeLogDebug("[pullwss] starting the stop_gracefully mojo sub");
Mojo::IOLoop->stop_gracefully()
});
}
}

sub class_handle_TERM {
foreach (keys %{$handlers{TERM}}) {
&{$handlers{TERM}->{$_}}();
else {
# If the websocket is not connected, we simply remove the zmq socket and shut down.
# We need to shut down the zmq socket ourselves, otherwise a C++ stack trace error appears in the log.
disconnect_zmq_socket_and_exit();
}
}

sub class_handle_HUP {
foreach (keys %{$handlers{HUP}}) {
&{$handlers{HUP}->{$_}}();
}
# Release the module's internal ZMQ socket and terminate the process with exit status 0.
#
# Takes no arguments: it works on the $connector variable defined outside this sub
# and reads its {logger} and {internal_socket} entries. This sub never returns.
sub disconnect_zmq_socket_and_exit {
    my $log = $connector->{logger};

    $log->writeLogDebug('[pullwss] removing zmq socket : ' . $connector->{internal_socket});

    # According to the original author's tests, the socket must be both closed AND
    # undefined, in that order. Doing only one of the two can produce the following
    # error after shutdown:
    #   Bad file descriptor (src/epoll.cpp:73)
    # No lexical copy of the socket is taken here, so the undef below really drops
    # the reference held in $connector.
    $connector->{internal_socket}->close();
    undef $connector->{internal_socket};

    $log->writeLogInfo('[pullwss] exit now.');
    exit 0;
}

sub send_message {
Expand Down Expand Up @@ -135,7 +146,7 @@ sub ping {
sub wss_connect {
my ($self, %options) = @_;

return if ($connector->{connected} == 1);
return if ($self->{stop} == 1 or $connector->{connected} == 1);

$self->{ua} = Mojo::UserAgent->new();
$self->{ua}->transactor->name('gorgone mojo');
Expand Down Expand Up @@ -219,14 +230,16 @@ sub run {
Mojo::IOLoop->singleton->reactor->watch($socket, 1, 0);

Mojo::IOLoop->singleton->recurring(60 => sub {
$connector->{logger}->writeLogDebug('[pullwss] recurring timeout loop');
$connector->wss_connect();
$connector->ping();
if (!$connector->{stop}){
$connector->{logger}->writeLogDebug('[pullwss] recurring timeout loop');
$connector->wss_connect();
$connector->ping();
}
});

Mojo::IOLoop->start() unless (Mojo::IOLoop->is_running);

exit(0);
disconnect_zmq_socket_and_exit();

}

sub transmit_back {
Expand Down Expand Up @@ -268,7 +281,7 @@ sub transmit_back {
sub read_zmq_events {
my ($self, %options) = @_;

while ($self->{internal_socket}->has_pollin()) {
while (!$self->{stop} and $self->{internal_socket}->has_pollin()) {
my ($message) = $connector->read_message();
$message = transmit_back(message => $message);
next if (!defined($message));
Expand Down
2 changes: 1 addition & 1 deletion gorgone/gorgone/standard/library.pm
Original file line number Diff line number Diff line change
Expand Up @@ -705,7 +705,7 @@ sub connect_com {
if ($options{type} eq 'tcp') {
$socket->set(ZMQ_TCP_KEEPALIVE, 'int', defined($options{zmq_tcp_keepalive}) ? $options{zmq_tcp_keepalive} : -1);
}

$options{logger}->writeLogInfo("connection to zmq socket : " . $options{type} . '://' . $options{path});
$socket->connect($options{type} . '://' . $options{path});
return $socket;
}
Expand Down
2 changes: 2 additions & 0 deletions gorgone/packaging/centreon-gorgone.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ overrides:
- perl-Libssh-Session >= 0.8
- perl-CryptX
- perl-Mojolicious
- perl(Mojo::IOLoop::Signal)
- perl(Archive::Tar)
- perl(Schedule::Cron)
- perl(ZMQ::FFI)
Expand Down Expand Up @@ -211,6 +212,7 @@ overrides:
- libhash-merge-perl
- libcryptx-perl
- libmojolicious-perl
- libmojo-ioloop-signal-perl
- libauthen-simple-perl
- libauthen-simple-net-perl
- libnet-curl-perl
Expand Down
18 changes: 13 additions & 5 deletions gorgone/tests/robot/resources/resources.resource
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Library Process
Library RequestsLibrary
Library OperatingSystem

Library DatabaseLibrary
*** Variables ***
${gorgone_binary} /usr/bin/gorgoned
${ROOT_CONFIG} ${CURDIR}${/}..${/}config${/}
Expand Down Expand Up @@ -44,7 +45,7 @@ Stop Gorgone And Remove Gorgone Config

FOR ${process} IN @{process_alias}
${result} Terminate Process ${process}
BuiltIn.Run Keyword And Continue On Failure Should Be True ${result.rc} == -15 or ${result.rc} == -9 or ${result.rc} == 0 Engine badly stopped alias = ${process} - code returned ${result.rc}.
BuiltIn.Run Keyword And Continue On Failure Should Be True ${result.rc} == -15 or ${result.rc} == 0 Gorgone ${process} badly stopped, code returned is ${result.rc}.
END

Gorgone Execute Sql
Expand Down Expand Up @@ -90,7 +91,6 @@ Check Poller Is Connected
BREAK
END
END
Log To Console TCP connection establishing after ${i} attempt
Should Be True ${i} < 39 Gorgone did not establish tcp connection in 160 seconds.
Log To Console TCP connection established after ${i} attempt (4 seconds each)

Expand All @@ -111,7 +111,7 @@ Check Poller Communicate
END
END
Log To Console json response : ${response.json()}
Should Be True ${i} < 20 timeout after ${i} time waiting for poller status in gorgone rest api (/api/internal/constatus) : ${response.json()}
Should Be True ${i} < 19 timeout after ${i} time waiting for poller status in gorgone rest api (/api/internal/constatus) : ${response.json()}
Should Be True 0 == ${response.json()}[data][${poller_id}][ping_failed] there was failed ping between the central and the poller ${poller_id}
Should Be True 0 < ${response.json()}[data][${poller_id}][ping_ok] there was no successful ping between the central and the poller ${poller_id}

Expand Down Expand Up @@ -159,5 +159,13 @@ Wait Until Port Is Bind
BREAK
END
END
Should Be True ${i} < 10 Gorgone did not listen on port ${port} on time.
Log To Console tcp port ${port} bind after ${i} attempt (0.5 seconds each)
Should Be True ${i} < 9 Gorgone did not listen on port ${port} on time.
Log To Console tcp port ${port} bind after ${i} attempt (0.5 seconds each)

Ctn Check No Error In Logs
    [Documentation]    Fail if the gorgoned.log file of ${gorgone_id} contains any line that does not start with a "YYYY-MM-DD HH:MM:SS " timestamp.
    [Arguments]    ${gorgone_id}
    # grep -v prints only the lines that do NOT match the timestamp prefix, so any output is an unexpected log line.
    ${cmd}=    Set Variable    grep -vP '^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2} ' /var/log/centreon-gorgone/${gorgone_id}/gorgoned.log
    Log To Console    \n\n${cmd}\n\n

    ${log_line_wrong}    RUN    ${cmd}
    Should Be Empty    ${log_line_wrong}    There are log lines in ${gorgone_id} not matching the standard gorgone format : ${log_line_wrong}
14 changes: 12 additions & 2 deletions gorgone/tests/robot/tests/core/pullwss.robot
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,19 @@ Test Timeout 220s
@{process_list} pullwss_gorgone_poller_2 pullwss_gorgone_central

*** Test Cases ***
check one poller can connect to a central gorgone
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}db_delete_poller.sql
check one poller can connect to a central and gorgone central stop first
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list}
@{process_list} Set Variable pullwss_gorgone_central pullwss_gorgone_poller_2
Log To Console \nStarting the gorgone setup
Setup Two Gorgone Instances communication_mode=pullwss central_name=pullwss_gorgone_central poller_name=pullwss_gorgone_poller_2
Ctn Check No Error In Logs pullwss_gorgone_poller_2
Log To Console End of tests.

check one poller can connect to a central and gorgone poller stop first
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list}
@{process_list} Set Variable pullwss_gorgone_poller_2 pullwss_gorgone_central
Log To Console \nStarting the gorgone setup

Setup Two Gorgone Instances communication_mode=pullwss central_name=pullwss_gorgone_central poller_name=pullwss_gorgone_poller_2
Ctn Check No Error In Logs pullwss_gorgone_poller_2
Log To Console End of tests.

0 comments on commit 2516e38

Please sign in to comment.