Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MON-48766-community-pr-fix-file-descriptors-leak-in-gorgone-that-occur-when-pollers-are-disconnected #1512

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/docker/Dockerfile.gorgone-testing-alma8
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ FROM almalinux:8

RUN bash -e <<EOF

dnf install -y dnf-plugins-core zstd curl mariadb iproute
dnf install -y dnf-plugins-core zstd curl mariadb iproute procps lsof
dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm
dnf config-manager --set-enabled 'powertools'
dnf -y config-manager --add-repo https://packages.centreon.com/rpm-standard/23.10/el8/centreon-23.10.repo
dnf -y clean all --enablerepo=*
dnf install -y python3.11 python3.11-pip
pip3.11 install robotframework robotframework-examples robotframework-databaselibrary pymysql robotframework-requests
pip3.11 install robotframework robotframework-examples robotframework-databaselibrary pymysql robotframework-requests robotframework-jsonlibrary

dnf clean all

Expand Down
4 changes: 2 additions & 2 deletions .github/docker/Dockerfile.gorgone-testing-bullseye
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ ENV LANG en_US.utf8
RUN bash -e <<EOF
apt-get update
# Install Robotframework
apt-get install -y python3 python3-dev python3-pip ca-certificates apt-transport-https software-properties-common wget gnupg2
pip3 install robotframework robotframework-examples robotframework-databaselibrary pymysql robotframework-requests
apt-get install -y python3 python3-dev python3-pip ca-certificates apt-transport-https software-properties-common wget gnupg2 procps lsof
pip3 install robotframework robotframework-examples robotframework-databaselibrary pymysql robotframework-requests robotframework-jsonlibrary

echo "deb https://packages.centreon.com/apt-standard-23.10-stable/ $(lsb_release -sc) main" | tee /etc/apt/sources.list.d/centreon.list
echo "deb https://packages.centreon.com/apt-plugins-stable/ $(lsb_release -sc) main" | tee /etc/apt/sources.list.d/centreon-plugins.list
Expand Down
72 changes: 43 additions & 29 deletions gorgone/gorgone/class/clientzmq.pm
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,20 @@ sub init {
$callbacks->{ $self->{identity} } = $options{callback} if (defined($options{callback}));
}

sub cleanup {
my ($self, %options) = @_;

delete $callbacks->{ $self->{identity} };
delete $connectors->{ $self->{identity} };
delete $sockets->{ $self->{identity} };
}

sub close {
my ($self, %options) = @_;

$sockets->{ $self->{identity} }->close() if (defined($sockets->{ $self->{identity} }));
$self->{core_watcher}->stop() if (defined($self->{core_watcher}));
delete $self->{core_watcher};
$sockets->{ $self->{identity} }->close();
}

sub get_connect_identity {
Expand All @@ -128,14 +137,19 @@ sub get_server_pubkey {
$sockets->{ $self->{identity} }->send('[GETPUBKEY]', ZMQ_DONTWAIT);
$self->event(identity => $self->{identity});

my $w1 = $self->{connect_loop}->timer(
my $w1 = $self->{connect_loop}->io(
$sockets->{ $self->{identity} }->get_fd(),
EV::READ,
sub {
$self->event(identity => $self->{identity});
}
);
my $w2 = $self->{connect_loop}->timer(
10,
0,
sub {
$self->{connect_loop}->break();
}
sub {}
);
$self->{connect_loop}->run();
$self->{connect_loop}->run(EV::RUN_ONCE);
}

sub read_key_protocol {
Expand Down Expand Up @@ -278,9 +292,9 @@ sub ping {
time() - $self->{ping_timeout_time} > $self->{ping_timeout}) {
$self->{logger}->writeLogError("[clientzmq] No ping response") if (defined($self->{logger}));
$self->{ping_progress} = 0;
$sockets->{ $self->{identity} }->close();
delete $self->{core_watcher};

$self->close();
# new identity for a new handshake (for module pull)
$self->{extra_identity} = gorgone::standard::library::generate_token(length => 12);
$self->init();
$status = 1;
}
Expand All @@ -305,25 +319,23 @@ sub event {

$connectors->{ $options{identity} }->{ping_time} = time();
while ($sockets->{ $options{identity} }->has_pollin()) {
my ($rv, $message) = gorgone::standard::library::zmq_dealer_read_message(socket => $sockets->{ $options{identity} });
next if ($connectors->{ $options{identity} }->{handshake} == -1);
next if ($rv);

# We have a response. So it's ok :)
if ($connectors->{ $options{identity} }->{ping_progress} == 1) {
$connectors->{ $options{identity} }->{ping_progress} = 0;
}

my ($rv, $message) = gorgone::standard::library::zmq_dealer_read_message(socket => $sockets->{ $options{identity} });
last if ($rv);

# in progress
if ($connectors->{ $options{identity} }->{handshake} == 0) {
$self->{connect_loop}->break();
$connectors->{ $options{identity} }->{handshake} = 1;
if ($connectors->{ $options{identity} }->check_server_pubkey(message => $message) == 0) {
$connectors->{ $options{identity} }->{handshake} = -1;

}
} elsif ($connectors->{ $options{identity} }->{handshake} == 1) {
$self->{connect_loop}->break();

$self->{logger}->writeLogDebug("[clientzmq] $self->{identity} - client_get_secret recv [3]");
my ($status, $verbose, $symkey, $hostname) = $connectors->{ $options{identity} }->client_get_secret(
message => $message
Expand All @@ -332,7 +344,7 @@ sub event {
$self->{logger}->writeLogDebug("[clientzmq] $self->{identity} - client_get_secret $verbose [3]");
$connectors->{ $options{identity} }->{handshake} = -1;
$connectors->{ $options{identity} }->{verbose_last_message} = $verbose;
return ;
next;
}
$connectors->{ $options{identity} }->{handshake} = 2;
if (defined($connectors->{ $options{identity} }->{logger})) {
Expand All @@ -348,7 +360,7 @@ sub event {
if ($rv == -1 || $data !~ /^\[([a-zA-Z0-9:\-_]+?)\]\s+/) {
$connectors->{ $options{identity} }->{handshake} = -1;
$connectors->{ $options{identity} }->{verbose_last_message} = 'decrypt issue: ' . $data;
return ;
next;
}

if ($1 eq 'KEY') {
Expand Down Expand Up @@ -389,14 +401,7 @@ sub send_message {
my ($self, %options) = @_;

if ($self->{handshake} == 0) {
$self->{connect_loop} = new EV::Loop();
$self->{connect_watcher} = $self->{connect_loop}->io(
$sockets->{ $self->{identity} }->get_fd(),
EV::READ,
sub {
$self->event(identity => $self->{identity});
}
);
$self->{connect_loop} = EV::Loop->new();

if (!defined($self->{server_pubkey})) {
$self->{logger}->writeLogDebug("[clientzmq] $self->{identity} - get_server_pubkey sent [1]");
Expand Down Expand Up @@ -424,15 +429,24 @@ sub send_message {
$sockets->{ $self->{identity} }->send($ciphertext, ZMQ_DONTWAIT);
$self->event(identity => $self->{identity});

my $w1 = $self->{connect_loop}->timer(
my $w1 = $self->{connect_loop}->io(
$sockets->{ $self->{identity} }->get_fd(),
EV::READ,
sub {
$self->event(identity => $self->{identity});
}
);
my $w2 = $self->{connect_loop}->timer(
10,
0,
sub { $self->{connect_loop}->break(); }
sub {}
);
$self->{connect_loop}->run();
$self->{connect_loop}->run(EV::RUN_ONCE);
}

undef $self->{connect_loop} if (defined($self->{connect_loop}));
if (defined($self->{connect_loop})) {
delete $self->{connect_loop};
}

if ($self->{handshake} < 2) {
$self->{handshake} = 0;
Expand Down
8 changes: 7 additions & 1 deletion gorgone/gorgone/modules/core/proxy/class.pm
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ sub action_proxyaddnode {
});

$self->{clients}->{ $data->{id} }->{class}->close();
$self->{clients}->{ $data->{id} }->{class}->cleanup();
} else {
$self->{internal_channels}->{ $data->{id} } = gorgone::standard::library::connect_com(
context => $self->{zmq_context},
Expand Down Expand Up @@ -282,6 +283,7 @@ sub action_proxycloseconnection {
$self->{logger}->writeLogInfo("[proxy] Close connectionn for $data->{id}");

$self->{clients}->{ $data->{id} }->{class}->close();
$self->{clients}->{ $data->{id} }->{class}->cleanup();
$self->{clients}->{ $data->{id} }->{delete} = 0;
$self->{clients}->{ $data->{id} }->{class} = undef;
}
Expand All @@ -293,6 +295,7 @@ sub close_connections {
if (defined($self->{clients}->{$_}->{class}) && $self->{clients}->{$_}->{type} eq 'push_zmq') {
$self->{logger}->writeLogInfo("[proxy] Close connection for $_");
$self->{clients}->{$_}->{class}->close();
$self->{clients}->{$_}->{class}->cleanup();
}
}
}
Expand Down Expand Up @@ -497,7 +500,10 @@ sub periodic_exec {
token => $connector->generate_token(),
target => ''
});
$connector->{clients}->{$_}->{class}->close() if (defined($connector->{clients}->{$_}->{class}));
if (defined($connector->{clients}->{$_}->{class})) {
$connector->{clients}->{$_}->{class}->close();
$connector->{clients}->{$_}->{class}->cleanup();
}
$connector->{clients}->{$_}->{class} = undef;
$connector->{clients}->{$_}->{delete} = 0;
$connector->{clients}->{$_}->{com_read_internal} = 0;
Expand Down
2 changes: 2 additions & 0 deletions gorgone/gorgone/modules/core/proxy/sshclient.pm
Original file line number Diff line number Diff line change
Expand Up @@ -552,4 +552,6 @@ sub close {
$self->disconnect();
}

sub cleanup {}

1;
6 changes: 3 additions & 3 deletions gorgone/tests/robot/resources/resources.resource
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ Setup Two Gorgone Instances
# gorgone can generate it's own key, but as we need the thumbprint in the configuration we need to generate them before launching gorgone.
# this script only create key if the files don't exists, and silently finish if the files already exists.
IF '${communication_mode}' == 'push_zmq'
Setup Gorgone Config ${push_central_config} ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}push_db_1_poller.sql
Setup Gorgone Config ${push_central_config} ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}db_add_1_poller.sql
Setup Gorgone Config ${push_poller_config} gorgone_name=${poller_name}

Start Gorgone debug ${central_name}
Expand All @@ -131,7 +131,7 @@ Setup Two Gorgone Instances
Check Poller Is Connected port=5556 expected_nb=2
Check Poller Communicate 2
ELSE IF '${communication_mode}' == 'pullwss'
Setup Gorgone Config ${pullwss_central_config} ${ROOT_CONFIG}pullwss_node_register_one_node.yaml ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}push_db_1_poller.sql
Setup Gorgone Config ${pullwss_central_config} ${ROOT_CONFIG}pullwss_node_register_one_node.yaml ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}db_add_1_poller.sql
Setup Gorgone Config ${gorgone_core_config} ${pullwss_poller_config} gorgone_name=${poller_name}

Start Gorgone debug ${central_name}
Expand All @@ -140,7 +140,7 @@ Setup Two Gorgone Instances
Check Poller Is Connected port=8086 expected_nb=2
Check Poller Communicate 2
ELSE IF '${communication_mode}' == 'pull'
Setup Gorgone Config ${pull_central_config} ${ROOT_CONFIG}pull_node_register_one_node.yaml ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}push_db_1_poller.sql
Setup Gorgone Config ${pull_central_config} ${ROOT_CONFIG}pull_node_register_one_node.yaml ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}db_add_1_poller.sql
Setup Gorgone Config ${gorgone_core_config} ${pull_poller_config} gorgone_name=${poller_name}

Start Gorgone debug ${central_name}
Expand Down
2 changes: 1 addition & 1 deletion gorgone/tests/robot/tests/core/pull.robot
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Test Timeout 300s

*** Test Cases ***
connect 1 poller to a central with pull configuration
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}push_db_1_poller_delete.sql
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}db_delete_poller.sql

Log To Console \nStarting the Gorgone setup with pull configuration
Setup Two Gorgone Instances communication_mode=pull central_name=pull_gorgone_central poller_name=pull_gorgone_poller_2
Expand Down
2 changes: 1 addition & 1 deletion gorgone/tests/robot/tests/core/pullwss.robot
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Test Timeout 220s

*** Test Cases ***
check one poller can connect to a central gorgone
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}push_db_1_poller_delete.sql
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}db_delete_poller.sql

Log To Console \nStarting the gorgone setup
Setup Two Gorgone Instances communication_mode=pullwss central_name=pullwss_gorgone_central poller_name=pullwss_gorgone_poller_2
Expand Down
2 changes: 1 addition & 1 deletion gorgone/tests/robot/tests/core/push.robot
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Test Timeout 220s

*** Test Cases ***
connect 1 poller to a central
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}push_db_1_poller_delete.sql
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}db_delete_poller.sql

Log To Console \nStarting the gorgone setup
Setup Two Gorgone Instances communication_mode=push_zmq central_name=push_zmq_gorgone_central poller_name=push_zmq_gorgone_poller_2
Expand Down
33 changes: 33 additions & 0 deletions gorgone/tests/robot/tests/long_tests/fd-leak.robot
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
*** Settings ***
Documentation check Gorgone don't leak file descriptor when a poller become unavailable

Resource ${CURDIR}${/}..${/}..${/}resources${/}import.resource
Test Timeout 1200s

*** Test Cases ***
check gorgone proxy do not leak file descriptor with a poller
[Tags] long_tests
[Teardown] Stop Gorgone And Remove Gorgone Config push_zmq_gorgone_central sql_file=${ROOT_CONFIG}db_delete_poller.sql
${cmd_count_file_descriptor}= Set Variable count=0; for pid in \$(ps aux | grep gorgone-proxy | grep -v grep | awk '{ print \$2 }') ; do num=\$(lsof | grep \$pid | wc -l); count=\$((count + \$num)) ; done ; echo \$count

Log To Console \nStarting the gorgone setup
Setup Two Gorgone Instances communication_mode=push_zmq central_name=push_zmq_gorgone_central poller_name=push_zmq_gorgone_poller_2
# We wait for gorgone to be ready, and grab all file descriptor it need.
Sleep 10
${before_kill_fd_nb} Run ${cmd_count_file_descriptor}
Stop Gorgone And Remove Gorgone Config push_zmq_gorgone_poller_2
Sleep 10
# check what is the normal number of file descriptor for gorgone to take
${initial_fd_nb} Run ${cmd_count_file_descriptor}
Log To Console \n number of file descriptor on before killing poller : ${before_kill_fd_nb} and after : ${initial_fd_nb} \n
${max}= Evaluate ${initial_fd_nb} + 15
Log To Console max is ${max}
Sleep 20
FOR ${i} IN RANGE 60
${current_fd_nb} Run ${cmd_count_file_descriptor}
IF ${i} % 10 == 0
Log To Console exec ${i} \t got ${current_fd_nb}
END
Should Be True ${max} > ${current_fd_nb} gorgone is using more and more file descriptor after a poller disconnect, starting at ${initial_fd_nb} and after ${i} iteration (5 sec each) to ${current_fd_nb}
Sleep 5
END