Skip to content

Commit

Permalink
MON-48766-community-pr-fix-file-descriptors-leak-in-gorgone-that-occu…
Browse files Browse the repository at this point in the history
…r-when-pollers-are-disconnected (#1512)

fix(gorgone): fd leak

TODO : we need to change the gha to launch this test only on develop, and if possible execute it nighty

Co-authored-by: garnier-quentin <[email protected]>
Co-authored-by: qgarnier <[email protected]>
  • Loading branch information
3 people authored Jul 16, 2024
1 parent 299de12 commit 6258e6d
Show file tree
Hide file tree
Showing 12 changed files with 95 additions and 40 deletions.
4 changes: 2 additions & 2 deletions .github/docker/Dockerfile.gorgone-testing-alma8
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ FROM almalinux:8

RUN bash -e <<EOF

dnf install -y dnf-plugins-core zstd curl mariadb iproute
dnf install -y dnf-plugins-core zstd curl mariadb iproute procps lsof
dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm
dnf config-manager --set-enabled 'powertools'
dnf -y config-manager --add-repo https://packages.centreon.com/rpm-standard/23.10/el8/centreon-23.10.repo
dnf -y clean all --enablerepo=*
dnf install -y python3.11 python3.11-pip
pip3.11 install robotframework robotframework-examples robotframework-databaselibrary pymysql robotframework-requests
pip3.11 install robotframework robotframework-examples robotframework-databaselibrary pymysql robotframework-requests robotframework-jsonlibrary

dnf clean all

Expand Down
4 changes: 2 additions & 2 deletions .github/docker/Dockerfile.gorgone-testing-bullseye
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ ENV LANG en_US.utf8
RUN bash -e <<EOF
apt-get update
# Install Robotframework
apt-get install -y python3 python3-dev python3-pip ca-certificates apt-transport-https software-properties-common wget gnupg2
pip3 install robotframework robotframework-examples robotframework-databaselibrary pymysql robotframework-requests
apt-get install -y python3 python3-dev python3-pip ca-certificates apt-transport-https software-properties-common wget gnupg2 procps lsof
pip3 install robotframework robotframework-examples robotframework-databaselibrary pymysql robotframework-requests robotframework-jsonlibrary

echo "deb https://packages.centreon.com/apt-standard-23.10-stable/ $(lsb_release -sc) main" | tee /etc/apt/sources.list.d/centreon.list
echo "deb https://packages.centreon.com/apt-plugins-stable/ $(lsb_release -sc) main" | tee /etc/apt/sources.list.d/centreon-plugins.list
Expand Down
72 changes: 43 additions & 29 deletions gorgone/gorgone/class/clientzmq.pm
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,20 @@ sub init {
$callbacks->{ $self->{identity} } = $options{callback} if (defined($options{callback}));
}

sub cleanup {
my ($self, %options) = @_;

delete $callbacks->{ $self->{identity} };
delete $connectors->{ $self->{identity} };
delete $sockets->{ $self->{identity} };
}

sub close {
my ($self, %options) = @_;

$sockets->{ $self->{identity} }->close() if (defined($sockets->{ $self->{identity} }));
$self->{core_watcher}->stop() if (defined($self->{core_watcher}));
delete $self->{core_watcher};
$sockets->{ $self->{identity} }->close();
}

sub get_connect_identity {
Expand All @@ -128,14 +137,19 @@ sub get_server_pubkey {
$sockets->{ $self->{identity} }->send('[GETPUBKEY]', ZMQ_DONTWAIT);
$self->event(identity => $self->{identity});

my $w1 = $self->{connect_loop}->timer(
my $w1 = $self->{connect_loop}->io(
$sockets->{ $self->{identity} }->get_fd(),
EV::READ,
sub {
$self->event(identity => $self->{identity});
}
);
my $w2 = $self->{connect_loop}->timer(
10,
0,
sub {
$self->{connect_loop}->break();
}
sub {}
);
$self->{connect_loop}->run();
$self->{connect_loop}->run(EV::RUN_ONCE);
}

sub read_key_protocol {
Expand Down Expand Up @@ -278,9 +292,9 @@ sub ping {
time() - $self->{ping_timeout_time} > $self->{ping_timeout}) {
$self->{logger}->writeLogError("[clientzmq] No ping response") if (defined($self->{logger}));
$self->{ping_progress} = 0;
$sockets->{ $self->{identity} }->close();
delete $self->{core_watcher};

$self->close();
# new identity for a new handshake (for module pull)
$self->{extra_identity} = gorgone::standard::library::generate_token(length => 12);
$self->init();
$status = 1;
}
Expand All @@ -305,25 +319,23 @@ sub event {

$connectors->{ $options{identity} }->{ping_time} = time();
while ($sockets->{ $options{identity} }->has_pollin()) {
my ($rv, $message) = gorgone::standard::library::zmq_dealer_read_message(socket => $sockets->{ $options{identity} });
next if ($connectors->{ $options{identity} }->{handshake} == -1);
next if ($rv);

# We have a response. So it's ok :)
if ($connectors->{ $options{identity} }->{ping_progress} == 1) {
$connectors->{ $options{identity} }->{ping_progress} = 0;
}

my ($rv, $message) = gorgone::standard::library::zmq_dealer_read_message(socket => $sockets->{ $options{identity} });
last if ($rv);

# in progress
if ($connectors->{ $options{identity} }->{handshake} == 0) {
$self->{connect_loop}->break();
$connectors->{ $options{identity} }->{handshake} = 1;
if ($connectors->{ $options{identity} }->check_server_pubkey(message => $message) == 0) {
$connectors->{ $options{identity} }->{handshake} = -1;

}
} elsif ($connectors->{ $options{identity} }->{handshake} == 1) {
$self->{connect_loop}->break();

$self->{logger}->writeLogDebug("[clientzmq] $self->{identity} - client_get_secret recv [3]");
my ($status, $verbose, $symkey, $hostname) = $connectors->{ $options{identity} }->client_get_secret(
message => $message
Expand All @@ -332,7 +344,7 @@ sub event {
$self->{logger}->writeLogDebug("[clientzmq] $self->{identity} - client_get_secret $verbose [3]");
$connectors->{ $options{identity} }->{handshake} = -1;
$connectors->{ $options{identity} }->{verbose_last_message} = $verbose;
return ;
next;
}
$connectors->{ $options{identity} }->{handshake} = 2;
if (defined($connectors->{ $options{identity} }->{logger})) {
Expand All @@ -348,7 +360,7 @@ sub event {
if ($rv == -1 || $data !~ /^\[([a-zA-Z0-9:\-_]+?)\]\s+/) {
$connectors->{ $options{identity} }->{handshake} = -1;
$connectors->{ $options{identity} }->{verbose_last_message} = 'decrypt issue: ' . $data;
return ;
next;
}

if ($1 eq 'KEY') {
Expand Down Expand Up @@ -389,14 +401,7 @@ sub send_message {
my ($self, %options) = @_;

if ($self->{handshake} == 0) {
$self->{connect_loop} = new EV::Loop();
$self->{connect_watcher} = $self->{connect_loop}->io(
$sockets->{ $self->{identity} }->get_fd(),
EV::READ,
sub {
$self->event(identity => $self->{identity});
}
);
$self->{connect_loop} = EV::Loop->new();

if (!defined($self->{server_pubkey})) {
$self->{logger}->writeLogDebug("[clientzmq] $self->{identity} - get_server_pubkey sent [1]");
Expand Down Expand Up @@ -424,15 +429,24 @@ sub send_message {
$sockets->{ $self->{identity} }->send($ciphertext, ZMQ_DONTWAIT);
$self->event(identity => $self->{identity});

my $w1 = $self->{connect_loop}->timer(
my $w1 = $self->{connect_loop}->io(
$sockets->{ $self->{identity} }->get_fd(),
EV::READ,
sub {
$self->event(identity => $self->{identity});
}
);
my $w2 = $self->{connect_loop}->timer(
10,
0,
sub { $self->{connect_loop}->break(); }
sub {}
);
$self->{connect_loop}->run();
$self->{connect_loop}->run(EV::RUN_ONCE);
}

undef $self->{connect_loop} if (defined($self->{connect_loop}));
if (defined($self->{connect_loop})) {
delete $self->{connect_loop};
}

if ($self->{handshake} < 2) {
$self->{handshake} = 0;
Expand Down
8 changes: 7 additions & 1 deletion gorgone/gorgone/modules/core/proxy/class.pm
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ sub action_proxyaddnode {
});

$self->{clients}->{ $data->{id} }->{class}->close();
$self->{clients}->{ $data->{id} }->{class}->cleanup();
} else {
$self->{internal_channels}->{ $data->{id} } = gorgone::standard::library::connect_com(
context => $self->{zmq_context},
Expand Down Expand Up @@ -282,6 +283,7 @@ sub action_proxycloseconnection {
$self->{logger}->writeLogInfo("[proxy] Close connectionn for $data->{id}");

$self->{clients}->{ $data->{id} }->{class}->close();
$self->{clients}->{ $data->{id} }->{class}->cleanup();
$self->{clients}->{ $data->{id} }->{delete} = 0;
$self->{clients}->{ $data->{id} }->{class} = undef;
}
Expand All @@ -293,6 +295,7 @@ sub close_connections {
if (defined($self->{clients}->{$_}->{class}) && $self->{clients}->{$_}->{type} eq 'push_zmq') {
$self->{logger}->writeLogInfo("[proxy] Close connection for $_");
$self->{clients}->{$_}->{class}->close();
$self->{clients}->{$_}->{class}->cleanup();
}
}
}
Expand Down Expand Up @@ -497,7 +500,10 @@ sub periodic_exec {
token => $connector->generate_token(),
target => ''
});
$connector->{clients}->{$_}->{class}->close() if (defined($connector->{clients}->{$_}->{class}));
if (defined($connector->{clients}->{$_}->{class})) {
$connector->{clients}->{$_}->{class}->close();
$connector->{clients}->{$_}->{class}->cleanup();
}
$connector->{clients}->{$_}->{class} = undef;
$connector->{clients}->{$_}->{delete} = 0;
$connector->{clients}->{$_}->{com_read_internal} = 0;
Expand Down
2 changes: 2 additions & 0 deletions gorgone/gorgone/modules/core/proxy/sshclient.pm
Original file line number Diff line number Diff line change
Expand Up @@ -552,4 +552,6 @@ sub close {
$self->disconnect();
}

sub cleanup {}

1;
6 changes: 3 additions & 3 deletions gorgone/tests/robot/resources/resources.resource
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ Setup Two Gorgone Instances
# gorgone can generate it's own key, but as we need the thumbprint in the configuration we need to generate them before launching gorgone.
# this script only create key if the files don't exists, and silently finish if the files already exists.
IF '${communication_mode}' == 'push_zmq'
Setup Gorgone Config ${push_central_config} ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}push_db_1_poller.sql
Setup Gorgone Config ${push_central_config} ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}db_add_1_poller.sql
Setup Gorgone Config ${push_poller_config} gorgone_name=${poller_name}

Start Gorgone debug ${central_name}
Expand All @@ -131,7 +131,7 @@ Setup Two Gorgone Instances
Check Poller Is Connected port=5556 expected_nb=2
Check Poller Communicate 2
ELSE IF '${communication_mode}' == 'pullwss'
Setup Gorgone Config ${pullwss_central_config} ${ROOT_CONFIG}pullwss_node_register_one_node.yaml ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}push_db_1_poller.sql
Setup Gorgone Config ${pullwss_central_config} ${ROOT_CONFIG}pullwss_node_register_one_node.yaml ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}db_add_1_poller.sql
Setup Gorgone Config ${gorgone_core_config} ${pullwss_poller_config} gorgone_name=${poller_name}

Start Gorgone debug ${central_name}
Expand All @@ -140,7 +140,7 @@ Setup Two Gorgone Instances
Check Poller Is Connected port=8086 expected_nb=2
Check Poller Communicate 2
ELSE IF '${communication_mode}' == 'pull'
Setup Gorgone Config ${pull_central_config} ${ROOT_CONFIG}pull_node_register_one_node.yaml ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}push_db_1_poller.sql
Setup Gorgone Config ${pull_central_config} ${ROOT_CONFIG}pull_node_register_one_node.yaml ${gorgone_core_config} gorgone_name=${central_name} sql_file=${ROOT_CONFIG}db_add_1_poller.sql
Setup Gorgone Config ${gorgone_core_config} ${pull_poller_config} gorgone_name=${poller_name}

Start Gorgone debug ${central_name}
Expand Down
2 changes: 1 addition & 1 deletion gorgone/tests/robot/tests/core/pull.robot
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Test Timeout 300s

*** Test Cases ***
connect 1 poller to a central with pull configuration
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}push_db_1_poller_delete.sql
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}db_delete_poller.sql

Log To Console \nStarting the Gorgone setup with pull configuration
Setup Two Gorgone Instances communication_mode=pull central_name=pull_gorgone_central poller_name=pull_gorgone_poller_2
Expand Down
2 changes: 1 addition & 1 deletion gorgone/tests/robot/tests/core/pullwss.robot
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Test Timeout 220s

*** Test Cases ***
check one poller can connect to a central gorgone
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}push_db_1_poller_delete.sql
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}db_delete_poller.sql

Log To Console \nStarting the gorgone setup
Setup Two Gorgone Instances communication_mode=pullwss central_name=pullwss_gorgone_central poller_name=pullwss_gorgone_poller_2
Expand Down
2 changes: 1 addition & 1 deletion gorgone/tests/robot/tests/core/push.robot
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Test Timeout 220s

*** Test Cases ***
connect 1 poller to a central
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}push_db_1_poller_delete.sql
[Teardown] Stop Gorgone And Remove Gorgone Config @{process_list} sql_file=${ROOT_CONFIG}db_delete_poller.sql

Log To Console \nStarting the gorgone setup
Setup Two Gorgone Instances communication_mode=push_zmq central_name=push_zmq_gorgone_central poller_name=push_zmq_gorgone_poller_2
Expand Down
33 changes: 33 additions & 0 deletions gorgone/tests/robot/tests/long_tests/fd-leak.robot
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
*** Settings ***
Documentation check Gorgone don't leak file descriptor when a poller become unavailable
Resource ${CURDIR}${/}..${/}..${/}resources${/}import.resource
Test Timeout 1200s

*** Test Cases ***
check gorgone proxy do not leak file descriptor with a poller
[Tags] long_tests
[Teardown] Stop Gorgone And Remove Gorgone Config push_zmq_gorgone_central sql_file=${ROOT_CONFIG}db_delete_poller.sql
${cmd_count_file_descriptor}= Set Variable count=0; for pid in \$(ps aux | grep gorgone-proxy | grep -v grep | awk '{ print \$2 }') ; do num=\$(lsof | grep \$pid | wc -l); count=\$((count + \$num)) ; done ; echo \$count

Log To Console \nStarting the gorgone setup
Setup Two Gorgone Instances communication_mode=push_zmq central_name=push_zmq_gorgone_central poller_name=push_zmq_gorgone_poller_2
# We wait for gorgone to be ready, and grab all file descriptor it need.
Sleep 10
${before_kill_fd_nb} Run ${cmd_count_file_descriptor}
Stop Gorgone And Remove Gorgone Config push_zmq_gorgone_poller_2
Sleep 10
# check what is the normal number of file descriptor for gorgone to take
${initial_fd_nb} Run ${cmd_count_file_descriptor}
Log To Console \n number of file descriptor on before killing poller : ${before_kill_fd_nb} and after : ${initial_fd_nb} \n
${max}= Evaluate ${initial_fd_nb} + 15
Log To Console max is ${max}
Sleep 20
FOR ${i} IN RANGE 60
${current_fd_nb} Run ${cmd_count_file_descriptor}
IF ${i} % 10 == 0
Log To Console exec ${i} \t got ${current_fd_nb}
END
Should Be True ${max} > ${current_fd_nb} gorgone is using more and more file descriptor after a poller disconnect, starting at ${initial_fd_nb} and after ${i} iteration (5 sec each) to ${current_fd_nb}
Sleep 5
END

1 comment on commit 6258e6d

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Robot Results

✅ Passed ❌ Failed ⏭️ Skipped Total Pass % ⏱️ Duration
6 1 0 7 85.71 3m11.807338s

Failed Tests

Name Message ⏱️ Duration Suite
EBNHGU4_BBDO3 hostgroup_1 not found in /tmp/lua-engine.log 70.813 s Hostgroups

Please sign in to comment.