Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mon 147936 native windows drivesize #1866

Merged
merged 8 commits into from
Nov 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/scripts/agent_installer_test.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,16 @@ function test_args_to_registry {
Write-Host "arguments: $exe_args"

$process_info= Start-Process -PassThru $exe_path $exe_args
Wait-Process -Id $process_info.Id
$process_info.WaitForExit()
if ($process_info.ExitCode -ne 0) {
Write-Host "fail to execute $exe_path with arguments $exe_args"
Write-Host "exit status = " $process_info.ExitCode
exit 1
}

#give Windows some time to flush the registry
Start-Sleep -Seconds 2

foreach ($value_name in $expected_registry_values.Keys) {
$expected_value = $($expected_registry_values[$value_name])
$real_value = (Get-ItemProperty -Path HKLM:\Software\Centreon\CentreonMonitoringAgent -Name $value_name).$value_name
Expand Down
23 changes: 22 additions & 1 deletion .github/scripts/agent_robot_test.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,28 @@ Set-ItemProperty -Path HKLM:\SOFTWARE\Centreon\CentreonMonitoringAgent -Name lo

Start-Process -FilePath build_windows\agent\Release\centagent.exe -ArgumentList "--standalone" -RedirectStandardOutput reports\encrypted_reversed_centagent_stdout.log -RedirectStandardError reports\encrypted_reversed_centagent_stderr.log

wsl cd $wsl_path `&`& .github/scripts/wsl-collect-test-robot.sh broker-engine/cma.robot $my_host_name $my_ip $pwsh_path ${current_dir}.replace('\','/')
# Last boot time of this machine, converted step by step to a unix timestamp.
# Note: despite the variable names, this is the boot date, not an elapsed duration.
$uptime = (Get-WmiObject -Class Win32_OperatingSystem).LastBootUpTime #DMTF format
$d_uptime = [Management.ManagementDateTimeConverter]::ToDateTime($uptime) #datetime format
$ts_uptime = ([DateTimeOffset]$d_uptime).ToUnixTimeSeconds() #timestamp format

# All the parameters needed by the robot tests are gathered in one hashtable
# that is passed to the WSL script as a single JSON argument.
$test_param = @{
'host'= $my_host_name
'ip'= $my_ip
'wsl_path'= $wsl_path
'pwsh_path'= $pwsh_path
'drive' = @()
'current_dir' = $current_dir.replace('\','/')
'uptime' = $ts_uptime
}

# Append one entry (Name, Used, Free) per file system drive to the 'drive' array
Get-PSDrive -PSProvider FileSystem | Select Name, Used, Free | ForEach-Object -Process {$test_param.drive += $_}

# -Compress produces single-line JSON
$json_test_param = $test_param | ConvertTo-Json -Compress

Write-Host "json_test_param" $json_test_param
# Wrap the JSON in single quotes so that it is passed as one argument
$quoted_json_test_param = "'" + $json_test_param + "'"

# The backticks escape && so that the command chaining is passed on to wsl
wsl cd $wsl_path `&`& .github/scripts/wsl-collect-test-robot.sh broker-engine/cma.robot $quoted_json_test_param

#something wrong in robot test => exit 1 => failure
if (Test-Path -Path 'reports\windows-cma-failed' -PathType Container) {
Expand Down
12 changes: 7 additions & 5 deletions .github/scripts/wsl-collect-test-robot.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ set -x
test_file=$1

export RUN_ENV=WSL
export HOST_NAME=$2
export USED_ADDRESS=$3
export PWSH_PATH=$4
export WINDOWS_PROJECT_PATH=$5
# All test parameters arrive as one JSON document in the second argument;
# each needed field is extracted with jq.
# The JSON is always expanded inside double quotes and printed with printf
# so that it reaches jq unchanged: no word splitting, no pathname expansion
# and no echo-specific backslash processing.
export JSON_TEST_PARAMS="$2"
export USED_ADDRESS="$(printf '%s' "$JSON_TEST_PARAMS" | jq -r .ip)"
export HOST_NAME="$(printf '%s' "$JSON_TEST_PARAMS" | jq -r .host)"
export PWSH_PATH="$(printf '%s' "$JSON_TEST_PARAMS" | jq -r .pwsh_path)"
export WINDOWS_PROJECT_PATH="$(printf '%s' "$JSON_TEST_PARAMS" | jq -r .current_dir)"



#in order to connect to windows we need to use the windows ip
Expand All @@ -17,7 +19,7 @@ echo "${USED_ADDRESS} ${HOST_NAME}" >> /etc/hosts
echo "##### /etc/hosts: ######"
cat /etc/hosts

echo "##### Starting tests #####"
echo "##### Starting tests ##### with params: $JSON_TEST_PARAMS"
cd tests
./init-proto.sh

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/windows-agent-robot-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ jobs:
python3
python3-pip
rrdtool
jq

- name: IP info
run: |
Expand Down
2 changes: 2 additions & 0 deletions agent/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ set( SRC_COMMON
${SRC_DIR}/bireactor.cc
${SRC_DIR}/check.cc
${SRC_DIR}/check_exec.cc
${SRC_DIR}/drive_size.cc
${SRC_DIR}/opentelemetry/proto/collector/metrics/v1/metrics_service.grpc.pb.cc
${SRC_DIR}/opentelemetry/proto/collector/metrics/v1/metrics_service.pb.cc
${SRC_DIR}/opentelemetry/proto/metrics/v1/metrics.pb.cc
Expand All @@ -126,6 +127,7 @@ set( SRC_COMMON
set( SRC_WINDOWS
${SRC_DIR}/config_win.cc
${NATIVE_SRC}/check_uptime.cc
${NATIVE_SRC}/check_drive_size.cc
)

set( SRC_LINUX
Expand Down
11 changes: 10 additions & 1 deletion agent/doc/agent-doc.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,13 @@ metrics aren't the same as linux version. We collect user, idle, kernel , interr

There are two methods. The first one uses the internal Microsoft function NtQuerySystemInformation. Microsoft warns that its signature or data format may change at any moment, but it has been quite stable for many years. One subtlety: idle time is included in kernel time, so we subtract the former from the latter. DPC time is already included in interrupt time, so we don't add it when calculating the total time.
The second one relies on performance data counters (PDH API). It gives us percentages, although their sum is not exactly 100%. That's why the default method is the first one.
The choice between the two methods is done by 'use-nt-query-system-information' boolean parameter.
The choice between the two methods is done by 'use-nt-query-system-information' boolean parameter.

### check_drive_size
We have to get the free space on server drives. With network drives, this call can block in case of network failure. Unfortunately, there is no asynchronous API to do that, so a dedicated thread (drive_size_thread) computes these statistics. In order to be OS independent and testable, drive_size_thread relies on a functor that does the job: drive_size_thread::os_fs_stats. This functor is initialized in the main function. The drive_size thread is stopped at the end of the main function.

It works as follows:
* check_drive_size posts a query in the drive_size_thread queue
* drive_size_thread calls os_fs_stats
* drive_size_thread posts the result in io_context
* io_context calls check_drive_size::_completion_handler
280 changes: 280 additions & 0 deletions agent/inc/com/centreon/agent/drive_size.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
/**
* Copyright 2024 Centreon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* For more information : [email protected]
*/

#ifndef CENTREON_AGENT_NATIVE_DRIVE_SIZE_BASE_HH
#define CENTREON_AGENT_NATIVE_DRIVE_SIZE_BASE_HH

#include <cstdint>
#include <functional>
#include <list>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include "absl/base/thread_annotations.h"
#include "absl/container/btree_set.h"
#include "absl/container/flat_hash_set.h"
#include "absl/synchronization/mutex.h"
#include "boost/asio/io_context.hpp"
#include "check.hh"
#include "re2/re2.h"

namespace com::centreon::agent {
namespace check_drive_size_detail {

/**
 * @brief bit flags describing a storage type (hr_storage_*) or a file system
 * type (hr_fs_*).
 * They are passed in the check parameters filter-storage-type and
 * filter-type. Each flag owns one bit, so several flags can be combined in a
 * 64 bit mask.
 *
 */
enum e_drive_fs_type : uint64_t {
  hr_unknown = 0,

  // storage types
  hr_storage_ram = 1ULL << 0,
  hr_storage_virtual_memory = 1ULL << 1,
  hr_storage_fixed_disk = 1ULL << 2,
  hr_storage_removable_disk = 1ULL << 3,
  hr_storage_floppy_disk = 1ULL << 4,
  hr_storage_compact_disc = 1ULL << 5,
  hr_storage_ram_disk = 1ULL << 6,
  hr_storage_flash_memory = 1ULL << 7,
  hr_storage_network_disk = 1ULL << 8,

  // file system types
  hr_fs_other = 1ULL << 9,
  hr_fs_unknown = 1ULL << 10,
  hr_fs_berkeley_ffs = 1ULL << 11,
  hr_fs_sys5_fs = 1ULL << 12,
  hr_fs_fat = 1ULL << 13,
  hr_fs_hpfs = 1ULL << 14,
  hr_fs_hfs = 1ULL << 15,
  hr_fs_mfs = 1ULL << 16,
  hr_fs_ntfs = 1ULL << 17,
  hr_fs_vnode = 1ULL << 18,
  hr_fs_journaled = 1ULL << 19,
  hr_fs_iso9660 = 1ULL << 20,
  hr_fs_rock_ridge = 1ULL << 21,
  hr_fs_nfs = 1ULL << 22,
  hr_fs_netware = 1ULL << 23,
  hr_fs_afs = 1ULL << 24,
  hr_fs_dfs = 1ULL << 25,
  hr_fs_appleshare = 1ULL << 26,
  hr_fs_rfs = 1ULL << 27,
  hr_fs_dgcfs = 1ULL << 28,
  hr_fs_bfs = 1ULL << 29,
  hr_fs_fat32 = 1ULL << 30,
  hr_fs_linux_ext2 = 1ULL << 31,
  hr_fs_linux_ext4 = 1ULL << 32,
  hr_fs_exfat = 1ULL << 33
};

/**
 * @brief user can check only some fs by using filters
 * This is the goal of this class
 * In order to improve perf, results of previous tests are saved
 * in cache sets. That's why is_allowed is not const
 * The cache sets are protected by _protect.
 *
 */
class filter {
  using string_set = absl::flat_hash_set<std::string>;

  string_set _cache_allowed_fs ABSL_GUARDED_BY(_protect);
  string_set _cache_excluded_fs ABSL_GUARDED_BY(_protect);
  string_set _cache_allowed_mountpoint ABSL_GUARDED_BY(_protect);
  string_set _cache_excluded_mountpoint ABSL_GUARDED_BY(_protect);

  // mutable so that const methods can also lock it
  mutable absl::Mutex _protect;

  // mask of e_drive_fs_type flags. It has to be 64 bits wide because
  // e_drive_fs_type is a uint64_t enum whose highest flags
  // (hr_fs_linux_ext4 = 1ULL << 32, hr_fs_exfat = 1ULL << 33) do not fit in a
  // 32 bit unsigned and would be silently truncated.
  uint64_t _fs_type_filter;

  // regular expressions used to include or exclude file systems and mount
  // points
  std::unique_ptr<re2::RE2> _filter_fs, _filter_exclude_fs;
  std::unique_ptr<re2::RE2> _filter_mountpoint, _filter_exclude_mountpoint;

 public:
  /**
   * @brief build the filter from the check arguments
   *
   * @param args json arguments of the check
   */
  filter(const rapidjson::Value& args);

  /**
   * @brief tells whether a file system has to be checked
   * Not const because the result may be stored in the cache sets.
   *
   * @param fs file system name
   * @param mount_point mount point of the file system
   * @param fs_type storage and file system type flags of the file system
   * @return true if the file system passes the filters
   */
  bool is_allowed(const std::string_view& fs,
                  const std::string_view& mount_point,
                  e_drive_fs_type fs_type);

  // NOTE(review): presumably looks fs up in the allowed cache ("yet" reads
  // as "already") - confirm in the implementation file
  bool is_fs_yet_allowed(const std::string_view& fs) const;

  // NOTE(review): presumably looks fs up in the excluded cache - confirm in
  // the implementation file
  bool is_fs_yet_excluded(const std::string_view& fs) const;
};

/**
 * @brief tuple where we store the statistics of a fs
 *
 * used and total are expressed in the same unit. Percent thresholds are
 * expressed in hundredths of percent (2500 means 25%).
 *
 */
struct fs_stat {
  fs_stat() = default;

  /**
   * @brief constructor used when fs name and mount point are the same
   *
   * @param fs_in name of the file system, moved into fs and copied into
   * mount_point
   * @param used_in used space
   * @param total_in total space
   */
  fs_stat(std::string&& fs_in, uint64_t used_in, uint64_t total_in)
      // fs is declared before mount_point, so it is already initialized when
      // mount_point copies it
      : fs(std::move(fs_in)),
        mount_point(fs),
        used(used_in),
        total(total_in) {}

  /**
   * @brief constructor that steals both strings
   *
   * @param fs_in name of the file system (moved)
   * @param mount_point_in mount point (moved)
   * @param used_in used space
   * @param total_in total space
   */
  fs_stat(std::string&& fs_in,
          std::string&& mount_point_in,
          uint64_t used_in,
          uint64_t total_in)
      : fs(std::move(fs_in)),
        mount_point(std::move(mount_point_in)),
        used(used_in),
        total(total_in) {}

  /**
   * @brief constructor that copies both string views
   */
  fs_stat(const std::string_view& fs_in,
          const std::string_view& mount_point_in,
          uint64_t used_in,
          uint64_t total_in)
      : fs(fs_in),
        mount_point(mount_point_in),
        used(used_in),
        total(total_in) {}

  std::string fs;
  std::string mount_point;
  // zero initialized so that a default constructed object has defined values
  uint64_t used = 0;
  uint64_t total = 0;

  /**
   * @brief free space, clamped to 0 when used is greater than total in order
   * to avoid an unsigned wrap around
   */
  uint64_t get_free() const { return total > used ? total - used : 0; }

  /**
   * @return true if used >= threshold
   */
  bool is_used_more_than_threshold(uint64_t threshold) const {
    return used >= threshold;
  }

  /**
   * @return true if free space < threshold
   */
  bool is_free_less_than_threshold(uint64_t threshold) const {
    return get_free() < threshold;
  }

  /**
   * @param percent_hundredth threshold in hundredths of percent
   * @return true if the used ratio >= threshold, always true when total is 0
   */
  bool is_used_more_than_prct_threshold(uint64_t percent_hundredth) const {
    if (!total) {
      return true;
    }
    // NOTE: used * 10000 wraps around when used is above about 1.8e15
    return (used * 10000) / total >= percent_hundredth;
  }

  /**
   * @param percent_hundredth threshold in hundredths of percent
   * @return true if the free ratio < threshold, always true when total is 0
   */
  bool is_free_less_than_prct_threshold(uint64_t percent_hundredth) const {
    if (!total) {
      return true;
    }
    // NOTE: the product wraps around when free space is above about 1.8e15
    return (get_free() * 10000) / total < percent_hundredth;
  }

  /**
   * @return used space in percent, 0 when total is 0
   */
  double get_used_prct() const {
    if (!total)
      return 0.0;
    // conversion to double is done before the product to avoid an integer
    // overflow on huge sizes
    return static_cast<double>(used) * 100 / total;
  }

  /**
   * @return free space in percent, 0 when total is 0
   */
  double get_free_prct() const {
    if (!total)
      return 0.0;
    return static_cast<double>(get_free()) * 100 / total;
  }
};

/**
* @brief get fs statistics can block on network drives, so we use this thread
* to do the job and not block main thread
*
*/
class drive_size_thread
: public std::enable_shared_from_this<drive_size_thread> {
std::shared_ptr<asio::io_context> _io_context;

using completion_handler = std::function<void(std::list<fs_stat>)>;

struct async_data {
std::shared_ptr<filter> request_filter;
completion_handler handler;
time_point timeout;
};

std::list<async_data> _queue ABSL_GUARDED_BY(_queue_m);
absl::Mutex _queue_m;

bool _active = true;

std::shared_ptr<spdlog::logger> _logger;

bool has_to_stop_wait() const { return !_active || !_queue.empty(); }

public:
typedef std::list<fs_stat> (
*get_fs_stats)(filter&, const std::shared_ptr<spdlog::logger>& logger);

static get_fs_stats os_fs_stats;

drive_size_thread(const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger)
: _io_context(io_context), _logger(logger) {}

void run();

void kill();

template <class handler_type>
void async_get_fs_stats(const std::shared_ptr<filter>& request_filter,
const time_point& timeout,
handler_type&& handler);
};

} // namespace check_drive_size_detail

/**
* @brief drive size check object (same for linux and windows)
*
*/
class check_drive_size : public check {
std::shared_ptr<check_drive_size_detail::filter> _filter;
bool _prct_threshold;
bool _free_threshold;
uint64_t _warning; // value in bytes or percent * 100
uint64_t _critical;

typedef e_status (check_drive_size::*fs_stat_test)(
const check_drive_size_detail::fs_stat&) const;

fs_stat_test _fs_test;

e_status _used_test(const check_drive_size_detail::fs_stat& fs) const;
e_status _prct_used_test(const check_drive_size_detail::fs_stat& fs) const;

e_status _free_test(const check_drive_size_detail::fs_stat& fs) const;
e_status _prct_free_test(const check_drive_size_detail::fs_stat& fs) const;

e_status _no_test(const check_drive_size_detail::fs_stat& fs) const;

void _completion_handler(
unsigned start_check_index,
const std::list<check_drive_size_detail::fs_stat>& result);

public:
check_drive_size(const std::shared_ptr<asio::io_context>& io_context,
const std::shared_ptr<spdlog::logger>& logger,
time_point first_start_expected,
duration check_interval,
const std::string& serv,
const std::string& cmd_name,
const std::string& cmd_line,
const rapidjson::Value& args,
const engine_to_agent_request_ptr& cnf,
check::completion_handler&& handler);

virtual ~check_drive_size() = default;

std::shared_ptr<check_drive_size> shared_from_this() {
return std::static_pointer_cast<check_drive_size>(
check::shared_from_this());
}

void start_check(const duration& timeout) override;

static void thread_kill();
};

} // namespace com::centreon::agent

#endif  // CENTREON_AGENT_NATIVE_DRIVE_SIZE_BASE_HH
Loading