From c0ce29cebffa1caa2128874b8930046d54b57d96 Mon Sep 17 00:00:00 2001 From: wyyalt Date: Tue, 30 Jan 2024 16:05:31 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20MAC=20OS=E5=AE=89=E8=A3=85=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E9=80=82=E9=85=8D=20(closed=20#2084)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/backend/api/constants.py | 3 + apps/node_man/constants.py | 13 +- .../migrations/0081_auto_20240307_1656.py | 113 +++ apps/node_man/models.py | 1 + .../agent_tools/agent2/setup_agent.zsh | 935 ++++++++++++++++++ script_tools/gsectl/agent/darwin/gsectl | 864 ++++++++++++++++ .../plugin_scripts/fetch_used_ports.zsh | 9 + .../plugin_scripts/operate_plugin.zsh | 47 + script_tools/plugin_scripts/reload.zsh | 105 ++ script_tools/plugin_scripts/remove_config.zsh | 5 + script_tools/plugin_scripts/restart.zsh | 4 + script_tools/plugin_scripts/start.zsh | 117 +++ script_tools/plugin_scripts/stop.zsh | 112 +++ script_tools/plugin_scripts/stop_debug.zsh | 38 + script_tools/plugin_scripts/update_binary.zsh | 147 +++ 15 files changed, 2509 insertions(+), 4 deletions(-) create mode 100644 apps/node_man/migrations/0081_auto_20240307_1656.py create mode 100644 script_tools/agent_tools/agent2/setup_agent.zsh create mode 100755 script_tools/gsectl/agent/darwin/gsectl create mode 100644 script_tools/plugin_scripts/fetch_used_ports.zsh create mode 100644 script_tools/plugin_scripts/operate_plugin.zsh create mode 100755 script_tools/plugin_scripts/reload.zsh create mode 100644 script_tools/plugin_scripts/remove_config.zsh create mode 100755 script_tools/plugin_scripts/restart.zsh create mode 100755 script_tools/plugin_scripts/start.zsh create mode 100755 script_tools/plugin_scripts/stop.zsh create mode 100644 script_tools/plugin_scripts/stop_debug.zsh create mode 100644 script_tools/plugin_scripts/update_binary.zsh diff --git a/apps/backend/api/constants.py b/apps/backend/api/constants.py index 5f47cecbe..267f80ad5 100644 --- 
a/apps/backend/api/constants.py +++ b/apps/backend/api/constants.py @@ -23,6 +23,7 @@ class OS(object): LINUX = "linux" AIX = "aix" SOLARIS = "solaris" + DARWIN = "darwin" # 操作系统->系统账户映射表 @@ -31,6 +32,7 @@ class OS(object): OS.LINUX: "root", OS.AIX: "root", OS.SOLARIS: "root", + OS.DARWIN: "root", } # 操作系统->后缀映射表 @@ -39,6 +41,7 @@ class OS(object): OS.LINUX: "sh", OS.AIX: "ksh", OS.SOLARIS: "sh", + OS.DARWIN: "zsh", } diff --git a/apps/node_man/constants.py b/apps/node_man/constants.py index aaeb6cd64..7718c575c 100644 --- a/apps/node_man/constants.py +++ b/apps/node_man/constants.py @@ -114,11 +114,11 @@ def get_optional_items(cls) -> List[str]: AUTH_CHOICES = tuple_choices(AUTH_TUPLE) AuthType = choices_to_namedtuple(AUTH_CHOICES) -OS_TUPLE = ("LINUX", "WINDOWS", "AIX", "SOLARIS") +OS_TUPLE = ("LINUX", "WINDOWS", "AIX", "SOLARIS", "DARWIN") OS_CHOICES = tuple_choices(OS_TUPLE) OsType = choices_to_namedtuple(OS_CHOICES) OS_CHN = {os_type: os_type if os_type == OsType.AIX else os_type.capitalize() for os_type in OS_TUPLE} -BK_OS_TYPE = {"LINUX": "1", "WINDOWS": "2", "AIX": "3", "SOLARIS": "5"} +BK_OS_TYPE = {"LINUX": "1", "WINDOWS": "2", "AIX": "3", "SOLARIS": "5", "DARWIN": "8"} # 操作系统匹配关键词 OS_KEYWORDS = { OsType.LINUX: ["linux", "ubuntu", "centos", "redhat", "suse", "debian", "fedora"], @@ -132,13 +132,15 @@ def get_optional_items(cls) -> List[str]: OsType.LINUX: settings.BACKEND_UNIX_ACCOUNT, OsType.AIX: settings.BACKEND_UNIX_ACCOUNT, OsType.SOLARIS: settings.BACKEND_UNIX_ACCOUNT, + OsType.DARWIN: settings.BACKEND_UNIX_ACCOUNT, OsType.WINDOWS.lower(): settings.BACKEND_WINDOWS_ACCOUNT, OsType.LINUX.lower(): settings.BACKEND_UNIX_ACCOUNT, OsType.AIX.lower(): settings.BACKEND_UNIX_ACCOUNT, OsType.SOLARIS.lower(): settings.BACKEND_UNIX_ACCOUNT, + OsType.DARWIN.lower(): settings.BACKEND_UNIX_ACCOUNT, } -OS_TYPE = {"1": "LINUX", "2": "WINDOWS", "3": "AIX", "5": "SOLARIS"} +OS_TYPE = {"1": "LINUX", "2": "WINDOWS", "3": "AIX", "5": "SOLARIS", "8": "DARWIN"} NODE_TUPLE = 
("AGENT", "PROXY", "PAGENT") NODE_CHOICES = tuple_choices(NODE_TUPLE) @@ -488,7 +490,7 @@ def _get_member__alias_map(cls) -> Dict[Enum, str]: CONFIG_FILE_FORMAT_TUPLE = ("json", "yaml", "", None) CONFIG_FILE_FORMAT_CHOICES = tuple_choices(CONFIG_FILE_FORMAT_TUPLE) -PLUGIN_OS_TUPLE = ("windows", "linux", "aix", "solaris") +PLUGIN_OS_TUPLE = ("windows", "linux", "aix", "solaris", "darwin") PLUGIN_OS_CHOICES = tuple_choices(PLUGIN_OS_TUPLE) PluginOsType = choices_to_namedtuple(PLUGIN_OS_CHOICES) @@ -500,6 +502,7 @@ def _get_member__alias_map(cls) -> Dict[Enum, str]: OsType.WINDOWS: CpuType.x86_64, OsType.AIX: CpuType.powerpc, OsType.SOLARIS: CpuType.sparc, + OsType.DARWIN: CpuType.x86_64, } CMDB_CPU_MAP = {"x86": CpuType.x86, "arm": CpuType.aarch64} @@ -686,6 +689,7 @@ class SetupScriptFileName(Enum): SETUP_PROXY_SH = "setup_proxy.sh" SETUP_AGENT_SH = "setup_agent.sh" SETUP_AGENT_KSH = "setup_agent.ksh" + SETUP_AGENT_ZSH = "setup_agent.zsh" SETUP_AGENT_BAT = "setup_agent.bat" SETUP_PAGENT_PY = "setup_pagent.py" GSECTL_BAT = "gsectl.bat" @@ -697,6 +701,7 @@ class SetupScriptFileName(Enum): OsType.WINDOWS: SetupScriptFileName.SETUP_AGENT_BAT.value, OsType.AIX: SetupScriptFileName.SETUP_AGENT_KSH.value, OsType.SOLARIS: SetupScriptFileName.SETUP_AGENT_SOLARIS_SH.value, + OsType.DARWIN: SetupScriptFileName.SETUP_AGENT_ZSH.value, } diff --git a/apps/node_man/migrations/0081_auto_20240307_1656.py b/apps/node_man/migrations/0081_auto_20240307_1656.py new file mode 100644 index 000000000..c360472a2 --- /dev/null +++ b/apps/node_man/migrations/0081_auto_20240307_1656.py @@ -0,0 +1,113 @@ +# Generated by Django 3.2.4 on 2024-03-07 08:56 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("node_man", "0080_auto_20231122_1552"), + ] + + operations = [ + migrations.AlterField( + model_name="gseconfigenv", + name="os", + field=models.CharField( + choices=[ + ("windows", "windows"), + ("linux", "linux"), + ("aix", "aix"), + 
("solaris", "solaris"), + ("darwin", "darwin"), + ], + db_index=True, + default="linux", + max_length=32, + verbose_name="系统类型", + ), + ), + migrations.AlterField( + model_name="gseconfigtemplate", + name="os", + field=models.CharField( + choices=[ + ("windows", "windows"), + ("linux", "linux"), + ("aix", "aix"), + ("solaris", "solaris"), + ("darwin", "darwin"), + ], + db_index=True, + default="linux", + max_length=32, + verbose_name="系统类型", + ), + ), + migrations.AlterField( + model_name="host", + name="os_type", + field=models.CharField( + choices=[ + ("LINUX", "LINUX"), + ("WINDOWS", "WINDOWS"), + ("AIX", "AIX"), + ("SOLARIS", "SOLARIS"), + ("DARWIN", "DARWIN"), + ], + db_index=True, + default="LINUX", + max_length=16, + verbose_name="操作系统", + ), + ), + migrations.AlterField( + model_name="packages", + name="os", + field=models.CharField( + choices=[ + ("windows", "windows"), + ("linux", "linux"), + ("aix", "aix"), + ("solaris", "solaris"), + ("darwin", "darwin"), + ], + db_index=True, + default="linux", + max_length=32, + verbose_name="系统类型", + ), + ), + migrations.AlterField( + model_name="pluginconfigtemplate", + name="os", + field=models.CharField( + choices=[ + ("windows", "windows"), + ("linux", "linux"), + ("aix", "aix"), + ("solaris", "solaris"), + ("darwin", "darwin"), + ], + default="linux", + max_length=16, + verbose_name="操作系统", + ), + ), + migrations.AlterField( + model_name="proccontrol", + name="os", + field=models.CharField( + choices=[ + ("windows", "windows"), + ("linux", "linux"), + ("aix", "aix"), + ("solaris", "solaris"), + ("darwin", "darwin"), + ], + default="linux", + max_length=32, + verbose_name="系统类型", + ), + ), + ] diff --git a/apps/node_man/models.py b/apps/node_man/models.py index 4403def0c..6954224fe 100644 --- a/apps/node_man/models.py +++ b/apps/node_man/models.py @@ -589,6 +589,7 @@ def get_agent_config(self, os_type: str) -> Dict[str, Any]: if os_type in [ constants.OsType.AIX.lower(), constants.OsType.SOLARIS.lower(), + 
constants.OsType.DARWIN.lower(), ]: os_type = constants.OsType.LINUX.lower() return self.agent_config[os_type] diff --git a/script_tools/agent_tools/agent2/setup_agent.zsh b/script_tools/agent_tools/agent2/setup_agent.zsh new file mode 100644 index 000000000..468c19bd9 --- /dev/null +++ b/script_tools/agent_tools/agent2/setup_agent.zsh @@ -0,0 +1,935 @@ +#!/bin/bash +# vim:ft=sh expandtab sts=4 ts=4 sw=4 nu +# gse agent 2.0 安装脚本, 仅在节点管理2.0中使用 + +# DEFAULT DEFINITION +NODE_TYPE=agent + +GSE_AGENT_RUN_DIR=/var/run/gse +GSE_AGENT_DATA_DIR=/var/lib/gse +GSE_AGENT_LOG_DIR=/var/log/gse + +OS_INFO="" +OS_TYPE="" +RC_LOCAL_FILE=/etc/rc.d/rc.local + +GSE_AGENT_CONFIG="gse_agent.conf" +AGENT_CONFIGS=("gse_agent.conf") +AGENT_CLEAN_UP_DIRS=("bin") + +# 收到如下信号或者exit退出时,执行清理逻辑 +#trap quit 1 2 3 4 5 6 7 8 10 11 12 13 14 15 +trap 'cleanup' HUP INT QUIT ABRT SEGV PIPE ALRM TERM EXIT +trap 'report_err $LINENO; exit 1; ' ERR + +log () { local L=INFO D; D="$(date +%F\ %T)"; echo "$D $L $*" | tee -a "$LOG_FILE"; bulk_report_step_status "$LOG_FILE" "$BULK_LOG_SIZE" ; return 0; } +warn () { local L=WARN D; D="$(date +%F\ %T)"; echo "$D $L $*" | tee -a "$LOG_FILE"; bulk_report_step_status "$LOG_FILE" "$BULK_LOG_SIZE" ; return 0; } +err () { local L=ERROR D; D="$(date +%F\ %T)"; echo "$D $L $*" | tee -a "$LOG_FILE"; bulk_report_step_status "$LOG_FILE" "$BULK_LOG_SIZE" ; return 1; } +fail () { local L=ERROR D; D="$(date +%F\ %T)"; echo "$D $L $*" | tee -a "$LOG_FILE"; bulk_report_step_status "$LOG_FILE" "$BULK_LOG_SIZE" URG; exit 1; } + +get_cpu_arch () { + local cmd=$1 + CPU_ARCH=$($cmd) + CPU_ARCH=$(echo ${CPU_ARCH} | tr 'A-Z' 'a-z') + if [[ "${CPU_ARCH}" =~ "x86_64" ]]; then + return 0 + elif [[ "${CPU_ARCH}" =~ "x86" || "${CPU_ARCH}" =~ ^i[3456]86 ]]; then + CPU_ARCH="x86" + return 0 + elif [[ "${CPU_ARCH}" =~ "aarch" ]]; then + return 0 + else + return 1 + fi +} + +get_cpu_arch "uname -m" || fail get_cpu_arch "Failed to get CPU arch, please contact the developer." 
+ + +get_os_info () { + if [ -f "/proc/version" ]; then + OS_INFO="$OS_INFO $(cat /proc/version)" + fi + if [ -f "/etc/issue" ]; then + OS_INFO="$OS_INFO $(cat /etc/issue)" + fi + OS_INFO="$OS_INFO $(uname -a)" + OS_INFO=$(echo ${OS_INFO} | tr 'A-Z' 'a-z') +} + +get_os_type () { + get_os_info + OS_INFO=$(echo ${OS_INFO} | tr 'A-Z' 'a-z') + if [[ "${OS_INFO}" =~ "ubuntu" ]]; then + OS_TYPE="ubuntu" + RC_LOCAL_FILE="/etc/rc.local" + elif [[ "${OS_INFO}" =~ "centos" ]]; then + OS_TYPE="centos" + RC_LOCAL_FILE="/etc/rc.d/rc.local" + elif [[ "${OS_INFO}" =~ "coreos" ]]; then + OS_TYPE="coreos" + RC_LOCAL_FILE="/etc/rc.d/rc.local" + elif [[ "${OS_INFO}" =~ "freebsd" ]]; then + OS_TYPE="freebsd" + RC_LOCAL_FILE="/etc/rc.d/rc.local" + elif [[ "${OS_INFO}" =~ "debian" ]]; then + OS_TYPE="debian" + RC_LOCAL_FILE="/etc/rc.local" + elif [[ "${OS_INFO}" =~ "suse" ]]; then + OS_TYPE="suse" + RC_LOCAL_FILE="/etc/rc.d/rc.local" + elif [[ "${OS_INFO,,}" =~ "hat" ]]; then + OS_TYPE="redhat" + RC_LOCAL_FILE="/etc/rc.d/rc.local" + elif [[ "${OS_INFO,,}" =~ "mac" ]]; then + OS_TYPE="mac" + RC_LOCAL_FILE="/etc/rc.d/rc.local" + fi +} + +check_rc_file () { + get_os_type + if [ -f $RC_LOCAL_FILE ]; then + return 0 + elif [ -f "/etc/rc.d/rc.local" ]; then + RC_LOCAL_FILE="/etc/rc.d/rc.local" + else + RC_LOCAL_FILE="/etc/rc.local" + fi +} + +# 清理逻辑:保留本次的LOG_FILE,下次运行时会删除历史的LOG_FILE。 +# 保留安装脚本本身 +cleanup () { + bulk_report_step_status "$LOG_FILE" "$BULK_LOG_SIZE" URG # 上报所有剩余的日志 + + if ! 
[[ $DEBUG = "true" ]]; then + local GLOBIGNORE="$LOG_FILE*" + rm -vf "$TMP_DIR"/nm.* + fi + + exit 0 +} + +# 打印错误行数信息 +report_err () { + awk -v LN="$1" -v L="ERROR" -v D="$(date +%F\ %T)" \ + 'NR>LN-3 && NR>>":""), $0 }' $0 +} + +validate_setup_path () { + local invalid_path_prefix=( + /tmp + /var + /etc + /bin + /lib + /lib64 + /boot + /mnt + /proc + /dev + /run + /sys + /sbin + /root + /home + ) + + local invalid_path=( + /usr + /usr/bin + /usr/sbin + /usr/local/lib + /usr/include + /usr/lib + /usr/lib64 + /usr/libexec + ) + + local p1="${AGENT_SETUP_PATH%/$NODE_TYPE*}" + local p2="${p1%/gse*}" + local p + + if [[ "$p1" == "${AGENT_SETUP_PATH}" ]] || [[ "$p2" == "$AGENT_SETUP_PATH" ]]; then + fail check_env FAILED "$AGENT_SETUP_PATH is not allowed to install agent" + fi + + for p in "${invalid_path[@]}"; do + if [[ "${p2}" == "$p" ]]; then + fail check_env FAILED "$AGENT_SETUP_PATH is not allowed to install agent" + fi + done + + for p in "${invalid_path_prefix[@]}"; do + if [[ "${p2//$p}" != "$p2" ]]; then + fail check_env FAILED "$AGENT_SETUP_PATH is not allowed to install agent" + fi + done +} + +is_port_listen () { + local i port + + for i in {0..15}; do + sleep 1 + for port in "$@"; do + lsof -iTCP:"$port" -sTCP:LISTEN -a -i -P -n -p "$AGENT_PID" && return 0 + done + done + + return 1 +} + +is_port_connected_by_pid () { + local pid port + pid=$1 port=$2 + + for i in {0..10}; do + sleep 1 + [ `sudo lsof -i:$port | grep -w $pid |wc -l` -ge 1 ] && return 0 + done + return 1 +} + +is_connected () { + local i port=$1 + + for i in {0..15}; do + sleep 1 + lsof -iTCP:"$port" -sTCP:ESTABLISHED -a -i -P -n -p "$AGENT_PID" && return 0 + done + + return 1 +} + + +# 用法:通过ps的comm字段和二进制的绝对路径来精确获取pid +get_pid_by_comm_path () { + local comm=$1 path=$2 worker=$3 + local _pids pids + local pid + if [[ "${worker}" == "WORKER" ]]; then + read -r -a _pids <<< "$(ps -ax -o ppid,pid,command | grep $comm | grep $AGENT_SETUP_PATH | awk '{print $1 "|" $2 "|" $3}' | awk -F'|' '$1 != 1 
&& $3 ~ /gse_agent/' | awk -F'|' '{print $2}' | xargs)" + elif [[ "${worker}" == "MASTER" ]]; then + read -r -a _pids <<< "$(ps -ax -o ppid,pid,command | grep $comm | grep $AGENT_SETUP_PATH | awk '{print $1 "|" $2 "|" $3}' | awk -F'|' '$1 == 1 && $3 ~ /gse_agent/' | awk -F'|' '{print $2}' | xargs)" + else + read -r -a _pids <<< "$(ps -ax -o ppid,pid,command | grep $comm | grep $AGENT_SETUP_PATH | awk '{print $1 "|" $2 "|" $3}' | awk -F'|' '$3 ~ /gse_agent/' | awk -F'|' '{print $2}' | xargs)" + fi + + pids=("${_pids[@]}") + # 传入了绝对路径,则进行基于二进制路径的筛选 + # if [[ -e "$path" ]]; then + # for pid in "${_pids[@]}"; do + # if [[ "$(readlink -f "$path")" = "$(sudo lsof -p $_pid | awk '$4=="txt" {print $9}' | grep gse_agent)" ]]; then + # if ! grep -nEq '^\ +$' <<< "$pid"; then + # pids+=("$pid") + # fi + # fi + # done + # else + # pids=("${_pids[@]}") + # fi + echo ${pids[@]} +} + +is_base64_command_exist() { + if ! command -v base64 >/dev/null 2>&1; then + return 1 + else + return 0 + fi +} + +is_process_ok () { + local proc=${1:-agent} + local gse_master_pid gse_worker_pids gse_agent_pids + gse_agent_pids="$(get_pid_by_comm_path gse_agent "$AGENT_SETUP_PATH/bin/gse_${proc}" | xargs)" + gse_master_pid=$(get_pid_by_comm_path gse_agent "$AGENT_SETUP_PATH/bin/gse_${proc}" MASTER | xargs) + + read -r -a gse_master <<< "$gse_master_pids" + read -r -a gse_pids <<< "$gse_agent_pids" + + agent_id_file="${AGENT_SETUP_PATH}"/bin/run/agent.pid + + if [[ ${#gse_master} -gt 1 && -f ${agent_id_file} ]]; then + gse_master_pid=$(cat ${agent_id_file}) + fi + + gse_worker_pids=$(pgrep -P $gse_master_pid) + + read -r -a gse_worker <<< "$gse_worker_pids" + + if [ "${#gse_pids[@]}" -eq 0 ]; then + fail setup_agent FAILED "process check: no gse_agent found. gse_${proc} process abnormal (node type:$NODE_TYPE)" + fi + + if [ "${#gse_master[@]}" -gt 1 ]; then + fail setup_agent FAILED "process check: ${#gse_master[@]} gse_agent Master found. 
pid($gse_master_pids) gse_${proc} process abnormal (node type:$NODE_TYPE)" + fi + + # worker 进程在某些任务情况下可能不只一个,只要都是一个爹,多个worker也是正常,不为0即可 + if [ "${#gse_worker[@]}" -eq 0 ]; then + fail setup_agent FAILED "process check: gse_agent Worker not found (node type:$NODE_TYPE)" + fi +} + +check_heathz_by_gse () { + local SLEEP_TIME=1 RETRY_COUNT=0 + + for i in {0..2}; do + local result execution_code + if [ -f "${GSE_AGENT_CONFIG_PATH}" ]; then + result=$("${AGENT_SETUP_PATH}"/bin/gse_agent -f "${GSE_AGENT_CONFIG_PATH}" --healthz 1) + else + result=$("${AGENT_SETUP_PATH}"/bin/gse_agent --healthz 1) + fi + execution_code=$? + if [[ "${execution_code}" -eq 0 ]]; then + break + else + sleep "${SLEEP_TIME}" + RETRY_COUNT=$((RETRY_COUNT + 1)) + if [[ "${RETRY_COUNT}" -ge 3 ]]; then + log healthz_check INFO "gse_agent healthz check return code: ${execution_code}" + report_result=$(awk -F': ' '{print $2}' <<< "$result") + if is_base64_command_exist; then + report_result=$(echo "$result" | base64) + else + report_result=$(echo "$result" | tr "\"" "\'") + fi + log report_healthz INFO "${report_result}" + fail healthz_check FAILED "gse healthz check failed with retry count: $RETRY_COUNT" + fi + fi + done + report_result=$(awk -F': ' '{print $2}' <<< "$result") + if is_base64_command_exist; then + report_result=$(echo "$result" | base64) + else + report_result=$(echo "$result" | tr "\"" "\'") + fi + log report_healthz - "${report_result}" + log healthz_check INFO "gse_agent healthz check success" +} + +remove_crontab () { + local tmpcron + tmpcron=$(mktemp "$TMP_DIR"/cron.XXXXXXX) + + crontab -l | grep -v "bin/gsectl" >"$tmpcron" + crontab "$tmpcron" && rm -f "$tmpcron" + + # 下面这段代码是为了确保修改的crontab能立即生效 + if pgrep -x crond &>/dev/null; then + pkill -HUP -x crond + fi +} + +get_daemon_file () { + DAEMON_FILE_PATH="/Library/LaunchDaemons/" + DAEMON_FILE_NAME="com.tencent.$(echo ${AGENT_SETUP_PATH%*/} | tr '/' '.' | awk -F '.' 
'{print $(NF-1)"."$NF}').Daemon.plist" +} + +setup_startup_scripts () { + get_daemon_file + touch $DAEMON_FILE_PATH$DAEMON_FILE_NAME + bash -c "cat >$DAEMON_FILE_NAME" << EOF + + + + + Label + com.tencent.$(echo ${AGENT_SETUP_PATH%*/} | tr '/' '.' | awk -F '.' '{print $(NF-1)"."$NF}') + ProgramArguments + + ${AGENT_SETUP_PATH}/bin/gsectl + start + + RunAtLoad + + + +EOF +} + +registe_agent_with_excepte () { + local SLEEP_TIME=1 RETRY_COUNT=0 + + for i in {0..2}; do + local registe_result registe_code + if [ -f "${GSE_AGENT_CONFIG_PATH}" ]; then + registe_result=$($AGENT_SETUP_PATH/bin/gse_agent -f "${GSE_AGENT_CONFIG_PATH}" --register 2>&1) + else + registe_result=$($AGENT_SETUP_PATH/bin/gse_agent --register 2>&1) + fi + registe_code=$? + if [[ "${registe_code}" -eq 0 ]] && [[ ! "${registe_result}" =~ "overwrite" ]]; then + log report_agent_id DONE "$registe_result" + break + else + sleep "${SLEEP_TIME}" + RETRY_COUNT=$((RETRY_COUNT + 1)) + if [[ "${RETRY_COUNT}" -ge 3 ]]; then + fail register_agent_id FAILED "register agent id failed, error: ${registe_result}" + fi + fi + done +} + +register_agent_id () { + if [ ! -f "$AGENT_SETUP_PATH/bin/gse_agent" ]; then + fail register_agent_id FAILED "gse_agent file not exists in $AGENT_SETUP_PATH/bin" + fi + + if [[ "${UNREGISTER_AGENT_ID}" == "TRUE" ]]; then + log register_agent_id - "trying to unregister agent id" + unregister_agent_id SKIP + fi + + log register_agent_id - "trying to register agent id" + registe_agent_with_excepte +} + +unregister_agent_id () { + local skip="$1" + log unregister_agent_id - "trying to unregister agent id" + if [ -f "$AGENT_SETUP_PATH/bin/gse_agent" ]; then + if [ -f "${GSE_AGENT_CONFIG_PATH}" ]; then + unregister_agent_id_result=$("$AGENT_SETUP_PATH"/bin/gse_agent -f "${GSE_AGENT_CONFIG_PATH}" --unregister 2>&1) + else + unregister_agent_id_result=$("$AGENT_SETUP_PATH"/bin/gse_agent --unregister 2>&1) + fi + + if [[ $? 
-eq 0 ]]; then + log unregister_agent_id SUCCESS "unregister agent id succeed" + else + if [[ "${skip}" == "SKIP" ]]; then + warn unregister_agent_id - "unregister agent id failed, but skip it. error: ${unregister_agent_id_result}" + else + fail unregister_agent_id FAILED "unregister agent id failed, error: ${unregister_agent_id_result}" + fi + fi + else + warn unregister_agent_id - "gse_agent file not exists in $AGENT_SETUP_PATH/bin" + fi +} + +start_agent () { + local i p + + "$AGENT_SETUP_PATH"/bin/gsectl start || fail setup_agent FAILED "start gse agent failed" + + sleep 3 + is_process_ok agent +} + +remove_proxy_if_exists () { + local i pids + local path=${AGENT_SETUP_PATH%/*}/proxy + + ! [[ -d $path ]] && return 0 + "$path/bin/gsectl" stop + + # 两种版本的proxy,都要杀掉 + for p in agent transit btsvr data; do + for i in {0..10}; do + read -r -a pids <<< "$(pidof "$path"/bin/gse_${p})" + if [ ${#pids[@]} -eq 0 ]; then + # 进程已退,继续检查下一个进程 + break + elif [ "$i" == 10 ]; then + # 强杀 + kill -9 "${pids[@]}" + else + sleep 1 + fi + done + done + + rm -rf "$path" +} + +stop_agent () { + local i pids + + ! [[ -d $AGENT_SETUP_PATH ]] && return 0 + "$AGENT_SETUP_PATH/bin/gsectl" stop + for i in {1..10}; do + # for pid in $(get_pid_by_comm_path gse_agent "$AGENT_SETUP_PATH/bin/gse_agent"); do + # # 富容器场景下,会误杀docker里的agent进程,因此需要判断父进程ID是否为1,仅干掉这些进程 + # if [[ $(ps -o ppid= -p $pid) -eq 1 ]]; then + # pids=($pid $(pgrep -P $pid)) + # break + # fi + # done + pids="$(get_pid_by_comm_path gse_agent "$AGENT_SETUP_PATH/bin/gse_agent")" + if [[ ! 
-n "$pids" ]]; then + log remove_agent SUCCESS 'old agent has been stopped successfully' + break + elif [[ $i -eq 10 ]]; then + kill -9 ${pids[@]} + else + sleep 1 + fi + done +} + +clean_up_agent_directory () { + for dir in "${AGENT_CLEAN_UP_DIRS[@]}"; do + rm -rf "${AGENT_SETUP_PATH}"/"${dir}" + done +} + +remove_agent () { + log remove_agent - 'trying to stop old agent' + stop_agent + + log remove_agent - "trying to remove old agent directory(${AGENT_SETUP_PATH}/${AGENT_CLEAN_UP_DIRS[@]})" + cd "${AGENT_SETUP_PATH}" + for file in `ls -lR@ |ggrep -E "i-" |awk '{print $NF}'`;do echo "--- $file" && chattr -i $file ;done + cd - + + if [[ "$REMOVE" == "TRUE" ]]; then + unregister_agent_id + clean_up_agent_directory + log remove_agent DONE "agent removed" + exit 0 + fi + clean_up_agent_directory +} + +get_config () { + local filename http_status + + log get_config - "request $NODE_TYPE config file(s)" + + for filename in "${AGENT_CONFIGS[@]}"; do + tmp_json_body=$(mktemp "$TMP_DIR"/nm.reqbody."$(date +%Y%m%d_%H%M%S)".XXXXXX.json) + tmp_json_resp=$(mktemp "$TMP_DIR"/nm.reqresp."$(date +%Y%m%d_%H%M%S)".XXXXXX.json) + cat > "$tmp_json_body" <<_OO_ +{ + "bk_cloud_id": ${CLOUD_ID}, + "filename": "${filename}", + "node_type": "${NODE_TYPE}", + "inner_ip": "${LAN_ETH_IP}", + "token": "${TOKEN}" +} +_OO_ + + http_status=$(http_proxy=$HTTP_PROXY https_proxy=$HTTP_PROXY \ + curl -s -g -S -X POST --retry 5 -d@"$tmp_json_body" "$CALLBACK_URL"/get_gse_config/ -o "$TMP_DIR/$filename" --silent -w "%{http_code}") + rm -f "$tmp_json_body" "$tmp_json_resp" + + if [[ "$http_status" != "200" ]]; then + fail get_config FAILED "request config $filename failed. request info:$CLOUD_ID,$LAN_ETH_IP,$NODE_TYPE,$filename,$TOKEN. http status:$http_status, file content: $(cat "$TMP_DIR/$filename")" + fi + done +} + +setup_agent () { + log setup_agent START "setup agent. (extract, render config)" + report_mkdir "$AGENT_SETUP_PATH"/etc + + cd "$AGENT_SETUP_PATH/.." 
&& ( tar xf "$TMP_DIR/$PKG_NAME" || fail setup_proxy FAILED "decompress package $PKG_NAME failed" ) + + get_config + + for f in "${AGENT_CONFIGS[@]}"; do + if [[ -f $TMP_DIR/$f ]]; then + cp -fp "$TMP_DIR/${f}" "${AGENT_SETUP_PATH}"/etc/${f} + else + fail setup_agent FAILED "agent config file ${f} lost. please check." + fi + done + + # create dir + report_mkdir "$GSE_AGENT_RUN_DIR" "$GSE_AGENT_DATA_DIR" "$GSE_AGENT_LOG_DIR" + + register_agent_id + + check_heathz_by_gse + + start_agent + + log setup_agent DONE "gse agent is setup successfully." +} + +download_pkg () { + local f http_status path + local tmp_stdout tmp_stderr curl_pid + if [[ "${REMOVE}" == "TRUE" ]]; then + log download_pkg - "remove agent, no need to download package" + return 0 + fi + + log download_pkg START "download gse agent package from $COMPLETE_DOWNLOAD_URL/$PKG_NAME)." + cd "$TMP_DIR" && rm -f "$PKG_NAME" + + tmp_stdout=$(mktemp "${TMP_DIR}"/nm.curl.stdout_XXXXXXXX) + tmp_stderr=$(mktemp "${TMP_DIR}"/nm.curl.stderr_XXXXXXXX) + curl -g --connect-timeout 5 -o "$TMP_DIR/$PKG_NAME" \ + --progress-bar -w "%{http_code}" "${COMPLETE_DOWNLOAD_URL}/${PKG_NAME}" >"$tmp_stdout" 2>"$tmp_stderr" & + curl_pid=$! + # 如果curl结束,那么http_code一定会写入到stdout文件 + until [[ -n $http_status ]]; do + read -r http_status < "$tmp_stdout" + # 为了上报curl的进度 + log download_pkg DOWNLOADING "$(awk 'BEGIN { RS="\r"; } END { print }' < "$tmp_stderr")" + sleep 1 + done + rm -f "${tmp_stdout}" "${tmp_stderr}" + wait "$curl_pid" + + # HTTP status 000需要进一步研究 + if [[ $http_status != "200" ]] && [[ "$http_status" != "000" ]]; then + fail download_pkg FAILED "file $PKG_NAME download failed. 
(url:$COMPLETE_DOWNLOAD_URL/$PKG_NAME, http_status:$http_status)" + fi + + log download_pkg DONE "gse_agent package download succeeded" + log report_cpu_arch DONE "${CPU_ARCH}" +} + + +check_deploy_result () { + # 端口监听状态 + local ret=0 + + AGENT_PID=$( get_pid_by_comm_path gse_agent "$AGENT_SETUP_PATH/bin/gse_agent" "WORKER") + is_port_connected_by_pid "$AGENT_PID" "$IO_PORT" || { fail check_deploy_result FAILED "agent(PID:$AGENT_PID, PORT:$IO_PORT) is not connect to gse server"; ((ret++)); } + is_port_connected_by_pid "$AGENT_PID" "$DATA_PORT" || { fail check_deploy_result FAILED "agent(PID:$AGENT_PID, PORT:$DATA_PORT) is not connect to gse server"; ((ret++)); } + + [ $ret -eq 0 ] && log check_deploy_result DONE "gse agent has been deployed successfully" +} +# 日志行转为json格式函数 +log_to_json () { + local date _time log_level step status message + read -r date _time log_level step status message <<<"$@" + + printf '{"timestamp": "%s", "level": "%s", "step":"%s", "log":"%s","status":"%s"}' "$(date -j -f "%Y-%m-%d %H:%M:%S" "$date $_time" "+%s")" "$log_level" "$step" "$message" "$status" +} + +# 读入LOG_FILE的日志然后批量上报 +# 用法:bulk_report_step_status +bulk_report_step_status () { + local log_file=$1 + local bulk_size=${2:-3} # 默认设置为累积三条报一次 + local is_urg=${3:-""} # 设置URG后立即上报 + local log_total_line diff + local bulk_log log=() line json_log + local tmp_json_body tmp_json_resp + + # 未设置上报API时,直接忽略 + [[ -z "$CALLBACK_URL" ]] && return 0 + log_total_line=$(wc -l <"$log_file") + diff=$(( log_total_line - LOG_RPT_CNT )) + + if (( diff >= bulk_size )) || [[ $is_urg = "URG" ]]; then + ((LOG_RPT_CNT++)) #always report from next line + bulk_log=$(sed -n "${LOG_RPT_CNT},${log_total_line}p" "$log_file") + # 如果刚好 log_total_line能整除 bulk_size时,最后EXIT的URG调用会触发一个空行 + # 判断如果是空字符串则不上报 + if [[ -z "$bulk_log" ]]; then + return 0 + fi + else + return 0 + fi + LOG_RPT_CNT=$log_total_line + + # 构建log数组 + while read -r line; do + log+=( "$(log_to_json "$line")" ) + done <<< "$bulk_log" + # 生成log json 
array + json_log=$(printf "%s," "${log[@]}") + json_log=${json_log%,} + + tmp_json_body=$(mktemp "$TMP_DIR"/nm.reqbody."$(date +%Y%m%d_%H%M%S)".XXXXXX.json) + tmp_json_resp=$(mktemp "$TMP_DIR"/nm.reqresp."$(date +%Y%m%d_%H%M%S)".XXXXXX.json) + + cat > "$tmp_json_body" <<_OO_ +{ + "task_id": "$TASK_ID", + "token": "$TOKEN", + "logs": [ $json_log ] +} +_OO_ + + http_proxy=$HTTP_PROXY https_proxy=$HTTP_PROXY \ + curl -g -s -S -X POST --retry 5 -d@"$tmp_json_body" "$CALLBACK_URL"/report_log/ -o "$tmp_json_resp" + rm -f "$tmp_json_body" "$tmp_json_resp" +} + +report_step_status () { + local date _time log_level step status message + local tmp_json_body tmp_json_resp + + # 未设置上报API时,直接忽略 + [ -z "$CALLBACK_URL" ] && return 0 + + read -r date _time log_level step status message <<<"$@" + + tmp_json_body=$(mktemp "$TMP_DIR"/nm.reqbody."$(date +%Y%m%d_%H%M%S)".XXXXXX.json) + tmp_json_resp=$(mktemp "$TMP_DIR"/nm.reqresp."$(date +%Y%m%d_%H%M%S)".XXXXXX.json) + + + cat > "$tmp_json_body" <<_OO_ +{ + "task_id": "$TASK_ID", + "token": "$TOKEN", + "logs": [ + { + "timestamp": "$(date +%s -d "$date $_time")", + "level": "$log_level", + "step": "$step", + "log": "$message", + "status": "$status" + } + ] +} +_OO_ + + http_proxy=$HTTP_PROXY https_proxy=$HTTP_PROXY \ + curl -g -s -S -X POST --retry 5 -d@"$tmp_json_body" "$CALLBACK_URL"/report_log/ -o "$tmp_json_resp" + rm -f "$tmp_json_body" "$tmp_json_resp" +} + +validate_vars_string () { + echo "$1" | grep -Pq '^[a-zA-Z_][a-zA-Z0-9]+=' +} + +check_pkgtool () { + _yum=$(command -v yum) + _apt=$(command -v apt) + _dnf=$(command -v dnf) + + _curl=$(command -v curl) + + if [ -f "$_curl" ]; then + return 0 + else + log check_env - "trying to install curl by package management tool" + if [ -f "$_yum" ]; then + # yum 的报错可能有多行,此时错误信息的展示和上报需要单独处理 + yum -y -q install curl || \ + fail check_env FAILED "install curl failed." + elif [ -f "$_apt" ]; then + apt-get -y install curl || \ + fail check_env FAILED "install curl failed." 
+ elif [ -f "$_dnf" ]; then + dnf -y -q install curl || \ + fail check_env FAILED "install curl failed." + else + fail check_env FAILED "no curl command found and can not be installed by neither yum,dnf nor apt-get" + fi + + log check_env - "curl has been installed" + fi +} + +check_disk_space () { + local dir=$1 + # if df -x tmpfs -x devtmpfs --output=avail -k "$TMP_DIR" | awk 'NR==2 { if ($1 < 300 * 1024 ) { exit 1 } else {exit 0} }'; then + if df -k "$TMP_DIR" | awk 'NR==2 { if ($4 < 300 * 1024 ) { exit 1 } else {exit 0} }'; then + log check_env - "check free disk space. done" + else + fail check_env FAILED "no enough space left on $dir" + fi +} + +report_mkdir () { + local dirs="$@" + for dir in ${dirs[@]}; do + local result + if [[ -d "${dir}" ]]; then + continue + else + result="$(mkdir -p ${dir} 2>&1)" + if [ $? -ne 0 ]; then + if [[ -f "${dir}" ]]; then + fail check_env FAILED "create directory $dir failed. error: ${dir} exists and is a normal file" + else + fail check_env FAILED "create directory $dir failed. error: ${result}" + fi + fi + fi + done +} + +check_dir_permission () { + mkdir -p "$TMP_DIR" || fail check-env FAILED "custom temprary dir '$TMP_DIR' create failed." + + if ! mktemp "$TMP_DIR/nm.test.XXXXXXXX"; then + rm "$TMP_DIR"/nm.test.???????? 
+ fail check_env FAILED "create temp files failed in $TMP_DIR" + else + log check_env - "check temp dir write access: yes" + fi +} + +check_download_url () { + local http_status f + + if [[ "${REMOVE}" == "TRUE" ]]; then + return 0 + fi + + for f in $PKG_NAME; do + log check_env - "checking resource($COMPLETE_DOWNLOAD_URL/$f) url's validality" + http_status=$(curl -g -o /dev/null --silent -Iw '%{http_code}' "$COMPLETE_DOWNLOAD_URL/$f") + if [[ "$http_status" == "200" ]] || [[ "$http_status" == "000" ]]; then + log check_env - "check resource($COMPLETE_DOWNLOAD_URL/$f) url succeed" + else + fail check_env FAILED "check resource($COMPLETE_DOWNLOAD_URL/$f) url failed, http_status:$http_status" + fi + done +} + +check_target_clean () { + if [[ -d $AGENT_SETUP_PATH/ ]]; then + warn check_env - "directory $AGENT_SETUP_PATH is not clean. everything will be wiped unless -u was specified" + fi +} + +_help () { + + echo "${0%*/} -i CLOUD_ID -l URL -I LAN_IP [OPTIONS]" + + echo " -n NAME" + echo " -t VERSION" + echo " -I lan ip address on ethernet " + echo " -i CLOUD_ID" + echo " -l DOWNLOAD_URL" + echo " -s TASK_ID. [optional]" + echo " -c TOKEN. [optional]" + echo " -r CALLBACK_URL, [optional]" + echo " -x HTTP_PROXY, [optional]" + echo " -p AGENT_SETUP_PATH, [optional]" + echo " -e BT_FILE_SERVER_IP, [optional]" + echo " -a DATA_SERVER_IP, [optional]" + echo " -k TASK_SERVER_IP, [optional]" + echo " -N UPSTREAM_TYPE, 'server' or 'proxy' [optional]" + echo " -T TEMP directory, [optional]" + echo " -v CUSTOM VARIABLES ASSIGNMENT LISTS. [optional]" + echo " valid variables:" + echo " GSE_AGENT_RUN_DIR" + echo " GSE_AGENT_DATA_DIR" + echo " GSE_AGENT_LOG_DIR" + echo " -o enable override OPTION DEFINED VARIABLES by -v. 
[optional]" + echo " -O IO_PORT" + echo " -E FILE_SVR_PORT" + echo " -A DATA_PORT" + echo " -V BTSVR_THRIFT_PORT" + echo " -B BT_PORT" + echo " -S BT_PORT_START" + echo " -Z BT_PORT_END" + echo " -K TRACKER_PORT" + echo " -F UNREGISTER_AGENT_ID [optional]" + + exit 0 +} + +check_env () { + local node_type=${1:-$NODE_TYPE} + + log check_env START "checking prerequisite. NETWORK_POLICY,DISK_SPACE,PERMISSION,RESOURCE etc.[PID:$CURR_PID]" + + [ "$CLOUD_ID" != "0" ] && node_type=pagent + validate_setup_path + check_disk_space "$TMP_DIR" + check_dir_permission + check_pkgtool + check_download_url + check_target_clean + + log check_env DONE "checking prerequisite done, result: SUCCESS" +} + +# DEFAULT SETTINGS +CLOUD_ID=0 +TMP_DIR=/tmp +AGENT_SETUP_PATH="/usr/local/gse/${NODE_TYPE}" +CURR_PID=$$ +OVERIDE=false +REMOVE=false +UNREGISTER_AGENT_ID=false +CALLBACK_URL= +AGENT_PID= +DEBUG= + +# 已上报的日志行数 +LOG_RPT_CNT=0 +BULK_LOG_SIZE=3 + +# main program +while getopts n:t:I:i:l:s:uc:r:x:p:e:a:k:N:v:oT:RDO:E:A:V:B:S:Z:K:F arg; do + case $arg in + n) NAME="$OPTARG" ;; + t) VERSION="$OPTARG" ;; + I) LAN_ETH_IP=$OPTARG ;; + i) CLOUD_ID=$OPTARG ;; + l) DOWNLOAD_URL=${OPTARG%/} ;; + s) TASK_ID=$OPTARG ;; + c) TOKEN=$OPTARG ;; + r) CALLBACK_URL=$OPTARG ;; + x) HTTP_PROXY=$OPTARG; HTTPS_PROXY=$OPTARG ;; + p) AGENT_SETUP_PATH=$(echo "$OPTARG/$NODE_TYPE" | sed 's|//*|/|g') ;; + e) read -r -a BT_FILE_SERVER_IP <<< "${OPTARG//,/ }" ;; + a) read -r -a DATA_SERVER_IP <<< "${OPTARG//,/ }" ;; + k) read -r -a TASK_SERVER_IP <<< "${OPTARG//,/ }" ;; + N) UPSTREAM_TYPE=$OPTARG ;; + v) VARS_LIST="$OPTARG" ;; + o) OVERIDE=TRUE ;; + T) TMP_DIR=$OPTARG; mkdir -p "$TMP_DIR" ;; + R) REMOVE=TRUE ;; + D) DEBUG=TRUE ;; + O) IO_PORT=$OPTARG ;; + E) FILE_SVR_PORT=$OPTARG ;; + A) DATA_PORT=$OPTARG ;; + V) BTSVR_THRIFT_PORT=$OPTARG ;; + B) BT_PORT=$OPTARG ;; + S) BT_PORT_START=$OPTARG ;; + Z) BT_PORT_END=$OPTARG ;; + K) TRACKER_PORT=$OPTARG ;; + F) UNREGISTER_AGENT_ID=TRUE ;; + *) _help ;; + esac +done + +## 
检查自定义环境变量 +for var_name in ${VARS_LIST//;/ /}; do + validate_vars_string "$var_name" || fail "$var_name is not a valid name" + + case ${var_name%=*} in + CLOUD_ID | DOWNLOAD_URL | TASK_ID | CALLBACK_URL | HOST_LIST_FILE | NODEMAN_PROXY | AGENT_SETUP_PATH) + [ "$OVERIDE" == "TRUE" ] || continue ;; + VARS_LIST) continue ;; + esac + + eval "$var_name" +done + +# 获取包名 +PKG_NAME=${NAME}-${VERSION}.tgz +COMPLETE_DOWNLOAD_URL="${DOWNLOAD_URL}/agent/darwin/${CPU_ARCH}" +GSE_AGENT_CONFIG_PATH="${AGENT_SETUP_PATH}/etc/${GSE_AGENT_CONFIG}" + +LOG_FILE="$TMP_DIR"/nm.${0##*/}.$TASK_ID +DEBUG_LOG_FILE=${TMP_DIR}/nm.${0##*/}.${TASK_ID}.debug + +# redirect STDOUT & STDERR to DEBUG +exec &> >(tee "$DEBUG_LOG_FILE") + +log check_env - "Args are: $*" + +# removed remove_crontab、setup_startup_scripts -> 由 gsectl 判断是否添加 / 移除 + +for step in check_env \ + download_pkg \ + remove_agent \ + remove_proxy_if_exists \ + setup_agent \ + check_deploy_result; do + $step +done diff --git a/script_tools/gsectl/agent/darwin/gsectl b/script_tools/gsectl/agent/darwin/gsectl new file mode 100755 index 000000000..86bf7909b --- /dev/null +++ b/script_tools/gsectl/agent/darwin/gsectl @@ -0,0 +1,864 @@ +#!/bin/bash +# set -x +# vim:ft=sh sts=4 ts=4 expandtab + +# 切换到本脚本(gsectl)所在的目录,并设置WORK_HOME变量为上一级目录 +cd ${BASH_SOURCE%/*} 2>/dev/null +WORK_HOME=${PWD%/bin} +WORK_HOME=`echo $WORK_HOME |sed 's/\/$//g'` +INSTALL_ENV=`echo $WORK_HOME |awk -F/ '{print $(NF-1)}'` + +# 设置agent的max open files +ulimit -n 409600 2>/dev/null +ulimit -c unlimited + +usage () { + echo "useage: gsectl ACTION [MODULE_NAME ... ]" + echo "" + echo "ACTION list: start, stop, restart" + echo " start start gse_agent" + echo " stop stop gse_agent" + echo " restart restart gse_agent" + echo " reload reload gse_agent" + echo " watch watch gse_agent without systemd" +} + +# 启动agent +start_by_binary () { + + local ret=0 + local rt + local -a info + + info=( $(_status) ) + rt=$? 
+ case $rt in + 0) status="process:gse_agent pid:${info[0]} etime:${info[1]} Already RUNNING" ;; + 1) status="ERROR STATUS" ;; + 2) status="EXIT" ;; + 3) status="Reload failed" ;; + 4) status="have more than one ppid equal 1" ;; + esac + + if [ $rt -eq 0 ];then + printf "%s: %s\n" "gse_agent" "$status" + exit 0 + else + echo "have no gse_agent Running, status: $status, then starting" + fi + + if [ $rt -eq 4 ];then + if [ `ps -ef |grep -E gse_agent |grep -E -w $WORK_HOME |awk '$3 == 1' |grep -E -v grep |wc -l` -ge 1 ];then + echo "have more than one agentWorker process with ppid equal 1, need to kill" + #ps -ef |grep -E gse_agent |grep -E -w $WORK_HOME |awk '$3 == 1' |awk '{print $2}' |xargs kill -9 + fi + fi + + echo "start gse_agent ..." + ( ./gse_agent -f $WORK_HOME/etc/gse_agent.conf ) 1>/tmp/start_${node_type}_tmp.log 2>&1; sleep 3 + + + __status; + if [ $? -ne 0 ];then + if is_use_systemd ;then + systemctl status ${INSTALL_ENV}_${module} + else + tail /tmp/start_${node_type}_tmp.log + fi + return 1 + fi +} + +# 停止agent +stop_by_binary () { + # 调用gse_agent --quit停止进程,并等待它退出 + if [ -f ./gse_agent ]; then + ( ./gse_agent --quit ) >/dev/null 2>&1 + sleep 3 + else + echo "no such file: gse_agent. " + return 1 + fi + + _status + # 状态码为2的时候,表示进程不存在的了 + if [[ $? -eq 2 ]]; then + echo "gse agent stop successful" + return 0 + else + echo "gse agent stop failed" + return 1 + fi +} + +# 重启agent +restart_by_binary () { + stop_by_binary $module && start_by_binary $module +} + +# 重载agent +reload_by_binary () { + echo "reload gse_agent ..." + ( ./gse_agent --reload ) >/dev/null 2>&1; sleep 5 + + __status; +} + +# 检测agent状态 +status_by_binary () { + local rt + local -a info + + info=( $(_status) ) + rt=$? 
+    case $rt in
+        0) status="pid:${info[0]} etime:${info[1]} RUNNING" ;;
+        1) status="ERROR STATUS" ;;
+        2) status="EXIT" ;;
+        3) status="Reload failed" ;;
+        4) status="have more than one ppid equal 1" ;;
+    esac
+    printf "%s: %s\n" "gse_agent" "$status"
+    return $rt
+}
+
+# Check the agent's health: print whatever _healthz outputs and return its exit status.
+healthz_by_binary () {
+    local rt
+    local -a info
+
+    info=$(_healthz)
+    # Capture the status immediately; rt used to be returned without ever being set,
+    # so the function reported printf's status (0) instead.
+    rt=$?
+    printf "%s\n" "$info"
+    return $rt
+}
+
+red_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[031;1m$*\033[0m"; }
+blue_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[034;1m$*\033[0m"; }
+green_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[032;1m$*\033[0m"; }
+
+log () {
+    # Print a message and append it to the log file named by LOG_FILE.
+    # A first argument starting with "-" is handed to echo as an option.
+    # Returns the exit status that was current when log was called.
+    local retval=$?
+    local timestamp=$(date +%Y%m%d-%H%M%S)
+    local level=INFO
+    local func_seq=$(echo "${FUNCNAME[@]}" | sed 's/ /-/g')
+    local logfile=${LOG_FILE:=/tmp/watch_${INSTALL_ENV}_${node_type}.log}
+    local minute
+    local firstday
+    local last_month
+
+    # Monthly rotation: on the 1st of the month, at minute 00, the current log is
+    # moved aside under last month's YYYYMM suffix (skipped once that backup exists).
+    minute=$(date +%M)
+    firstday=$(date +%d)
+
+    if [ "$minute" == "00" -a "$firstday" == "01" ]; then
+        # BSD date (macOS) has no GNU-style -d "last month"; -v-1m shifts the month back by one.
+        last_month=$(date -v-1m '+%Y%m')
+        if [ -f ${LOG_FILE}_${last_month}.log ];then
+            echo "backup log already exists"
+        else
+            echo "[$(blue_echo ${EXTERNAL_IP}-$LAN_IP)]$timestamp $level|$BASH_LINENO|${func_seq} The current day is first day of month, reset the log file to new one ." >>$logfile
+            [ -f $LOG_FILE ] && mv $LOG_FILE ${LOG_FILE}_${last_month}.log
+            touch $LOG_FILE
+            if [ -f /tmp/watch_gse2_agent.log ];then
+                mv /tmp/watch_gse2_agent.log /tmp/watch_gse2_agent_${last_month}.log
+            fi
+        fi
+    fi
+
+    local opt=
+
+    if [ "${1:0:1}" == "-" ]; then
+        opt=$1
+        shift 1
+    else
+        opt=""
+    fi
+
+    echo -e $opt "[$(blue_echo ${EXTERNAL_IP:-$LAN_IP})]$timestamp|$BASH_LINENO\t$*"
+    echo "[$(blue_echo ${EXTERNAL_IP}-$LAN_IP)]$timestamp $level|$BASH_LINENO|${func_seq} $*" >>$logfile
+
+    return $retval
+}
+
+watch_by_binary () {
+    log "================================="
+    log "Start detecting..."
+    local module="agent"
+
+    # File that records how many times a restart attempt has already failed
+    LAST_RUN_FILE=/var/run/already_run_times_$module
+
+    # If the file exists, read the recorded count from it
+    if [ -f $LAST_RUN_FILE ]; then
+        run_count=$(cat $LAST_RUN_FILE)
+    else
+        run_count=0
+    fi
+
+    # On the hour, reset the counter and start detecting afresh
+    # Minute of the current time
+    minute=$(date +%M)
+
+    # Is it exactly on the hour?
+    if [ "$minute" == "00" ]; then
+        if [ -f $LAST_RUN_FILE -a $run_count -gt 0 ];then
+            log "The current time is on the hour, reset the counter $run_count -> 0, and restart the detection."
+ echo 0 > $LAST_RUN_FILE + fi + fi + + # 设置告警阈值 + THRESHOLD=5 + + # 检查上一次脚本是否存在 + if [ -f /var/run/gsectl_check_agent_status.pid ]; then + pid=`cat /var/run/gsectl_check_agent_status.pid` + if lsof -p $pid >/dev/null; then + log "`date +'%F %T.%N'` Last Script: $0 Detection status: PID:$pid is until running , no longer checking the status of the module: ${module}" + return + else + # 如果超过阈值,则发出告警 + if [ $run_count -ge $THRESHOLD ]; then + log "`date +'%F %T.%N'` Script: $0 Detection status: Failed to start the process, exceeded $run_count cycles, no longer checking the status of the module: ${module}" + return + else + log "`date +'%F %T.%N'` The previous script: $0 watch has ended, starting a new detection" + fi + fi + fi + + # 记录当前脚本的 PID + echo $$ > /var/run/gsectl_check_agent_status.pid + + # 检测gse_agent是否正常存在的逻辑 + if [ -z "${module}" ]; then + echo "watch: get module: ${module} failed" + log "watch: get module: ${module} failed" + else + if ! _status ${module}; then + stop_by_binary + start_by_binary + if [ $? 
-ne 0 ];then + log "`date +'%F %T.%N'` Process failed to start, increment counter" + run_count=$((run_count + 1)) + echo $run_count > $LAST_RUN_FILE + fi + else + if [ $run_count -ne 0 ];then + log "`date +'%F %T.%N'` The previous script: $0 Detection ${module} status is Running , then reset the count" + echo 0 > $LAST_RUN_FILE + fi + fi + fi + return +} + +start_by_systemd () { + if is_systemd_supported ;then + add_config_to_systemd + fi + + if is_use_systemd ;then + stop_by_binary + systemctl start ${INSTALL_ENV}_${module} + __status; + else + start_by_binary + fi +} + +stop_by_systemd () { + if is_use_systemd ;then + systemctl stop ${INSTALL_ENV}_${module} + __status; + else + stop_by_binary + fi +} + +restart_by_systemd () { + if is_systemd_supported ;then + add_config_to_systemd + fi + + if is_use_systemd ;then + stop_by_binary + systemctl restart ${INSTALL_ENV}_${module} + __status; + else + stop_by_binary + start_by_systemd + fi +} + +reload_by_systemd () { + if is_systemd_supported ;then + add_config_to_systemd + fi + + if is_use_systemd ;then + systemctl reload ${INSTALL_ENV}_${module} + __status; + else + reload_by_binary + fi +} + +status_by_systemd () { + if is_use_systemd ;then + systemctl status ${INSTALL_ENV}_${module} + else + status_by_binary + fi +} + +healthz_by_systemd () { + healthz_by_binary +} + +start_by_crontab () { + if is_use_systemd ;then + remove_systemd_config + start_by_binary + add_startup_to_boot + setup_crontab + else + start_by_binary + add_startup_to_boot + setup_crontab + fi + return +} + +stop_by_crontab () { + remove_crontab + stop_by_binary + return +} + + +reload_by_crontab () { + if is_use_systemd ;then + remove_systemd_config + reload_by_binary + add_startup_to_boot + setup_crontab + else + reload_by_binary + add_startup_to_boot + setup_crontab + fi + return +} + +restart_by_crontab () { + if is_use_systemd ;then + remove_systemd_config + restart_by_binary + add_startup_to_boot + setup_crontab + else + restart_by_binary + 
add_startup_to_boot + setup_crontab + fi + return +} + + +status_by_crontab () { + status_by_binary + return +} + +healthz_by_crontab () { + healthz_by_binary + return +} + +watch_by_crontab () { + watch_by_binary + return +} + + +start_by_rclocal () { + remove_crontab + if is_use_systemd ;then + remove_systemd_config + start_by_binary + else + start_by_binary + fi + + add_startup_to_boot + return +} + +stop_by_rclocal () { + stop_by_binary + return +} + +reload_by_rclocal () { + remove_crontab + if is_use_systemd ;then + remove_systemd_config + fi + + reload_by_binary + add_startup_to_boot + return +} + +restart_by_rclocal () { + remove_crontab + if is_use_systemd ;then + remove_systemd_config + fi + restart_by_binary + add_startup_to_boot + return +} + + +status_by_rclocal () { + status_by_binary + return +} + +healthz_by_rclocal () { + healthz_by_binary + return +} + +is_systemd_supported () { + # 是否支持 systemd, systemd:0, sysinit:1 + if [ "`ps -p 1 -o comm=`" == "systemd" ];then + return 0 + else + return 1 + fi +} + + +is_use_systemd () { + local module="agent" + if [ -f /usr/lib/systemd/system/${INSTALL_ENV}_${module}.service ];then + return 0 + else + return 1 + fi +} + +get_os_info () { + OS_INFO="-" + if [ -f "/proc/version" ]; then + OS_INFO="$OS_INFO $(cat /proc/version)" + fi + if [ -f "/etc/issue" ]; then + OS_INFO="$OS_INFO $(cat /etc/issue)" + fi + OS_INFO="$OS_INFO $(uname -a)" + OS_INFO=$(echo ${OS_INFO} | tr 'A-Z' 'a-z') +} + +get_os_type () { + get_os_info + OS_INFO=$(echo ${OS_INFO} | tr 'A-Z' 'a-z') + if [[ "${OS_INFO}" =~ "ubuntu" ]]; then + OS_TYPE="ubuntu" + RC_LOCAL_FILE="/etc/rc.local" + elif [[ "${OS_INFO}" =~ "centos" ]]; then + OS_TYPE="centos" + RC_LOCAL_FILE="/etc/rc.d/rc.local" + elif [[ "${OS_INFO}" =~ "coreos" ]]; then + OS_TYPE="coreos" + RC_LOCAL_FILE="/etc/rc.d/rc.local" + elif [[ "${OS_INFO}" =~ "freebsd" ]]; then + OS_TYPE="freebsd" + RC_LOCAL_FILE="/etc/rc.d/rc.local" + elif [[ "${OS_INFO}" =~ "debian" ]]; then + 
OS_TYPE="debian"
+        RC_LOCAL_FILE="/etc/rc.local"
+    elif [[ "${OS_INFO}" =~ "suse" ]]; then
+        OS_TYPE="suse"
+        RC_LOCAL_FILE="/etc/rc.d/rc.local"
+    elif [[ "${OS_INFO}" =~ "hat" ]]; then
+        OS_TYPE="redhat"
+        RC_LOCAL_FILE="/etc/rc.d/rc.local"
+    elif [[ "${OS_INFO}" =~ "darwin" ]]; then
+        OS_TYPE="macos"
+        RC_LOCAL_FILE="/etc/rc.d/rc.local"
+    fi
+}
+
+# Settle RC_LOCAL_FILE: keep the path chosen by get_os_type when that file exists,
+# otherwise use the first existing well-known rc.local location.
+check_rc_file () {
+    get_os_type
+    if [ -f "$RC_LOCAL_FILE" ]; then
+        return 0
+    elif [ -f "/etc/rc.d/rc.local" ]; then
+        RC_LOCAL_FILE="/etc/rc.d/rc.local"
+    elif [ -f "/etc/init.d/rc.local" ]; then
+        RC_LOCAL_FILE="/etc/init.d/rc.local"
+    elif [ -f "/etc/init.d/boot.local" ]; then
+        RC_LOCAL_FILE="/etc/init.d/boot.local"
+    else
+        RC_LOCAL_FILE="`readlink -f /etc/rc.local`"
+    fi
+}
+
+# Set DAEMON_FILE_PATH / DAEMON_FILE_NAME for the LaunchDaemon plist; the name is
+# built from the last two path components of WORK_HOME.
+get_daemon_file () {
+    DAEMON_FILE_PATH="/Library/LaunchDaemons/"
+    DAEMON_FILE_NAME="com.tencent.$(echo ${WORK_HOME%*/} | tr '/' '.' | awk -F '.' '{print $(NF-1)"."$NF}').Daemon.plist"
+}
+
+
+# Register "gsectl start" as a LaunchDaemon that runs at load: write the plist
+# under DAEMON_FILE_PATH, then load it with launchctl.
+add_startup_to_boot () {
+    get_daemon_file
+    # The XML tags of this property list had been stripped from the text; they are
+    # restored here in the standard launchd.plist layout.
+    bash -c "cat >$DAEMON_FILE_PATH$DAEMON_FILE_NAME" << EOF
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>com.tencent.$(echo ${WORK_HOME%*/} | tr '/' '.' | awk -F '.' '{print $(NF-1)"."$NF}')</string>
+    <key>ProgramArguments</key>
+    <array>
+        <string>${WORK_HOME}/bin/gsectl</string>
+        <string>start</string>
+    </array>
+    <key>RunAtLoad</key>
+    <true/>
+</dict>
+</plist>
+EOF
+    # Load by full path: the working directory is gsectl's own directory, not
+    # DAEMON_FILE_PATH, so the bare file name would not resolve.
+    launchctl load "${DAEMON_FILE_PATH}${DAEMON_FILE_NAME}"
+}
+
+add_config_to_systemd () {
+
+    local module="agent"
+cat > /tmp/${INSTALL_ENV}_${module}.service << EOF
+[Unit]
+Description=GSE2.0 Agent Daemon
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+LimitNOFILE=512000
+LimitCORE=infinity
+WorkingDirectory=${WORK_HOME}/bin
+PIDFile=${WORK_HOME}/bin/run/${module}.pid
+ExecStart=${WORK_HOME}/bin/gse_agent -f /usr/local/${INSTALL_ENV}/${node_type}/etc/gse_agent.conf
+ExecReload=${WORK_HOME}/bin/gse_agent --reload
+ExecStop=${WORK_HOME}/bin/gse_agent --quit
+Type=forking
+KillMode=process
+User=root
+Restart=always
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+    if [ -f /usr/lib/systemd/system/${INSTALL_ENV}_${module}.service ];then
+        if [ `md5sum /tmp/${INSTALL_ENV}_${module}.service |awk '{print $1}'` == `md5sum /usr/lib/systemd/system/${INSTALL_ENV}_${module}.service |awk '{print $1}'` ];then
+            echo "${INSTALL_ENV}_${module}.service have no change..."
+ else + echo "update ${INSTALL_ENV}_${module}.service" + cp /tmp/${INSTALL_ENV}_${module}.service /usr/lib/systemd/system/${INSTALL_ENV}_${module}.service + systemctl daemon-reload + systemctl enable ${INSTALL_ENV}_${module}.service + fi + else + echo "copy ${INSTALL_ENV}_${module}.service" + cp /tmp/${INSTALL_ENV}_${module}.service /usr/lib/systemd/system/${INSTALL_ENV}_${module}.service + systemctl daemon-reload + systemctl enable ${INSTALL_ENV}_${module}.service + fi + + # 删除rc.local里的启动项 + check_rc_file + sed -i "\|${WORK_HOME}/bin/gsectl start ${module}|d" $RC_LOCAL_FILE + + # 删除crontab里的watch条目 + remove_crontab +} + +remove_systemd_config (){ + local module="agent" + + if [ -f /usr/lib/systemd/system/${INSTALL_ENV}_${module}.service ];then + systemctl stop ${INSTALL_ENV}_${module}.service + systemctl disable ${INSTALL_ENV}_${module}.service + rm /usr/lib/systemd/system/${INSTALL_ENV}_${module}.service + fi +} + +setup_crontab () { + local tmpcron + + if [ -n "`crontab -l | grep \"$WORK_HOME/bin/gsectl\" |grep -E -v \"^#|\s+#\"`" ];then + echo "The watch detection entry is already in the crontab..." 
+ return 0 + fi + + tmpcron=/tmp/cron.XXXXXXX + + ( + crontab -l | grep -v "$WORK_HOME/bin/gsectl" + echo "#$WORK_HOME/bin/gsectl Agent check, add by NodeMan @ `date +'%F %T'`" + echo "* * * * * $WORK_HOME/bin/gsectl watch agent 1>>/tmp/watch_gse2_agent.log 2>&1" + ) > "$tmpcron" + + crontab "$tmpcron" && rm -f "$tmpcron" + crontab -l |grep -E "$WORK_HOME" +} + +remove_crontab (){ + local tmpcron + tmpcron=/tmp/cron.XXXXXX + + crontab -l |grep -E -v "$WORK_HOME" >$tmpcron + crontab $tmpcron && rm -f $tmpcron + + # 下面这段代码是为了确保修改的crontab立即生效 + if pgrep -x crond &>/dev/null; then + pkill -HUP -x crond + fi +} + +get_process_runtime (){ + local p_status tmp_gse_master_pid_info tmp_gse_agent_master_pids _pid PID + p_status=1 + + sleep 3 + + for i in {1..20} + do + tmp_gse_master_pid_info=$(ps -ax -o ppid,pid,comm | grep gse_agent | awk '{print $1 "|" $2 "|" $3}' | awk -F'|' '$1 == 1 && $3 ~ /gse_agent/' | awk -F'|' '{print $2}' | xargs) + read -r -a tmp_gse_agent_master_pids <<< "$tmp_gse_master_pid_info" + + for _pid in "${tmp_gse_agent_master_pids[@]}"; do + # tmp_abs_path=$(readlink -f /proc/$_pid/exe) + tmp_abs_path=$(lsof -p $_pid | awk '$4=="txt" {print $9}' | grep gse_agent) + tmp_abs_path=$(echo "${tmp_abs_path}" | sed 's/ (deleted)$//') # 防止异常情况下二进制更新但是还没重启进程 + # 两个路径都用readlink -f 防止有软链接目录 + # master既然存在,先判断路径是否包含WORK_HOME + if [ "$tmp_abs_path" == "$(readlink -f ${WORK_HOME}/bin/gse_agent)" ]; then + # 找到了匹配的pid + # 获取进程pid的启动时间 + PID=$_pid + START_TIME=$(ps -p "$PID" -o lstart= | sed 's/[[:space:]]*$//') + #START_TIME_S=$(date -d "$START_TIME" +%s) + START_TIME_S=$(date -j -f "%a %b %d %T %Y" "$START_TIME" "+%s") + CURRENT_TIME_S=$(date +%s) + TIME_DIFF=$(($CURRENT_TIME_S - $START_TIME_S)) + + if [ $TIME_DIFF -le 20 ]; then + echo "gse_agent -> $PID has been running for $TIME_DIFF seconds, check $i times" + p_status=0 + break 2 + else + echo "gse_agent -> $PID has been running for $TIME_DIFF seconds, restart not yet successful, check $i times" + sleep 1 + fi 
+ fi + done + done + return $p_status +} + +__status (){ + local module="agent" + + # 最多等待20s来判断是否真正启动成功 + for i in {0..20}; do + if [ "$action" == "stop" ];then + if [ $(ps -ax -o ppid,pid,comm | grep gse_${module} | awk '{print $1 "|" $2 "|" $3}' |grep -E "${WORK_HOME}" |wc -l) -eq 0 ];then + #if [ $(ps --no-header -C gse_${module} -o '%P|%p|%a' |grep -E "${WORK_HOME}" |wc -l) -eq 0 ];then + echo gse_${module} $action $action success + break + elif [ $i -eq 20 ];then + echo "gse_${module} $action $action failed" + return 1 + else + sleep 1 + fi + else + if _status >/dev/null; then + # 启动正常,直接退出,返回码0 + echo "gse agent start successful" + + if [ "$action" == "start" -o "$action" == "restart" ];then + get_process_runtime + if [ $? -ne 0 ];then + echo "gse_agent $action failed" + return 3 + fi + elif [ "$action" == "reload" ];then + for i in {0..5}; do + get_process_runtime + if [ $? -eq 0 ];then + break + elif [ $? -ne 0 ];then + sleep 2 + elif [ $i -eq 5 ];then + echo "gse_agent $action failed" + return 3 + fi + done + fi + + return 0 + elif [ $i -eq 20 ]; then + # i等于20,超时退出,返回码1 + echo "gse agent start failed" + return 1 + else + sleep 2 + fi + fi + done +} + +# 返回码: +# 0: 正常,且成对出现 +# 1:异常,存在master进程但是worker不存在 +# 2: 异常,没有master进程存在 +# 3: 异常,进程重启、reload、启动失败 +_status () { + local gse_master_info _pid pid abs_path + + if [ "$action" == "reload" ];then + # 如果是reload,需要新的进程启动,才能继续判断进程是否符合正常情况 + get_process_runtime + if [ $? 
-ne 0 ];then + echo "gse_agent $action failed" + return 3 + fi + fi + + # 初筛,考虑到gse组件的父、子进程都是名为gse_agent的,且它的父进程应该是等于1 + # ps的-o参数指定输出字段%P(ppid)、%p(pid)、%a(args) + # 所以下面命令是拉出所有进程名为gse_agent,且父进程为1,进程参数包含gse_agent的进程信息 + gse_master_pid_info=$(ps -ax -o ppid,pid,comm | grep gse_agent | awk '{print $1 "|" $2 "|" $3}' | awk -F'|' '$1 == 1 && $3 ~ /gse_agent/' | awk -F'|' '{print $2}' | xargs) + read -r -a gse_agent_master_pids <<< "$gse_master_pid_info" + + if [[ -z "$gse_agent_master_pids" ]]; then + # 连master都没有,那不用做更深入的判断,直接返回false + return 2 + fi + gse_master_pids_by_exe_path=() + + for _pid in "${gse_agent_master_pids[@]}"; do + #abs_path=$(readlink -f /proc/$_pid/exe) + abs_path=$(lsof -p $_pid | awk '$4=="txt" {print $9}' | grep gse_agent) + abs_path=$(echo "${abs_path}" | sed 's/ (deleted)$//') # 防止异常情况下二进制更新但是还没重启进程 + # 两个路径都用readlink -f 防止有软链接目录 + # master既然存在,先判断路径是否包含WORK_HOME + if [ "$abs_path" == "$(readlink -f ${WORK_HOME}/bin/gse_agent)" ]; then + # 找到了匹配的pid + gse_master_pids_by_exe_path+=($_pid) + fi + done + + agent_id_file=${WORK_HOME}/bin/run/agent.pid + if [[ ${#gse_master_pids_by_exe_path} -eq 0 ]]; then + # 连master都没有,那不用做更深入的判断,直接返回false + return 2 + elif [[ ${#gse_master_pids_by_exe_path[@]} -gt 1 && -f ${agent_id_file} ]]; then + # 兼容存在游离gse_agent worker进程的场景 + gse_master_pid=$(cat $agent_id_file) + return 4 + else + gse_master_pid=$gse_master_pids_by_exe_path + fi + + # 查看该gseMaster进程是否子进程Worker(>=1) + if [[ $(pgrep -P $gse_master_pid | wc -l) -eq 0 ]]; then + return 1 + fi + # 运行到这里时就可以获取进程状态详细信息输出到STDOUT,并返回0了 + # ps --no-header -p $gse_master_pid -o pid,etime,command + ps -p $gse_master_pid -o pid,etime,command | grep gse_agent + return 0 +} + +_healthz () { + ./gse_agent --healthz +} + +get_auto_type () { + # 由节点管理进行渲染,当前环境使用 {{ AUTO_TYPE }} + echo "{{ AUTO_TYPE }}" + return + if is_systemd_supported;then + echo "systemd" + else + echo "crontab" + fi +} + +detect_node_type () { + case $WORK_HOME in + *"$INSTALL_ENV"/proxy) 
node_type=proxy ;;
+        *"$INSTALL_ENV"/agent) node_type=agent ;;
+        *) node_type=unknown ;;
+    esac
+
+    echo $node_type >$WORK_HOME/.gse_node_type
+}
+
+# Run the handler function named by $1 with stdout/stderr mirrored to
+# /tmp/nm_<auto_type>_<action>.log, and return the handler's own exit status.
+# PIPESTATUS[0] is read right after the pipeline; plain "$?" would be tee's status.
+run_action () {
+    "$1" 2>&1 | tee /tmp/nm_"${auto_type}"_"${action}".log
+    return ${PIPESTATUS[0]}
+}
+
+# main
+action="$1"; shift
+module="agent"
+
+auto_type=$(get_auto_type)
+
+if [ -s $WORK_HOME/.gse_node_type ]; then
+    read node_type ignore <$WORK_HOME/.gse_node_type
+else
+    detect_node_type
+fi
+
+if [ "${node_type}" == "unknown" ];then
+    echo "wrong node type: ${node_type}"
+    # An unusable install path is an error, so do not exit with status 0.
+    exit 1
+fi
+# auto_type is quoted so that an unexpected value cannot break the test itself.
+if [ "$auto_type" == "systemd" ]; then
+    case $action in
+        start) run_action start_by_systemd ;;
+        stop) run_action stop_by_systemd ;;
+        restart) run_action restart_by_systemd ;;
+        status) run_action status_by_systemd ;;
+        reload) run_action reload_by_systemd ;;
+        healthz) run_action healthz_by_systemd ;;
+        -h|*) usage ; exit 255 ;;
+    esac
+elif [ "$auto_type" == "crontab" ]; then
+    case $action in
+        start) run_action start_by_crontab ;;
+        stop) run_action stop_by_crontab ;;
+        restart) run_action restart_by_crontab ;;
+        status) run_action status_by_crontab ;;
+        reload) run_action reload_by_crontab ;;
+        healthz) run_action healthz_by_crontab ;;
+        watch) run_action watch_by_crontab ;;
+        -h|*) usage ; exit 255 ;;
+    esac
+elif [ "$auto_type" == "rclocal" ]; then
+    case $action in
+        start) run_action start_by_rclocal ;;
+        stop) run_action stop_by_rclocal ;;
+        restart) run_action restart_by_rclocal ;;
+        status) run_action status_by_rclocal ;;
+        reload) run_action reload_by_rclocal ;;
+        healthz) run_action healthz_by_rclocal ;;
+        -h|*) usage ; exit 255 ;;
+    esac
+fi
+# $? is now the status returned by run_action, i.e. the handler's.
+exit $?
+
diff --git a/script_tools/plugin_scripts/fetch_used_ports.zsh b/script_tools/plugin_scripts/fetch_used_ports.zsh
new file mode 100644
index 000000000..aaeec41a5
--- /dev/null
+++ b/script_tools/plugin_scripts/fetch_used_ports.zsh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+echo "Showing used ports"
+
+echo "===== list of used ports begin ====="
+
+netstat -lntup | awk '{print $4}'
+
+echo "===== list of used ports end ====="
diff --git a/script_tools/plugin_scripts/operate_plugin.zsh b/script_tools/plugin_scripts/operate_plugin.zsh
new file mode 100644
index 000000000..f941c5aed
--- /dev/null
+++ b/script_tools/plugin_scripts/operate_plugin.zsh
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+usage () {
+    echo "usage: operate_plugin OPTIONS"
+    echo "OPTIONS"
+    echo " -t plugin category, could be: official/external/scripts"
+    echo " -n plugin name"
+    echo " -p home path of plugin"
+    echo " -c command to run"
+    echo " -g group id"
+    exit 0
+}
+
+guess_target_dir () {
+    case $CATEGORY in
+        official | 1)
+            export BINDIR=$INSTALL_PATH/plugins/bin
+            ;;
+        external | 2)
+            export BINDIR=$INSTALL_PATH/external_plugins/$GROUP_ID/$PLUGIN_NAME/
+            ;;
+        scripts | 3)
+            export BINDIR=$INSTALL_PATH/external_scripts/
+            ;;
+        *)
+            echo "unkown category.
abort" + exit 1 + ;; + esac +} + + +while getopts t:n:p:c:g: arg; do + case $arg in + t) export CATEGORY=$OPTARG ;; + n) export PLUGIN_NAME=${OPTARG} ;; + p) export INSTALL_PATH=${OPTARG} ;; + c) export RUN_CMD=${OPTARG} ;; + g) export GROUP_ID=${OPTARG} ;; + *) usage ;; + esac +done + +guess_target_dir + +cd ${BINDIR} +${RUN_CMD} diff --git a/script_tools/plugin_scripts/reload.zsh b/script_tools/plugin_scripts/reload.zsh new file mode 100755 index 000000000..7f9b910cd --- /dev/null +++ b/script_tools/plugin_scripts/reload.zsh @@ -0,0 +1,105 @@ +#!/bin/bash + +cd ${BASH_SOURCE%/*} + +red_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[031;1m$@\033[0m"; } +blue_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[034;1m$@\033[0m"; } +green_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[032;1m$@\033[0m"; } + +log () { + # 打印消息, 并记录到日志, 日志文件由 LOG_FILE 变量定义 + local retval=$? + local timestamp=$(date +%Y%m%d-%H%M%S) + local level=INFO + local func_seq=$(echo ${FUNCNAME[@]} | sed 's/ /-/g') + local logfile=${LOG_FILE:=/tmp/bkc.log} + + local opt= + + if [ "${1:0:1}" == "-" ]; then + opt=$1 + shift 1 + else + opt="" + fi + + echo -e $opt "$timestamp|$BASH_LINENO\t$@" + echo -e $opt "$timestamp $level|$BASH_LINENO|${func_seq} $@\n" >>$logfile + + return $retval +} + +usage () { + echo "usage: $0 PLUGIN_NAME" + exit 0 +} + +_status_windows_proc () { + local proc="$1" + local pids + + pids=( $(ps -efW | grep "bin/${proc}" | awk '{print $2}') ) + echo -n ${pids[@]} + + [ ${#pids[@]} -ne 0 ] +} + +_status_linux_proc () { + local proc="$1" + local pids + local __pids=() + + pids=$(ps xao pid,ppid,command | awk -v PROG="./$proc" '$3 == PROG { print $1 }') + for pid in ${pids[@]} ; do + abs_path=$(readlink -f /proc/$pid/exe) + if [ "${abs_path%/$proc*}" == "${PWD}" ] ; then + __pids=(${__pids} ${pid}) + fi + done + pids=(${__pids}) + echo -n ${pids[@]} + + [ ${#pids[@]} -ne 0 ] +} + +_status_darwin_proc () { + local proc="$1" + local pids + 
local __pids=() + + pids=$(ps xao pid,ppid,command | awk -v PROG="./$proc" '$3 == PROG { print $1 }') + for pid in ${pids[@]} ; do + abs_path=$(lsof -p $pid | awk '$4 == "txt" { print $9 }') + for _abs_path in ${abs_path[@]} ; do + if [ "${_abs_path%/$proc*}" == "${PWD}" ] ; then + __pids=(${__pids} ${pid}) + fi + done + done + pids=(${__pids[@]}) + + echo -n ${pids[@]} + + [ ${#pids[@]} -ne 0 ] +} + +_reload () { + kill -USR1 $(_status_${os_type}_proc $1) 2>/dev/null +} + +_status () { + local proc="$1" + + _status_${os_type}_proc $proc +} + +case $(uname -s) in + *Linux) os_type=linux ;; + *CYGWIN*) os_type=windows ;; + *Darwin*) os_type=darwin ;; +esac + +[ -z "$1" ] && usage + +log -n "reload $1 ..." +_reload $1 && green_echo " Done" || red_echo " Fail" \ No newline at end of file diff --git a/script_tools/plugin_scripts/remove_config.zsh b/script_tools/plugin_scripts/remove_config.zsh new file mode 100644 index 000000000..f02f09298 --- /dev/null +++ b/script_tools/plugin_scripts/remove_config.zsh @@ -0,0 +1,5 @@ +#!/bin/bash + +echo "remove config files" $@ + +rm -f $@ diff --git a/script_tools/plugin_scripts/restart.zsh b/script_tools/plugin_scripts/restart.zsh new file mode 100755 index 000000000..6521c5add --- /dev/null +++ b/script_tools/plugin_scripts/restart.zsh @@ -0,0 +1,4 @@ +#!/bin/bash + +cd ${BASH_SOURCE%/*} 2>/dev/null +./stop.sh $@ >/dev/null && ./start.sh $@ >/dev/null diff --git a/script_tools/plugin_scripts/start.zsh b/script_tools/plugin_scripts/start.zsh new file mode 100755 index 000000000..7a2ce8128 --- /dev/null +++ b/script_tools/plugin_scripts/start.zsh @@ -0,0 +1,117 @@ +#!/bin/bash + +cd ${BASH_SOURCE%/*} + +red_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[031;1m$@\033[0m" >&2; } +blue_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[034;1m$@\033[0m"; } +green_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[032;1m$@\033[0m"; } + +log () { + # 打印消息, 并记录到日志, 日志文件由 LOG_FILE 变量定义 + local retval=$? 
+ local timestamp=$(date +%Y%m%d-%H%M%S) + local level=INFO + local func_seq=$(echo ${FUNCNAME[@]} | sed 's/ /-/g') + local logfile=${LOG_FILE:=/tmp/bkc.log} + + local opt= + + if [ "${1:0:1}" == "-" ]; then + opt=$1 + shift 1 + else + opt="" + fi + + echo -e $opt "$timestamp $BASH_LINENO $@" + echo -e $opt "$timestamp $level|$BASH_LINENO|${func_seq} $@\n" >>$logfile + + return $retval +} + +usage () { + echo "usage: $0 PLUGIN_NAME" + exit 0 +} + +_status_windows_proc () { + local proc="$1" + + pids=( $(ps -efW | grep "${proc}.exe" | awk '{print $2}') ) + echo -n ${pids[@]} +} + +_status_linux_proc () { + local proc="$1" + local pids + local __pids=() + + pids=$(ps xao pid,ppid,command | awk -v PROG="./$proc" '$3 == PROG { print $1 }') + for pid in ${pids[@]} ; do + abs_path=$(readlink -f /proc/$pid/exe) + if [ "${abs_path%/$proc*}" == "${PWD}" ] ; then + __pids=(${__pids} ${pid}) + fi + done + pids=(${__pids}) + echo -n ${pids[@]} + + [ ${#pids[@]} -ne 0 ] +} + +_status_darwin_proc () { + local proc="$1" + local pids + local __pids=() + + pids=$(ps xao pid,ppid,command | awk -v PROG="./$proc" '$3 == PROG { print $1 }') + for pid in ${pids[@]} ; do + abs_path=$(lsof -p $pid | awk '$4 == "txt" { print $9 }') + for _abs_path in ${abs_path[@]} ; do + if [ "${_abs_path%/$proc*}" == "${PWD}" ] ; then + __pids=(${__pids} ${pid}) + fi + done + done + pids=(${__pids[@]}) + + echo -n ${pids[@]} + + [ ${#pids[@]} -ne 0 ] +} + +_status () { + local proc="$1" + + _status_${os_type}_proc $proc +} + +case $(uname -s) in + *Linux) os_type=linux ;; + *CYGWIN*) os_type=windows ;; + *Darwin*) os_type=darwin ;; +esac + +[ -z "$1" ] && usage + +log -n "start $1 ..." +if [ -f $1 ]; then + chmod +x ./$1 +else + red_echo "$1: file not exists($PWD)" + exit 1 +fi + +if [ -f ../etc/${1}.conf ]; then + nohup ./$1 -c ../etc/${1}.conf >/dev/null 2>/tmp/xuoasefasd.err & + sleep 1 + if _status $1; then + green_echo "Done" + else + red_echo "$(< /tmp/xuoasefasd.err). 
Fail"
+        exit 1
+    fi
+else
+    red_echo "config file ${1}.conf not exists"
+    exit 1
+fi
diff --git a/script_tools/plugin_scripts/stop.zsh b/script_tools/plugin_scripts/stop.zsh
new file mode 100755
index 000000000..9c58edb08
--- /dev/null
+++ b/script_tools/plugin_scripts/stop.zsh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+cd ${BASH_SOURCE%/*}
+
+red_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[031;1m$@\033[0m"; }
+blue_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[034;1m$@\033[0m"; }
+green_echo () { [ "$HASTTY" != "1" ] && echo "$@" || echo -e "\033[032;1m$@\033[0m"; }
+
+log () {
+    # Print a message and append it to the log file named by LOG_FILE
+    local retval=$?
+    local timestamp=$(date +%Y%m%d-%H%M%S)
+    local level=INFO
+    local func_seq=$(echo ${FUNCNAME[@]} | sed 's/ /-/g')
+    local logfile=${LOG_FILE:=/tmp/bkc.log}
+
+    local opt=
+
+    if [ "${1:0:1}" == "-" ]; then
+        opt=$1
+        shift 1
+    else
+        opt=""
+    fi
+
+    echo -e $opt "$timestamp|$BASH_LINENO\t$@"
+    echo -e $opt "$timestamp $level|$BASH_LINENO|${func_seq} $@\n" >>$logfile
+
+    return $retval
+}
+
+usage () {
+    echo "usage: $0 PLUGIN_NAME"
+    exit 0
+}
+
+_status_windows_proc () {
+    local proc="$1"
+    local pids
+
+    pids=( $(ps -efW | grep "bin/${proc}" | awk '{print $2}') )
+    echo -n ${pids[@]}
+
+    [ ${#pids[@]} -ne 0 ]
+}
+
+# Print the PIDs of "./<proc>" processes whose executable sits in $PWD (Linux, via /proc).
+# Returns 0 when at least one PID was found.
+_status_linux_proc () {
+    local proc="$1"
+    local pids
+    local __pids=()
+
+    pids=$(ps xao pid,ppid,command | awk -v PROG="./$proc" '$3 == PROG { print $1 }')
+    for pid in ${pids[@]} ; do
+        abs_path=$(readlink -f /proc/$pid/exe)
+        if [ "${abs_path%/$proc*}" == "${PWD}" ] ; then
+            # Append: in bash a bare "${__pids}" is only the first element, so rebuilding the array from it dropped PIDs.
+            __pids+=(${pid})
+        fi
+    done
+    pids=(${__pids[@]})
+
+    echo -n ${pids[@]}
+
+    [ ${#pids[@]} -ne 0 ]
+}
+
+# Same contract as _status_linux_proc, for macOS: the executable path is taken
+# from the "txt" entries that lsof reports for the process.
+_status_darwin_proc () {
+    local proc="$1"
+    local pids
+    local __pids=()
+
+    pids=$(ps xao pid,ppid,command | awk -v PROG="./$proc" '$3 == PROG { print $1 }')
+    for pid in ${pids[@]} ; do
+        abs_path=$(lsof -p $pid | awk '$4 == "txt" { print $9 }')
+        for _abs_path in ${abs_path[@]} ; do
+            if [ "${_abs_path%/$proc*}" == "${PWD}" ] ; then
+                # Append (see _status_linux_proc) and stop scanning so each PID is listed once.
+                __pids+=(${pid})
+                break
+            fi
+        done
+    done
+    pids=(${__pids[@]})
+
+    echo -n ${pids[@]}
+
+    [ ${#pids[@]} -ne 0 ]
+}
+
+_stop () {
+    kill -9 $(_status_${os_type}_proc $1) 2>/dev/null
+}
+
+_status () {
+    local proc="$1"
+
+    _status_${os_type}_proc $proc
+}
+
+case $(uname -s) in
+    *Linux) os_type=linux ;;
+    *CYGWIN*) os_type=windows ;;
+    *Darwin*) os_type=darwin ;;
+esac
+
+[ -z "$1" ] && usage
+
+log -n "stop $1 ..."
+_stop $1
+sleep 2
+if ! _status $1; then
+    green_echo "Done"
+else
+    red_echo "Fail"
+fi
\ No newline at end of file
diff --git a/script_tools/plugin_scripts/stop_debug.zsh b/script_tools/plugin_scripts/stop_debug.zsh
new file mode 100644
index 000000000..951f94ee9
--- /dev/null
+++ b/script_tools/plugin_scripts/stop_debug.zsh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+echo "clean debug plugin"
+
+while getopts n:p:g: arg; do
+    case $arg in
+        n) export PLUGIN_NAME=${OPTARG} ;;
+        p) export INSTALL_PATH=${OPTARG} ;;
+        g) export GROUP_ID=${OPTARG} ;;
+        *) usage ;;
+    esac
+done
+
+if [[ $GROUP_ID == "" ]];then
+    echo "group id can not be empty!"
+    exit 1
+fi
+
+export BINDIR=$INSTALL_PATH/external_plugins/$GROUP_ID/$PLUGIN_NAME/
+
+echo "Stopping debug process..."
+
+for file in $BINDIR/pid/*
+do
+    if test -f $file
+    then
+        pid=`cat $file`
+        echo "Found PID file: $file"
+        echo "PID to be killed: $pid"
+        kill -9 $pid
+    fi
+done
+
+echo "Removing plugin directory..."
+# ${VAR:?} aborts instead of expanding an empty variable into a path under "/".
+rm -rf "${INSTALL_PATH:?}/external_plugins/${GROUP_ID:?}/"
+
+exit 0
\ No newline at end of file
diff --git a/script_tools/plugin_scripts/update_binary.zsh b/script_tools/plugin_scripts/update_binary.zsh
new file mode 100644
index 000000000..f8c233026
--- /dev/null
+++ b/script_tools/plugin_scripts/update_binary.zsh
@@ -0,0 +1,170 @@
+#!/bin/bash
+# Install, upgrade or remove a plugin package below the GSE home directory.
+
+# Print the option summary and exit with status 1.
+usage () {
+    echo "usage: update_binary OPTIONS"
+    echo "OPTIONS"
+    echo " -t plugin category, could be: official/external/scripts"
+    echo " -r uninstall"
+    echo " -m upgrade type: append or override"
+    echo " -n plugin name"
+    echo " -f package of plugin"
+    echo " -p home path of plugin"
+    echo " -i plugin group id, for external plugin only"
+    #echo " -v plugin VERSION, for verification"
+
+    exit 1
+}
+
+# Archive the current $BINDIR and $ETCDIR (tar -P, paths stored as given) and
+# move the archive to $BACKUP_DIR/PLUGIN_NAME-backup.pz.
+backup () {
+    local backup_filename=${PLUGIN_NAME}-backup.pz
+
+    tar zcPf ${backup_filename} $BINDIR $ETCDIR
+    mkdir -p ${BACKUP_DIR}
+    mv ${backup_filename} ${BACKUP_DIR}
+}
+
+# Derive BINDIR/ETCDIR (and, for grouped external plugins, the staging
+# directory) from CATEGORY. The numeric aliases 1/2/3 are rewritten to their
+# names so that the later string comparisons against "OFFICIAL"/"EXTERNAL"
+# treat "-t 1" exactly like "-t official". Exits 1 on an unknown category.
+guess_target_dir () {
+    case $CATEGORY in
+        OFFICIAL | 1)
+            CATEGORY=OFFICIAL
+            export BINDIR=$GSE_HOME/plugins/bin
+            export ETCDIR=$GSE_HOME/plugins/etc
+            ;;
+        EXTERNAL | 2)
+            CATEGORY=EXTERNAL
+            if [ -n "$GROUP_DIR" ]; then
+                # A group id was given: install the plugin below that group's directory.
+                export BINDIR=$GSE_HOME/external_plugins/$GROUP_DIR/$PLUGIN_NAME/
+                export ETCDIR=$GSE_HOME/external_plugins/$GROUP_DIR/$PLUGIN_NAME/etc
+                export EXTERNAL_PLUGIN_TMPDIR=${EXTERNAL_PLUGIN_DIR}/external_plugins/${GROUP_DIR}/${PLUGIN_NAME}/
+                mkdir -p ${EXTERNAL_PLUGIN_TMPDIR}
+            else
+                export BINDIR=$GSE_HOME/external_plugins/$PLUGIN_NAME/
+                export ETCDIR=$GSE_HOME/external_plugins/$PLUGIN_NAME/etc
+            fi
+            ;;
+        SCRIPTS | 3)
+            CATEGORY=SCRIPTS
+            export BINDIR=$GSE_HOME/external_scripts/
+            export ETCDIR=$GSE_HOME/external_scripts/
+            ;;
+        *)
+            echo "unknown category $CATEGORY. abort"
+            exit 1
+            ;;
+    esac
+}
+
+REMOVE=0
+UPGRADE_TYPE=append
+RESERVE_CONF=0
+TMP=/tmp/
+BACKUP_DIR=/tmp/nodeman_backup/
+EXTERNAL_PLUGIN_DIR=/tmp/nodeman_external_plugins/
+
+while getopts rut:T:d:n:m:f:z:v:p:h:i: arg; do
+    case $arg in
+        T) TIMEOUT=$OPTARG ;;
+        t) export CATEGORY=$OPTARG ;;
+        r) export REMOVE=1 ;;
+        m) export UPGRADE_TYPE=$OPTARG ;;
+        n) export PLUGIN_NAME=${OPTARG} ;;
+        p) export GSE_HOME=${OPTARG} ;;
+        z) export TMP=${OPTARG} ;;
+        u) export RESERVE_CONF=1 ;;
+        #v) export VERSION=$OPTARG ;;
+        f) export PACKAGE=$OPTARG ;; # used for official / external plugins
+        i) export GROUP_DIR=$OPTARG ;;
+        *) usage ;;
+    esac
+done
+
+# Every target path is built from GSE_HOME; an empty value would point the
+# rm -rf calls below at directories directly under "/".
+[ -z "$GSE_HOME" ] && usage
+
+CATEGORY=$(echo $CATEGORY | tr 'a-z' 'A-Z')
+UPGRADE_TYPE=$(echo $UPGRADE_TYPE | tr 'a-z' 'A-Z')
+guess_target_dir
+
+if [ "$REMOVE" == 1 ]; then
+    cd $BINDIR/ || { echo "$PLUGIN_NAME is not installed, abort"; exit 0; }
+    # ./stop.sh ${PLUGIN_NAME} || echo "stop plugin $PLUGIN_NAME failed, ignored."
+
+    if [ "${CATEGORY}" == "OFFICIAL" ]; then
+        # Official plugins share BINDIR/ETCDIR, so only this plugin's binary
+        # and config are removed. BINDIR already ends in plugins/bin, hence no
+        # extra "bin/" component; ":?" refuses an empty plugin name.
+        rm -rf "$BINDIR/${PLUGIN_NAME:?}" "$ETCDIR/${PLUGIN_NAME}.conf"
+    else
+        rm -rf $BINDIR/
+    fi
+    exit $?
+fi
+
+
+mkdir -p $GSE_HOME
+
+if [ -d "$BINDIR" ]; then
+    backup $PLUGIN_NAME
+
+    if [ "${UPGRADE_TYPE}" != "APPEND" -a "${CATEGORY}" != "OFFICIAL" ]; then
+        echo "removing old plugin files"
+        # Skipped for official plugins: their package is extracted over the
+        # existing files. For the other categories the old directory is
+        # removed; ${BINDIR%/*} is BINDIR without its trailing slash.
+        rm -rf "${BINDIR%/*}"
+    fi
+fi
+
+# Extract the package; a relative package path is resolved from $TMP.
+cd $TMP
+echo "coming into: $TMP"
+
+if [ "${CATEGORY}" == "EXTERNAL" -a -n "$GROUP_DIR" ]; then
+    # Unpack into a staging directory first and copy from there, to avoid
+    # partially empty instance directories when several installs run
+    # concurrently on one host. The plugin is then copied from the package's
+    # standard layout to the group-specific path.
+    echo "unzip ${PACKAGE} to dir: ${EXTERNAL_PLUGIN_TMPDIR}"
+    tar xvf ${PACKAGE} -C ${EXTERNAL_PLUGIN_TMPDIR}
+    if [ $? -ne 0 ]; then
+        echo "unzip ${PACKAGE} to ${EXTERNAL_PLUGIN_TMPDIR} failed!"
+        exit 1
+    fi
+    mkdir -p $BINDIR
+    # No trailing "/" on the source: BSD cp (macOS) would otherwise copy the
+    # directory's contents instead of the directory itself.
+    cp -rf $EXTERNAL_PLUGIN_TMPDIR/external_plugins/$PLUGIN_NAME $(dirname "$BINDIR")
+    if [ $? -ne 0 ]; then
+        echo "cp -rf $EXTERNAL_PLUGIN_TMPDIR/external_plugins/$PLUGIN_NAME/ to $(dirname "$BINDIR") failed."
+        exit 1
+    else
+        rm -rf $EXTERNAL_PLUGIN_TMPDIR/external_plugins/$PLUGIN_NAME/
+    fi
+else
+    # Abort on a failed extraction, like the branch above, instead of
+    # finishing with exit status 0.
+    tar xvf $PACKAGE -C $GSE_HOME || { echo "unzip ${PACKAGE} to ${GSE_HOME} failed!"; exit 1; }
+fi
+
+# Restore $ETCDIR from the most recently modified matching backup archive.
+if [ "$RESERVE_CONF" == 1 ]; then
+    echo "recover config file"
+    tar xPf $(ls -rt $BACKUP_DIR/${PLUGIN_NAME}-*.pz | tail -1) $ETCDIR
+    if [ $? -ne 0 ]; then
+        exit 1
+    fi
+fi
+
+# Show the result of the update; for debugging only.
+#ls -l $BINDIR/$PLUGIN_NAME $ETCDIR/${PLUGIN_NAME}.conf
+#cat $ETCDIR/${PLUGIN_NAME}.conf