From b3977e282eca204adf57d43f33a027f18c733e29 Mon Sep 17 00:00:00 2001 From: crayon <873217631@qq.com> Date: Mon, 23 Oct 2023 15:46:48 +0800 Subject: [PATCH] =?UTF-8?q?feature:=20=E5=8F=AF=E8=A7=82=E6=B5=8B=E5=BB=BA?= =?UTF-8?q?=E8=AE=BE=20(closed=20#1852)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../collections/agent_new/install.py | 213 ++++++++++++-- apps/backend/components/collections/base.py | 83 +++++- apps/backend/components/collections/core.py | 3 + apps/backend/components/collections/plugin.py | 4 + apps/backend/constants.py | 2 + .../periodic_tasks/cache_scope_instances.py | 3 +- .../collect_auto_trigger_job.py | 2 + apps/backend/plugin/views.py | 1 + apps/backend/subscription/commons.py | 19 +- apps/backend/subscription/handler.py | 4 +- apps/backend/subscription/render_functions.py | 2 +- apps/backend/subscription/steps/plugin.py | 80 +++--- apps/backend/subscription/tasks.py | 207 +++++++++++-- apps/backend/subscription/tools.py | 144 +++++++--- apps/backend/subscription/views.py | 4 +- .../collections/agent_new/test_install.py | 121 ++++++++ apps/backend/tests/subscription/utils.py | 2 +- apps/backend/views.py | 2 +- apps/core/concurrent/cache.py | 128 +++++++++ apps/core/gray/tools.py | 4 +- apps/exceptions.py | 13 + apps/generic.py | 20 +- apps/iam/handlers/resources.py | 4 +- apps/node_man/constants.py | 11 +- apps/node_man/handlers/debug.py | 17 ++ apps/node_man/handlers/job.py | 4 +- apps/node_man/handlers/password.py | 40 ++- apps/node_man/handlers/policy.py | 4 +- .../migrations/0076_auto_20230924_2330.py | 77 +++++ apps/node_man/models.py | 31 +- .../add_biz_to_gse2_gray_scope.py | 4 +- .../periodic_tasks/resource_watch_task.py | 21 ++ .../tests/test_handlers/test_policy.py | 2 +- .../test_add_biz_to_gse2_gray_scope.py | 4 +- apps/node_man/tools/job.py | 10 +- apps/prometheus/helper.py | 213 ++++++++++++++ apps/prometheus/metrics.py | 271 ++++++++++++++++++ apps/prometheus/middlewares.py | 4 +- apps/prometheus/models.py | 27 +- apps/prometheus/reporter.py | 75 +++++ apps/utils/cache.py | 76 +++-- apps/utils/exc.py | 27 +- apps/utils/local.py | 39 ++- common/api/base.py | 4 +- common/api/modules/utils.py | 4 +- common/context_processors.py | 4 +- config/default.py | 154 ++++++---- config/patchers/__init__.py | 10 + config/patchers/monitor_reporter.py | 135 +++++++++ env/__init__.py | 35 +++ env/constants.py | 9 + script_tools/setup_pagent2.py | 152 +++++++--- .../helm/bk-nodeman/templates/_helpers.tpl | 2 + .../templates/configmaps/env-configmap.yaml | 22 ++ .../kubernetes/helm/bk-nodeman/values.yaml | 21 ++ .../#etc#supervisor-bknodeman-nodeman.conf | 4 +- .../templates/nodeman#bin#environ.sh | 4 + 57 files changed, 2243 insertions(+), 339 deletions(-) create mode 100644 apps/core/concurrent/cache.py create mode 100644 apps/node_man/migrations/0076_auto_20230924_2330.py create mode 100644 apps/prometheus/helper.py create mode 100644 apps/prometheus/metrics.py create mode 100644 apps/prometheus/reporter.py create mode 100644 config/patchers/__init__.py create mode 100644 config/patchers/monitor_reporter.py diff --git a/apps/backend/components/collections/agent_new/install.py b/apps/backend/components/collections/agent_new/install.py index 9229d7246..2ab99807e 100644 --- a/apps/backend/components/collections/agent_new/install.py +++ b/apps/backend/components/collections/agent_new/install.py @@ -15,8 +15,9 @@ import random import socket import time +import typing from collections import defaultdict -from typing 
import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union from django.conf import settings from django.utils import timezone, translation @@ -36,10 +37,14 @@ from apps.backend.utils.wmi import execute_cmd, put_file from apps.core.concurrent import controller from apps.core.remote import conns -from apps.exceptions import AuthOverdueException +from apps.exceptions import AuthOverdueException, parse_exception from apps.node_man import constants, models -from apps.utils import concurrent, exc, sync +from apps.prometheus import metrics +from apps.prometheus.helper import SetupObserve +from apps.utils import concurrent, sync +from apps.utils.exc import ExceptionHandler from common.api import JobApi +from common.log import logger from pipeline.core.flow import Service, StaticIntervalGenerator from .. import core @@ -56,6 +61,85 @@ def __init__(self, sub_inst_id: int, host: models.Host, installation_tool: Insta super().__init__(sub_inst_id=sub_inst_id, host=host, identity_data=installation_tool.identity_data) +def parse_common_labels_by_install_obj( + method: str, params: typing.Dict[str, typing.Any] +) -> typing.Dict[str, typing.Any]: + install_sub_inst_obj: InstallSubInstObj = params["install_sub_inst_obj"] + common_labels: typing.Dict[str, typing.Any] = { + "method": method, + "username": install_sub_inst_obj.conns_init_params["username"], + "port": install_sub_inst_obj.conns_init_params["port"], + "auth_type": install_sub_inst_obj.identity_data.auth_type, + "os_type": install_sub_inst_obj.host.os_type, + } + return common_labels + + +def parse_common_labels_by_host_identity( + method: str, params: typing.Dict[str, typing.Any] +) -> typing.Dict[str, typing.Any]: + host: models.Host = params["host"] + identity_data: models.IdentityData = params["identity_data"] + common_labels: typing.Dict[str, typing.Any] = { + "method": method, + "username": identity_data.account, + "port": identity_data.port, + "auth_type": identity_data.auth_type, + "os_type": host.os_type, + } + return common_labels + + +def execute_shell_solution_async_exc_handler( + wrapped: Callable, instance: base.BaseService, args: Tuple[Any], kwargs: Dict[str, Any], exc: Exception +) -> Optional[List]: + """ + 默认的单订阅实例任务异常处理,用于批量调用时规避单任务异常导致整体执行失败的情况 + :param wrapped: 被装饰的函数或类方法 + :param instance: 基础Pipeline服务 + :param exc: 捕获到异常 + :param args: 位置参数 + :param kwargs: 关键字参数 + :return: + """ + common_labels: typing.Dict[str, typing.Any] = parse_common_labels_by_install_obj("ssh", kwargs) + metrics.app_core_remote_connects_total.labels(**common_labels, status="failed").inc() + metrics.app_core_remote_connect_exceptions_total.labels(**common_labels, **parse_exception(exc)).inc() + return core.default_sub_inst_task_exc_handler(wrapped, instance, args, kwargs, exc) + + +def execute_windows_commands_exc_handler( + wrapped: Callable, instance: base.BaseService, args: Tuple[Any], kwargs: Dict[str, Any], exc: Exception +) -> Optional[List]: + """ + 默认的单订阅实例任务异常处理,用于批量调用时规避单任务异常导致整体执行失败的情况 + :param wrapped: 被装饰的函数或类方法 + :param instance: 基础Pipeline服务 + :param exc: 捕获到异常 + :param args: 位置参数 + :param kwargs: 关键字参数 + :return: + """ + common_labels: typing.Dict[str, typing.Any] = parse_common_labels_by_host_identity("wmiexe", kwargs) + metrics.app_core_remote_connects_total.labels(**common_labels, status="failed").inc() + metrics.app_core_remote_connect_exceptions_total.labels(**common_labels, **parse_exception(exc)).inc() + return core.default_sub_inst_task_exc_handler(wrapped, 
instance, args, kwargs, exc) + + +def execute_shell_solution_async_success_handler( + wrapped: Callable, instance: base.BaseService, args: Tuple[Any], kwargs: Dict[str, Any] +) -> None: + common_labels: typing.Dict[str, typing.Any] = parse_common_labels_by_install_obj("ssh", kwargs) + metrics.app_core_remote_connects_total.labels(**common_labels, status="success").inc() + + +def execute_windows_commands_success_handler( + wrapped: Callable, instance: base.BaseService, args: Tuple[Any], kwargs: Dict[str, Any] +) -> None: + common_labels: typing.Dict[str, typing.Any] = parse_common_labels_by_host_identity("wmiexe", kwargs) + metrics.app_core_remote_connects_total.labels(**common_labels, status="success").inc() + + class InstallService(base.AgentBaseService, remote.RemoteServiceMixin): __need_schedule__ = True interval = StaticIntervalGenerator(5) @@ -71,6 +155,12 @@ def outputs_format(self): Service.InputItem(name="polling_time", key="polling_time", type="int", required=True), ] + @SetupObserve( + histogram=metrics.app_core_remote_batch_execute_duration_seconds, + labels={"method": "job"}, + # 不统计异常耗时 + include_exception_histogram=False, + ) @controller.ConcurrentController( data_list_name="install_sub_inst_objs", batch_call_func=concurrent.batch_call, @@ -88,6 +178,12 @@ def handle_non_lan_inst(self, install_sub_inst_objs: List[InstallSubInstObj]) -> ] return concurrent.batch_call(func=self.execute_job_commands, params_list=params_list) + @SetupObserve( + histogram=metrics.app_core_remote_batch_execute_duration_seconds, + labels={"method": "wmiexe"}, + # 不统计异常耗时 + include_exception_histogram=False, + ) @controller.ConcurrentController( data_list_name="install_sub_inst_objs", batch_call_func=concurrent.batch_call, @@ -170,6 +266,12 @@ def _filter_params_list_in_next_step( params_list=_filter_params_list_in_next_step(run_install_params_list, succeed_sub_inst_ids), ) + @SetupObserve( + histogram=metrics.app_core_remote_batch_execute_duration_seconds, + labels={"method": "ssh"}, + # 不统计异常耗时 + include_exception_histogram=False, + ) @controller.ConcurrentController( data_list_name="install_sub_inst_objs", batch_call_func=concurrent.batch_call, @@ -271,16 +373,29 @@ def _execute(self, data, parent_data, common_data: base.AgentCommonData): remote_conn_helpers_gby_result_type = self.bulk_check_ssh(remote_conn_helpers=lan_windows_sub_inst) - succeed_non_lan_inst_ids = self.handle_non_lan_inst(install_sub_inst_objs=non_lan_sub_inst) - succeed_lan_windows_sub_inst_ids = self.handle_lan_windows_sub_inst( - install_sub_inst_objs=remote_conn_helpers_gby_result_type.get( - remote.SshCheckResultType.UNAVAILABLE.value, [] - ) + if non_lan_sub_inst: + succeed_non_lan_inst_ids = self.handle_non_lan_inst(install_sub_inst_objs=non_lan_sub_inst) + else: + succeed_non_lan_inst_ids = [] + + unavailable_ssh_lan_windows_sub_inst = remote_conn_helpers_gby_result_type.get( + remote.SshCheckResultType.UNAVAILABLE.value, [] ) - succeed_lan_shell_sub_inst_ids = self.handle_lan_shell_sub_inst( - install_sub_inst_objs=lan_linux_sub_inst - + remote_conn_helpers_gby_result_type.get(remote.SshCheckResultType.AVAILABLE.value, []) + if unavailable_ssh_lan_windows_sub_inst: + succeed_lan_windows_sub_inst_ids = self.handle_lan_windows_sub_inst( + install_sub_inst_objs=unavailable_ssh_lan_windows_sub_inst + ) + else: + succeed_lan_windows_sub_inst_ids = [] + + lan_shell_sub_inst = lan_linux_sub_inst + remote_conn_helpers_gby_result_type.get( + remote.SshCheckResultType.AVAILABLE.value, [] ) + if lan_shell_sub_inst: + 
succeed_lan_shell_sub_inst_ids = self.handle_lan_shell_sub_inst(install_sub_inst_objs=lan_shell_sub_inst) + else: + succeed_lan_shell_sub_inst_ids = [] + # 使用 filter 移除并发过程中抛出异常的实例 data.outputs.scheduling_sub_inst_ids = list( filter( @@ -295,7 +410,7 @@ def _execute(self, data, parent_data, common_data: base.AgentCommonData): ) data.outputs.polling_time = 0 - @exc.ExceptionHandler(exc_handler=core.default_sub_inst_task_exc_handler) + @ExceptionHandler(exc_handler=core.default_sub_inst_task_exc_handler) def get_gse_config_tuple( self, sub_inst_id: int, @@ -308,7 +423,15 @@ def get_gse_config_tuple( content = agent_step_adapter.get_config(host=host, filename=file_name, node_type=general_node_type, ap=ap) return REDIS_AGENT_CONF_KEY_TPL.format(file_name=file_name, sub_inst_id=sub_inst_id), content - @exc.ExceptionHandler(exc_handler=core.default_sub_inst_task_exc_handler) + @ExceptionHandler( + exc_handler=execute_windows_commands_exc_handler, success_handler=execute_windows_commands_success_handler + ) + @SetupObserve( + histogram=metrics.app_core_remote_execute_duration_seconds, + labels={"method": "wmiexe"}, + # 不统计异常耗时 + include_exception_histogram=False, + ) @base.RetryHandler(interval=0, retry_times=2, exception_types=[ConnectionResetError]) def execute_windows_commands( self, sub_inst_id: int, host: models.Host, commands: List[str], identity_data: models.IdentityData @@ -364,7 +487,7 @@ def execute_windows_commands( return sub_inst_id - @exc.ExceptionHandler(exc_handler=core.default_sub_inst_task_exc_handler) + @ExceptionHandler(exc_handler=core.default_sub_inst_task_exc_handler) @base.RetryHandler(interval=0, retry_times=2, exception_types=[ConnectionResetError]) def push_curl_exe( self, @@ -399,7 +522,13 @@ def push_curl_exe( raise e return sub_inst_id - @exc.ExceptionHandler(exc_handler=core.default_sub_inst_task_exc_handler) + @ExceptionHandler(exc_handler=core.default_sub_inst_task_exc_handler) + @SetupObserve( + histogram=metrics.app_core_remote_execute_duration_seconds, + labels={"method": "job"}, + # 不统计异常耗时 + include_exception_histogram=False, + ) def execute_job_commands(self, sub_inst_id, installation_tool: InstallationTools): # p-agent 走 作业平台,再 ssh 到 p-agent,这样可以无需保存 proxy 密码 host = installation_tool.host @@ -481,7 +610,16 @@ def execute_job_commands(self, sub_inst_id, installation_tool: InstallationTools host.save(update_fields=["upstream_nodes"]) return sub_inst_id - @exc.ExceptionHandler(exc_handler=core.default_sub_inst_task_exc_handler) + @ExceptionHandler( + exc_handler=execute_shell_solution_async_exc_handler, + success_handler=execute_shell_solution_async_success_handler, + ) + @SetupObserve( + histogram=metrics.app_core_remote_execute_duration_seconds, + labels={"method": "ssh"}, + # 不统计异常耗时 + include_exception_histogram=False, + ) async def execute_shell_solution_async( self, meta: Dict[str, Any], sub_inst_id: int, install_sub_inst_obj: InstallSubInstObj ) -> int: @@ -537,7 +675,7 @@ async def execute_shell_solution_async( return sub_inst_id - def handle_report_data(self, sub_inst_id: int, success_callback_step: str) -> Dict: + def handle_report_data(self, host: models.Host, sub_inst_id: int, success_callback_step: str) -> Dict: """处理上报数据""" name = REDIS_INSTALL_CALLBACK_KEY_TPL.format(sub_inst_id=sub_inst_id) # 先计算出要从redis取数据的长度 @@ -560,11 +698,19 @@ def handle_report_data(self, sub_inst_id: int, success_callback_step: str) -> Di tag = data.get("prefix") or "[script]" log = f"{tag} [{step}] {data['log']}" status = data["status"] + labels = {"step": step, 
"os_type": host.os_type, "node_type": host.node_type} if status == "FAILED": error_log = log + logger.warning( + f"[app_core_remote:handle_report_data:scripts_error] install failed: " + f"sub_inst_id -> {sub_inst_id}, labels -> {labels}, log -> {error_log}" + ) + metrics.app_core_remote_install_exceptions_total.labels(**labels).inc() is_finished = True else: - logs.append(log) + if step != "metrics": + logs.append(log) + if step == "report_cpu_arch": cpu_arch = data["log"] elif step == "report_agent_id": @@ -595,9 +741,34 @@ def handle_report_data(self, sub_inst_id: int, success_callback_step: str) -> Di except json.decoder.JSONDecodeError: # 如果 healthz 不可解析,记录原文并打上标记 healthz_result_dict = {"is_parseable": False, "log": data["log"]} + logger.warning( + f"[app_core_remote:handle_report_data:scripts_error] healthz decode failed: " + f"sub_inst_id -> {sub_inst_id}, labels -> {labels}, result -> {healthz_result_dict}" + ) + metrics.app_core_remote_install_exceptions_total.labels(**labels).inc() logs.append(f"{tag} [{step}] parse healthz result: \n {json.dumps(healthz_result_dict, indent=4)}") + elif step == "metrics": + logger.info(f"[app_core_remote:handle_report_data] sub_inst_id -> {sub_inst_id}, data -> {data}") + try: + name: str = data["metrics"]["name"] + if name == "app_core_remote_proxy_info": + metrics.app_core_remote_proxy_info.labels(**data["metrics"]["labels"]).set(1) + elif name == "app_core_remote_connect_exceptions_total": + metrics.app_core_remote_connect_exceptions_total.labels(**data["metrics"]["labels"]).inc() + elif name == "app_core_remote_execute_duration_seconds": + metrics.app_core_remote_execute_duration_seconds.labels(**data["metrics"]["labels"]).observe( + data["metrics"]["data"]["cost_time"] + ) + elif name == "app_core_remote_connects_total": + metrics.app_core_remote_connects_total.labels(**data["metrics"]["labels"]).inc() + except Exception: + logger.exception( + f"[app_core_remote:handle_report_data:metrics] sub_inst_id -> {sub_inst_id}, data -> {data}" + ) + metrics.app_core_remote_install_exceptions_total.labels(**labels).inc() + # 只要匹配到成功返回步骤完成,则认为是执行完成了 if step == success_callback_step and status == "DONE": is_finished = True @@ -633,7 +804,11 @@ def _schedule(self, data, parent_data, callback_data=None): return params_list = [ - {"sub_inst_id": sub_inst_id, "success_callback_step": success_callback_step} + { + "host": common_data.host_id_obj_map[common_data.sub_inst_id__host_id_map[sub_inst_id]], + "sub_inst_id": sub_inst_id, + "success_callback_step": success_callback_step, + } for sub_inst_id in scheduling_sub_inst_ids ] host_id__sub_inst_map: Dict[int, models.SubscriptionInstanceRecord] = { diff --git a/apps/backend/components/collections/base.py b/apps/backend/components/collections/base.py index ed8d10c39..72ac75efc 100644 --- a/apps/backend/components/collections/base.py +++ b/apps/backend/components/collections/base.py @@ -8,6 +8,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
""" +import logging import os import traceback import typing @@ -34,12 +35,16 @@ from apps.adapters.api.gse import GseApiBaseHelper, get_gse_api_helper from apps.backend.subscription import errors from apps.core.files.storage import get_storage +from apps.exceptions import parse_exception from apps.node_man import constants, models +from apps.prometheus import metrics +from apps.prometheus.helper import SetupObserve from apps.utils import cache, time_handler, translation from apps.utils.exc import ExceptionHandler -from common.log import logger from pipeline.core.flow import Service +logger = logging.getLogger("celery") + class ActivityType: HEAD = 0 @@ -87,11 +92,15 @@ def service_run_exc_handler( act_name = data.get_one_of_inputs("act_name") sub_inst_ids = instance.get_subscription_instance_ids(data) + code = instance.__class__.__name__ - error_msg = _("{act_name} 失败: {exc},请先尝试查看错误日志进行处理,若无法解决,请联系管理员处理").format(act_name=act_name, exc=str(exc)) - logger.exception(error_msg) + metrics.app_task_engine_service_run_exceptions_total.labels(code=code, **parse_exception(exc)).inc() + logger.exception(f"[task_engine][service_run_exc_handler:{code}] act_name -> {act_name}, exc -> {str(exc)}") + + error_msg = _("{act_name} 失败: {exc},请先尝试查看错误日志进行处理,若无法解决,请联系管理员处理").format(act_name=act_name, exc=str(exc)) instance.bulk_set_sub_inst_act_status( + data=data, sub_inst_ids=sub_inst_ids, status=constants.JobStatusType.FAILED, common_log=instance.log_maker.error_log(error_msg), @@ -115,6 +124,12 @@ def get_language_func( return data.get_one_of_inputs("blueking_language") +def get_labels_func( + wrapped: Callable, instance: "BaseService", args: Tuple[Any], kwargs: Dict[str, Any] +) -> typing.Dict[str, str]: + return {"code": instance.__class__.__name__} + + class LogMixin: # 日志类 @@ -252,19 +267,40 @@ def sub_inst_failed_handler(self, sub_inst_ids: Union[List[int], Set[int]]): """ raise NotImplementedError() - def bulk_set_sub_inst_status(self, status: str, sub_inst_ids: Union[List[int], Set[int]]): + @SetupObserve(histogram=metrics.app_task_engine_set_sub_inst_statuses_duration_seconds) + def bulk_set_sub_inst_status(self, data, status: str, sub_inst_ids: Union[List[int], Set[int]]): """批量设置实例状态,对于实例及原子的状态更新只应该在base内部使用""" models.SubscriptionInstanceRecord.objects.filter(id__in=sub_inst_ids).update( status=status, update_time=timezone.now() ) + # status -> PENDING -> RUNNING -> FAILED | SUCCESS + metrics.app_task_engine_sub_inst_statuses_total.labels(status=status).inc(len(sub_inst_ids)) + + meta: Dict[str, Any] = self.get_meta(data) + steps: List[Dict] = meta.get("STEPS") or [] + gse_version: str = meta.get("GSE_VERSION") or "unknown" + for step in steps: + metrics.app_task_engine_sub_inst_step_statuses_total.labels( + step_id=step.get("id") or "unknown", + step_type=step.get("type") or "unknown", + step_num=len(steps), + step_index=step.get("index") or 0, + gse_version=gse_version, + action=step.get("action") or "unknown", + code=self.__class__.__name__, + status=status, + ).inc(amount=len(sub_inst_ids)) + if status in [constants.JobStatusType.FAILED]: self.sub_inst_failed_handler(sub_inst_ids) + @SetupObserve(histogram=metrics.app_task_engine_set_sub_inst_act_statuses_duration_seconds) def bulk_set_sub_inst_act_status( - self, sub_inst_ids: Union[List[int], Set[int]], status: str, common_log: str = None + self, data, sub_inst_ids: Union[List[int], Set[int]], status: str, common_log: str = None ): """ 批量设置实例状态 + :param data: :param sub_inst_ids: :param status: :param common_log: 全局日志,用于需要全局暴露的异常 @@ 
-281,7 +317,7 @@ def bulk_set_sub_inst_act_status( # 失败的实例需要更新汇总状态 if status in [constants.JobStatusType.FAILED]: - self.bulk_set_sub_inst_status(constants.JobStatusType.FAILED, sub_inst_ids) + self.bulk_set_sub_inst_status(data, constants.JobStatusType.FAILED, sub_inst_ids) @staticmethod def get_subscription_instance_ids(data): @@ -293,6 +329,13 @@ def get_subscription_instance_ids(data): subscription_instance_ids = succeeded_subscription_instance_ids return subscription_instance_ids + @staticmethod + def get_meta(data) -> Dict[str, Any]: + meta: Dict[str, Any] = data.get_one_of_inputs("meta", {}) + if "STEPS" not in meta: + meta["STEPS"] = [] + return meta + @classmethod def get_common_data(cls, data): """ @@ -334,7 +377,7 @@ def get_common_data(cls, data): sub_inst_id__host_id_map=sub_inst_id__host_id_map, host_id__sub_inst_id_map=host_id__sub_inst_id_map, ap_id_obj_map=ap_id_obj_map, - gse_api_helper=get_gse_api_helper(gse_version=data.get_one_of_inputs("meta", {}).get("GSE_VERSION")), + gse_api_helper=get_gse_api_helper(gse_version=cls.get_meta(data).get("GSE_VERSION")), subscription=subscription, subscription_step=subscription_step, subscription_instances=subscription_instances, @@ -343,6 +386,7 @@ def get_common_data(cls, data): def set_current_id(self, subscription_instance_ids: List[int]): # 更新当前实例的pipeline id + # TODO 偶发死锁 models.SubscriptionInstanceRecord.objects.filter(id__in=subscription_instance_ids).update(pipeline_id=self.id) def set_outputs_data(self, data, common_data: CommonData) -> bool: @@ -367,7 +411,7 @@ def run(self, service_func, data, parent_data, **kwargs) -> bool: act_type = data.get_one_of_inputs("act_type") # 流程起始设置RUNNING if service_func == self._execute and act_type in [ActivityType.HEAD, ActivityType.HEAD_TAIL]: - self.bulk_set_sub_inst_status(constants.JobStatusType.RUNNING, subscription_instance_ids) + self.bulk_set_sub_inst_status(data, constants.JobStatusType.RUNNING, subscription_instance_ids) service_func(data, parent_data, **kwargs) @@ -387,6 +431,7 @@ def run(self, service_func, data, parent_data, **kwargs) -> bool: ) self.bulk_set_sub_inst_act_status( + data=data, sub_inst_ids=revoked_subscription_instance_ids, status=constants.JobStatusType.FAILED, common_log=self.log_maker.warning_log( @@ -410,6 +455,7 @@ def run(self, service_func, data, parent_data, **kwargs) -> bool: # failed_subscription_instance_id_set - sub_inst_ids_previous_failed_set 取差集,仅更新本轮失败的订阅实例详情 self.bulk_set_sub_inst_act_status( + data=data, sub_inst_ids=failed_subscription_instance_id_set - previous_failed_subscription_instance_id_set, status=constants.JobStatusType.FAILED, common_log=self.log_maker.error_log( @@ -422,6 +468,7 @@ def run(self, service_func, data, parent_data, **kwargs) -> bool: return bool(succeeded_subscription_instance_ids) self.bulk_set_sub_inst_act_status( + data=data, sub_inst_ids=succeeded_subscription_instance_ids, status=constants.JobStatusType.SUCCESS, common_log=self.log_maker.info_log(_("{act_name} 成功").format(act_name=act_name)), @@ -430,16 +477,29 @@ def run(self, service_func, data, parent_data, **kwargs) -> bool: # 流程结束设置成功的实例 if act_type in [ActivityType.TAIL, ActivityType.HEAD_TAIL]: self.bulk_set_sub_inst_status( - constants.JobStatusType.SUCCESS, sub_inst_ids=succeeded_subscription_instance_ids + data, constants.JobStatusType.SUCCESS, sub_inst_ids=succeeded_subscription_instance_ids ) return bool(succeeded_subscription_instance_ids) @translation.RespectsLanguage(get_language_func=get_language_func) + @SetupObserve( + 
gauge=metrics.app_task_engine_running_executes_info, + histogram=metrics.app_task_engine_execute_duration_seconds, + get_labels_func=get_labels_func, + ) @ExceptionHandler(exc_handler=service_run_exc_handler) def execute(self, data, parent_data): common_data = self.get_common_data(data) act_name = data.get_one_of_inputs("act_name") + act_type = data.get_one_of_inputs("act_type") + if act_type in [ActivityType.HEAD, ActivityType.HEAD_TAIL]: + logger.info( + "[sub_lifecycle][engine] enter", + common_data.subscription.id, + common_data.subscription_instances[0].task_id, + ) + subscription_instance_ids = self.get_subscription_instance_ids(data) to_be_created_sub_statuses = [ models.SubscriptionInstanceStatusDetail( @@ -456,6 +516,11 @@ def execute(self, data, parent_data): return self.run(self._execute, data, parent_data, common_data=common_data) @translation.RespectsLanguage(get_language_func=get_language_func) + @SetupObserve( + gauge=metrics.app_task_engine_running_schedules_info, + histogram=metrics.app_task_engine_schedule_duration_seconds, + get_labels_func=get_labels_func, + ) @ExceptionHandler(exc_handler=service_run_exc_handler) def schedule(self, data, parent_data, callback_data=None): return self.run(self._schedule, data, parent_data, callback_data=callback_data) diff --git a/apps/backend/components/collections/core.py b/apps/backend/components/collections/core.py index 1fb7bde11..4edc8b9cc 100644 --- a/apps/backend/components/collections/core.py +++ b/apps/backend/components/collections/core.py @@ -19,6 +19,7 @@ from apps.core.concurrent import core_concurrent_constants from apps.node_man import models from apps.utils import enum +from common.log import logger from . import base @@ -110,7 +111,9 @@ def default_task_exc_handler( :param kwargs: 关键字参数 :return: """ + code = instance.__class__.__name__ sub_inst_id = sub_inst_id_extractor(args, kwargs) + logger.exception(f"[task_engine][service_task_exc_handler:{code}] sub_inst_id -> {sub_inst_id}, exc -> {str(exc)}") instance.move_insts_to_failed(sub_inst_id if isinstance(sub_inst_id, Iterable) else [sub_inst_id], str(exc)) # 打印 DEBUG 日志 instance.log_debug(sub_inst_id, log_content=traceback.format_exc(), fold=True) diff --git a/apps/backend/components/collections/plugin.py b/apps/backend/components/collections/plugin.py index e01a7109e..828210398 100644 --- a/apps/backend/components/collections/plugin.py +++ b/apps/backend/components/collections/plugin.py @@ -50,6 +50,8 @@ from apps.exceptions import AppBaseException, ComponentCallError from apps.node_man import constants, exceptions, models from apps.node_man.handlers.cmdb import CmdbHandler +from apps.prometheus import metrics +from apps.prometheus.helper import SetupObserve from apps.utils import cache, md5 from apps.utils.batch_request import request_multi_thread from apps.utils.files import PathHandler @@ -90,6 +92,7 @@ class PluginBaseService(BaseService, metaclass=abc.ABCMeta): """ @classmethod + @SetupObserve(histogram=metrics.app_task_engine_get_common_data_duration_seconds, labels={"step_type": "PLUGIN"}) def get_common_data(cls, data): """ 初始化常用数据,注意这些数据不能放在 self 属性里,否则会产生较大的 process snap shot, @@ -968,6 +971,7 @@ def _execute(self, data, parent_data, common_data: PluginCommonData): {"group_id": process_status.group_id}, context, package_obj=package, + source="engine", ) process_status.configs = rendered_configs process_status.save() diff --git a/apps/backend/constants.py b/apps/backend/constants.py index 6b156d485..e3b681e3c 100644 --- a/apps/backend/constants.py +++ 
b/apps/backend/constants.py @@ -95,6 +95,7 @@ class PluginMigrateType: REMOVE_FROM_SCOPE = "REMOVE_FROM_SCOPE" NOT_SYNC_HOST = "NOT_SYNC_HOST" MANUAL_OP_EXEMPT = "MANUAL_OP_EXEMPT" + ABNORMAL_AGENT_STATUS = "ABNORMAL_AGENT_STATUS" MIGRATE_TYPE_ALIAS_MAP = { NEW_INSTALL: _("新安装"), @@ -102,6 +103,7 @@ class PluginMigrateType: CONFIG_CHANGE: _("配置变更"), PROC_NUM_NOT_MATCH: _("进程数量不匹配"), ABNORMAL_PROC_STATUS: _("进程状态异常"), + ABNORMAL_AGENT_STATUS: _("Agent 状态异常"), NOT_CHANGE: _("无需变更"), REMOVE_FROM_SCOPE: _("从范围中移除"), NOT_SYNC_HOST: _("未同步的主机"), diff --git a/apps/backend/periodic_tasks/cache_scope_instances.py b/apps/backend/periodic_tasks/cache_scope_instances.py index 2ca2b0891..932e74f71 100644 --- a/apps/backend/periodic_tasks/cache_scope_instances.py +++ b/apps/backend/periodic_tasks/cache_scope_instances.py @@ -31,7 +31,7 @@ def get_instances_by_scope_task(subscription_id): f" scope_md5: {scope_md5}, scope: {subscription.scope}" ) # 查询后会进行缓存,详见 get_instances_by_scope 的装饰器 func_cache_decorator - tools.get_instances_by_scope(subscription.scope) + tools.get_instances_by_scope(subscription.scope, source="get_instances_by_scope_task") logger.info(f"[cache_subscription_scope_instances] (subscription: {subscription_id}) end.") @@ -43,6 +43,7 @@ def get_instances_by_scope_task(subscription_id): def cache_scope_instances(): """定时缓存订阅范围实例,用于提高 instance_status、statistics 等接口的速度""" subscriptions = models.Subscription.objects.filter(enable=True, is_deleted=False) + # TODO 可以再按 scope md5 聚合一次,避免重复缓存 count = subscriptions.count() for index, subscription in enumerate(subscriptions): countdown = calculate_countdown(count=count, index=index, duration=constants.SUBSCRIPTION_UPDATE_INTERVAL) diff --git a/apps/backend/periodic_tasks/collect_auto_trigger_job.py b/apps/backend/periodic_tasks/collect_auto_trigger_job.py index 2c860f8fa..f90a3b3f4 100644 --- a/apps/backend/periodic_tasks/collect_auto_trigger_job.py +++ b/apps/backend/periodic_tasks/collect_auto_trigger_job.py @@ -13,6 +13,7 @@ from collections import defaultdict from celery.task import periodic_task +from django.conf import settings from django.db import transaction from django.db.models import Q @@ -99,6 +100,7 @@ def collect_auto_trigger_job(): statistics={f"{k}_count": 0 for k in ["success", "failed", "pending", "running", "total"]}, error_hosts=[], created_by="admin", + from_system=settings.APP_CODE, # TODO 将历史多个自动触发task先行整合到一个job,后续根据实际情况考虑是否拆分 task_id_list=task_ids_gby_sub_id[subscription["id"]], is_auto_trigger=True, diff --git a/apps/backend/plugin/views.py b/apps/backend/plugin/views.py index e91b88446..36c0e2895 100644 --- a/apps/backend/plugin/views.py +++ b/apps/backend/plugin/views.py @@ -114,6 +114,7 @@ def create_plugin_register_task(self, request): # 2. 
创建一个新的task,返回任务ID job = models.Job.objects.create( created_by=params["bk_username"], + from_system=settings.APP_CODE, job_type=constants.JobType.PACKING_PLUGIN, # TODO 打包任务是否也用一次性订阅的方式下发 subscription_id=-1, diff --git a/apps/backend/subscription/commons.py b/apps/backend/subscription/commons.py index 32d0e9b74..9c990400b 100644 --- a/apps/backend/subscription/commons.py +++ b/apps/backend/subscription/commons.py @@ -15,6 +15,8 @@ from apps.component.esbclient import client_v2 from apps.exceptions import BizNotExistError from apps.node_man import constants +from apps.prometheus import metrics +from apps.prometheus.helper import SetupObserve, get_call_resource_labels_func from apps.utils.batch_request import batch_request logger = logging.getLogger("app") @@ -35,6 +37,7 @@ def get_host_object_attribute(bk_biz_id): return custom_fields +@SetupObserve(counter=metrics.app_common_method_requests_total, get_labels_func=get_call_resource_labels_func) def list_biz_hosts(bk_biz_id, condition, func, split_params=False): biz_custom_property = [] kwargs = { @@ -84,12 +87,14 @@ def get_host_by_inst(bk_biz_id, inst_list): elif inst["bk_obj_id"] in bk_obj_id_list: # 自定义层级 topo_cond = {"bk_obj_id": inst["bk_obj_id"], "bk_inst_id": inst["bk_inst_id"]} - hosts.extend(list_biz_hosts(bk_biz_id, topo_cond, "find_host_by_topo")) + hosts.extend( + list_biz_hosts(bk_biz_id, topo_cond, "find_host_by_topo", source="get_host_by_inst:find_host_by_topo") + ) if bk_biz_ids: # 业务查询 for bk_biz_id in bk_biz_ids: - hosts.extend(list_biz_hosts(bk_biz_id, {}, "list_biz_hosts")) + hosts.extend(list_biz_hosts(bk_biz_id, {}, "list_biz_hosts", source="get_host_by_inst:list_biz_hosts:biz")) if bk_set_ids: # 集群查询 hosts.extend( @@ -101,6 +106,14 @@ def get_host_by_inst(bk_biz_id, inst_list): ) if bk_module_ids: # 模块查询 这里CMDB限制了bk_module_ids不能超过500, 需要拆分参数 split_params=True - hosts.extend(list_biz_hosts(bk_biz_id, {"bk_module_ids": bk_module_ids}, "list_biz_hosts", split_params=True)) + hosts.extend( + list_biz_hosts( + bk_biz_id, + {"bk_module_ids": bk_module_ids}, + "list_biz_hosts", + split_params=True, + source="get_host_by_inst:list_biz_hosts:module", + ) + ) return hosts diff --git a/apps/backend/subscription/handler.py b/apps/backend/subscription/handler.py index 4a9c48733..dfacdaf2f 100644 --- a/apps/backend/subscription/handler.py +++ b/apps/backend/subscription/handler.py @@ -129,7 +129,7 @@ def task_result( # 如果不需要已不在订阅范围内的执行快照,查询订阅范围过滤掉移除的实例 ID subscription = models.Subscription.objects.get(id=self.subscription_id) scope_instance_id_list: Set[str] = set( - tools.get_instances_by_scope(subscription.scope, get_cache=True).keys() + tools.get_instances_by_scope(subscription.scope, get_cache=True, source="task_result").keys() ) base_kwargs["instance_id__in"] = scope_instance_id_list @@ -491,7 +491,7 @@ def statistic(subscription_id_list: List[int]) -> List[Dict]: sub_statistic_list: List[Dict] = [] for subscription in subscriptions: sub_statistic = {"subscription_id": subscription.id, "status": []} - current_instances = tools.get_instances_by_scope(subscription.scope, get_cache=True) + current_instances = tools.get_instances_by_scope(subscription.scope, get_cache=True, source="statistic") status_statistic = {"SUCCESS": 0, "PENDING": 0, "FAILED": 0, "RUNNING": 0} plugin_versions = defaultdict(lambda: defaultdict(int)) diff --git a/apps/backend/subscription/render_functions.py b/apps/backend/subscription/render_functions.py index d4def63c1..62cd3b80f 100644 --- a/apps/backend/subscription/render_functions.py +++ 
b/apps/backend/subscription/render_functions.py @@ -71,7 +71,7 @@ def get_hosts_by_node(config_hosts): if config_hosts[0].get("bk_host_id"): from apps.backend.subscription.tools import get_host_detail - host_infos = get_host_detail(config_hosts) + host_infos = get_host_detail(config_hosts, source="get_hosts_by_node") for host_info in host_infos: host_info["ip"] = host_info["bk_host_innerip"] or host_info["bk_host_innerip_v6"] instances.append(host_info) diff --git a/apps/backend/subscription/steps/plugin.py b/apps/backend/subscription/steps/plugin.py index a5a1ed2f7..af7c332ef 100644 --- a/apps/backend/subscription/steps/plugin.py +++ b/apps/backend/subscription/steps/plugin.py @@ -269,6 +269,7 @@ def check_config_change( process_status, context, package_obj=self.get_matching_package(target_host.os_type, target_host.cpu_arch), + source="migrate", ) old_rendered_configs = proc_status_id__configs_map[process_status["id"]] @@ -315,40 +316,23 @@ def handle_uninstall_instances( return remove_from_scope_instance_ids = set() - if self.subscription.object_type == self.subscription.ObjectType.HOST: - # 主机类型的订阅 - host_id_biz_map = {} - for host_info in models.Host.objects.filter(bk_host_id__in=uninstall_ids).values("bk_host_id", "bk_biz_id"): - host_id_biz_map[host_info["bk_host_id"]] = host_info["bk_biz_id"] - uninstall_scope = { - "bk_biz_id": self.subscription.bk_biz_id, - "object_type": self.subscription.object_type, - "node_type": self.subscription.NodeType.INSTANCE, - "nodes": [{"bk_host_id": host_id, "bk_biz_id": host_id_biz_map[host_id]} for host_id in uninstall_ids], - } - uninstall_instances = tools.get_instances_by_scope(uninstall_scope) + instance_key = "host" if self.subscription.object_type == models.Subscription.ObjectType.HOST else "service" + id_key = "bk_host_id" if instance_key == "host" else "id" - for instance_id in uninstall_instances: - remove_from_scope_instance_ids.add(instance_id) - instance_actions[instance_id] = uninstall_action - push_migrate_reason_func( - _instance_id=instance_id, migrate_type=backend_const.PluginMigrateType.REMOVE_FROM_SCOPE - ) - else: - for _id in uninstall_ids: - instance_id = tools.create_node_id( - { - "object_type": self.subscription.object_type, - "node_type": self.subscription.NodeType.INSTANCE, - "id": _id, - } - ) - remove_from_scope_instance_ids.add(instance_id) - instance_actions[instance_id] = uninstall_action - push_migrate_reason_func( - _instance_id=instance_id, migrate_type=backend_const.PluginMigrateType.REMOVE_FROM_SCOPE - ) + for _id in uninstall_ids: + instance_id = tools.create_node_id( + { + "object_type": self.subscription.object_type, + "node_type": self.subscription.NodeType.INSTANCE, + id_key: _id, + } + ) + remove_from_scope_instance_ids.add(instance_id) + instance_actions[instance_id] = uninstall_action + push_migrate_reason_func( + _instance_id=instance_id, migrate_type=backend_const.PluginMigrateType.REMOVE_FROM_SCOPE + ) # 仅策略的巡检需要假移除插件 if self.subscription.category == models.Subscription.CategoryType.POLICY and auto_trigger: @@ -364,6 +348,38 @@ def handle_uninstall_instances( source_id=self.subscription.id, name=self.plugin_name, bk_host_id__in=uninstall_ids ).update(source_id=None, group_id="", bk_obj_id=None) + elif self.subscription.object_type == self.subscription.ObjectType.HOST: + # 如果 Agent 状态异常,标记异常并且不执行变更,等到 Agent 状态恢复再执行卸载 + host_ids_with_alive_agent = models.ProcessStatus.objects.filter( + name=models.ProcessStatus.GSE_AGENT_PROCESS_NAME, + source_type=models.ProcessStatus.SourceType.DEFAULT, + 
bk_host_id__in=uninstall_ids, + status=constants.ProcStateType.RUNNING, + ).values_list("bk_host_id", flat=True) + + host_ids_with_no_alive_agent = set(uninstall_ids) - set(host_ids_with_alive_agent) + for host_id in host_ids_with_no_alive_agent: + instance_id = tools.create_node_id( + { + "object_type": self.subscription.object_type, + "node_type": self.subscription.NodeType.INSTANCE, + id_key: host_id, + } + ) + instance_actions.pop(instance_id, None) + push_migrate_reason_func( + _instance_id=instance_id, migrate_type=backend_const.PluginMigrateType.ABNORMAL_AGENT_STATUS + ) + + if host_ids_with_no_alive_agent and not preview_only: + models.ProcessStatus.objects.filter( + source_id=self.subscription.id, name=self.plugin_name, bk_host_id__in=host_ids_with_no_alive_agent + ).update(status=constants.ProcStateType.AGENT_NO_ALIVE) + logger.info( + f"[handle_uninstall_instances] set proc to AGENT_NO_ALIVE, subscription_id -> " + f"{self.subscription.id}, name -> {self.plugin_name}, host_ids -> {host_ids_with_no_alive_agent}", + ) + def handle_new_add_instances( self, install_action: str, diff --git a/apps/backend/subscription/tasks.py b/apps/backend/subscription/tasks.py index 1f678cd25..428a844c5 100644 --- a/apps/backend/subscription/tasks.py +++ b/apps/backend/subscription/tasks.py @@ -30,6 +30,7 @@ from apps.node_man import constants, models from apps.node_man import tools as node_man_tools from apps.node_man.handlers.cmdb import CmdbHandler +from apps.prometheus import metrics from apps.utils import translation from pipeline import builder from pipeline.builder import Data, NodeOutput, ServiceActivity, Var @@ -70,13 +71,28 @@ def build_instances_task( for step in subscription.steps: step_id_manager_map[step.step_id] = StepFactory.get_step_manager(step) step_map = {step.step_id: step for step in subscription.steps} - step_id_record_step_map = { - step_id: {"id": step_id, "type": step_map[step_id].type, "pipeline_id": "", "action": action, "extra_info": {}} - for step_id, action in step_actions.items() - } + + step_index = 0 + step_id_record_step_map = {} + for step_id in step_id_manager_map: + if step_id not in step_actions: + continue + step_id_record_step_map[step_id] = { + "id": step_id, + "type": step_map[step_id].type, + "index": step_index, + "pipeline_id": "", + "action": step_actions[step_id], + "extra_info": {}, + } + step_index += 1 + + # 将 step_actions 信息注入 meta + inject_meta: Dict[str, Any] = {**meta, "STEPS": list(step_id_record_step_map.values())} # 对流程步骤进行编排 current_activities = [] + subscription_instance_ids = [sub_inst.id for sub_inst in subscription_instances] for step_id in step_id_manager_map: if step_id not in step_actions: continue @@ -88,7 +104,7 @@ def build_instances_task( activities, __ = action_manager.generate_activities( subscription_instances, global_pipeline_data=global_pipeline_data, - meta=meta, + meta=inject_meta, current_activities=current_activities, ) @@ -102,8 +118,27 @@ def build_instances_task( current_activities.extend(activities) + # 在 Update 之前写指标,便于后续统计因死锁等情况引发的任务丢失率 + metrics.app_task_engine_sub_inst_step_statuses_total.labels( + step_id=step_id, + step_type=step_id_record_step_map[step_id]["type"], + step_num=len(inject_meta["STEPS"]), + step_index=step_id_record_step_map[step_id]["index"], + gse_version=meta.get("GSE_VERSION") or "unknown", + action=step_id_record_step_map[step_id]["action"], + code="CreatePipeline", + status=constants.JobStatusType.PENDING, + ).inc(amount=len(subscription_instance_ids)) + 
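#
# The hunks above wrap service methods in @SetupObserve(histogram=..., labels=...,
# get_labels_func=..., include_exception_histogram=False). The decorator itself is
# added by this patch in apps/prometheus/helper.py, whose body is not part of this
# excerpt, so the sketch below only illustrates the calling convention visible at
# the call sites (histogram / labels / get_labels_func / include_exception_histogram);
# everything else is an assumption. Per the base.py and commons.py hunks, the real
# helper also accepts gauge= and counter= arguments, which this sketch omits.
import functools
import time
import typing

from prometheus_client import Histogram


class SetupObserveSketch:
    """Hypothetical stand-in for apps.prometheus.helper.SetupObserve (sync case only)."""

    def __init__(
        self,
        histogram: Histogram,
        labels: typing.Optional[typing.Dict[str, str]] = None,
        get_labels_func: typing.Optional[typing.Callable] = None,
        include_exception_histogram: bool = True,
    ):
        self.histogram = histogram
        self.labels = labels or {}
        self.get_labels_func = get_labels_func
        self.include_exception_histogram = include_exception_histogram

    def __call__(self, wrapped: typing.Callable) -> typing.Callable:
        @functools.wraps(wrapped)
        def wrapper(*args, **kwargs):
            labels = dict(self.labels)
            if self.get_labels_func:
                # Call sites pass (wrapped, instance, args, kwargs); the real helper
                # presumably supplies the bound instance, which this sketch does not
                # reconstruct and passes as None.
                labels.update(self.get_labels_func(wrapped, None, args, kwargs))
            start = time.perf_counter()
            try:
                result = wrapped(*args, **kwargs)
            except Exception:
                # include_exception_histogram=False at the call sites above means
                # failed calls are kept out of the duration histogram.
                if self.include_exception_histogram:
                    self.histogram.labels(**labels).observe(time.perf_counter() - start)
                raise
            self.histogram.labels(**labels).observe(time.perf_counter() - start)
            return result

        return wrapper
#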
mark_acts_tail_and_head(current_activities) - subscription_instance_ids = [sub_inst.id for sub_inst in subscription_instances] + + logger.info( + "[sub_lifecycle][build_instances_task] inject_meta -> %s, step_id_record_step_map -> %s", + subscription.id, + subscription_instances[0].task_id, + inject_meta, + step_id_record_step_map, + ) # 每个原子引用上个原子成功输出的 succeeded_subscription_instance_ids for index, act in enumerate(current_activities): @@ -227,24 +262,45 @@ def create_task_transaction(create_task_func): @wraps(create_task_func) def wrapper(subscription: models.Subscription, subscription_task: models.SubscriptionTask, *args, **kwargs): + logger.info( + "[sub_lifecycle][create_task_transaction] enter, sub -> %s, task -> %s", + subscription.id, + subscription_task.id, + subscription, + subscription_task, + ) try: func_return = create_task_func(subscription, subscription_task, *args, **kwargs) except Exception as err: - logger.exception(err) + logger.exception( + "[sub_lifecycle][create_task_transaction] failed", + subscription.id, + subscription_task.id, + ) if subscription_task.is_auto_trigger or kwargs.get("preview_only"): # 自动触发的发生异常或者仅预览的情况,记录日志后直接删除此任务即可 - models.SubscriptionTask.objects.filter(id=subscription_task.id).delete() + if subscription_task.id: + models.SubscriptionTask.objects.filter(id=subscription_task.id).delete() + models.SubscriptionInstanceRecord.objects.filter(task_id=subscription_task.id).delete() # 抛出异常用于前端展示或事务回滚 raise err # 非自动触发的,记录错误信息 subscription_task.err_msg = str(err) subscription_task.save(update_fields=["err_msg"]) + models.SubscriptionInstanceRecord.objects.filter(task_id=subscription_task.id).delete() + else: if kwargs.get("preview_only"): # 仅预览,不执行动作 return func_return + subscription_task.is_ready = True - logger.info(f"task is ready: sub_task -> {subscription_task}, actions: {subscription_task.actions}") + logger.info( + "[sub_lifecycle][create_task_transaction] task ready, actions -> %s", + subscription.id, + subscription_task.id, + subscription_task.actions, + ) subscription_task.save(update_fields=["is_ready"]) # 创建好实例后立刻执行 run_subscription_task(subscription_task) @@ -279,7 +335,12 @@ def create_task( """ # 兜底注入 Meta,此处注入是覆盖面最全的(包含历史被移除实例) GrayTools().inject_meta_to_instances(instances) - logger.info(f"[create task] inject meta to instances[num={len(instances)}] successfully") + logger.info( + "[sub_lifecycle][create_task] inject meta to instances[num=%s] successfully", + subscription.id, + subscription_task.id, + len(instances), + ) topo_order = CmdbHandler.get_topo_order() batch_size = models.GlobalSettings.get_config("BATCH_SIZE", default=100) @@ -401,11 +462,15 @@ def create_task( ) if not to_be_created_records_map: - logger.warning(f"[create task] skipped: no instances to execute, subscription_task -> {subscription_task}") if subscription_task.is_auto_trigger: # 如果是自动触发,且没有任何实例,那么直接抛出异常,回滚数据库 - logger.info("[create task] auto trigger task will rollback") raise SubscriptionInstanceEmpty() + else: + logger.warning( + "[sub_lifecycle][create_task] no instances to execute", + subscription.id, + subscription_task.id, + ) # 非自动触发的直接退出即可 return { @@ -414,18 +479,25 @@ def create_task( } # 将最新属性置为False并批量创建订阅实例 + # TODO 偶发死锁 models.SubscriptionInstanceRecord.objects.filter( subscription_id=subscription.id, instance_id__in=instance_id_list ).update(is_latest=False) + # TODO 偶发死锁 models.SubscriptionInstanceRecord.objects.bulk_create(to_be_created_records_map.values(), batch_size=batch_size) # 批量创建订阅实例,由于bulk_create返回的objs没有主键,此处需要重新查出 + # TODO 这里是不是可以直接通过 
task_id 直接查出来,避免 instance_id 过多导致查询退化? created_instance_records = list( models.SubscriptionInstanceRecord.objects.filter( subscription_id=subscription.id, instance_id__in=instance_id_list, is_latest=True ) ) + metrics.app_task_engine_sub_inst_statuses_total.labels(status=constants.JobStatusType.PENDING).inc( + amount=len(created_instance_records) + ) + task_host_limit = models.GlobalSettings.get_config( models.GlobalSettings.KeyEnum.TASK_HOST_LIMIT.value, default=TASK_HOST_LIMIT ) @@ -433,10 +505,14 @@ def create_task( # 保存pipeline id subscription_task.pipeline_id = pipeline.id subscription_task.save(update_fields=["actions", "pipeline_id"]) + logger.info( - f"[create task] succeed: subscription -> {subscription}, " - f"subscription_task -> {subscription_task}, instance_actions -> {instance_actions}" + "[sub_lifecycle][create_task] instance_actions -> %s", + subscription.id, + subscription_task.id, + instance_actions, ) + return { "to_be_created_records_map": to_be_created_records_map, "error_hosts": error_hosts, @@ -448,10 +524,22 @@ def run_subscription_task_and_create_instance_transaction(func): @wraps(func) def wrapper(subscription: models.Subscription, subscription_task: models.SubscriptionTask, *args, **kwargs): + logger.info( + "[sub_lifecycle][run_subscription_task_and_create_instance_transaction] " + "enter, sub -> %s, task -> %s", + subscription.id, + subscription_task.id, + subscription, + subscription_task, + ) try: func_result = func(subscription, subscription_task, *args, **kwargs) except Exception as err: - logger.exception(err) + logger.exception( + "[sub_lifecycle][run_subscription_task_and_create_instance_transaction] failed", + subscription.id, + subscription_task.id, + ) if subscription_task.is_auto_trigger or kwargs.get("preview_only"): # 自动触发的发生异常或者仅预览的情况 # 记录日志后直接删除此任务即可 @@ -504,15 +592,27 @@ def run_subscription_task_and_create_instance( scope["bk_biz_id"] = subscription.bk_biz_id # 获取订阅范围内全部实例 - instances = tools.get_instances_by_scope(scope) - logger.info(f"run_subscription_task[{subscription_task.id}] instances_num={len(instances)}") + instances = tools.get_instances_by_scope(scope, source="run_subscription_task_and_create_instance") + logger.info( + "[sub_lifecycle][run_subscription_task_and_create_instance] get_instances_by_scope -> %s", + subscription.id, + subscription_task.id, + len(instances), + ) + # 创建步骤管理器实例 step_managers = {step.step_id: StepFactory.get_step_manager(step) for step in subscription.steps} # 删除无用subscription缓存,否则执行延时任务时传入可能引起pickle异常 if hasattr(subscription, "_steps"): delattr(subscription, "_steps") - logger.info(f"run_subscription_task[{subscription_task.id}] step_managers={step_managers}") + logger.info( + "[sub_lifecycle][run_subscription_task_and_create_instance] step_managers -> %s", + subscription.id, + subscription_task.id, + step_managers, + ) + if actions is not None: # 指定了动作,不需要计算,直接执行即可 instance_actions = {instance_id: actions for instance_id in instances} @@ -522,7 +622,11 @@ def run_subscription_task_and_create_instance( # 预注入 Meta,用于变更计算(仅覆盖当前订阅范围,移除场景通过 create_task 兜底注入) GrayTools().inject_meta_to_instances(instances) logger.info( - f"run_subscription_task[{subscription_task.id}] pre-inject meta to instances[num={len(instances)}] successfully" + "[sub_lifecycle][run_subscription_task_and_create_instance] " + "pre-inject meta to instances[num=%s] successfully", + subscription.id, + subscription_task.id, + len(instances), ) # 按步骤顺序计算实例变更所需的动作 @@ -538,6 +642,7 @@ def run_subscription_task_and_create_instance( 
instance_id_action_map: Dict[str, str] = migrate_results["instance_actions"] for instance_id, action in instance_id_action_map.items(): instance_actions[instance_id][step.step_id] = action + metrics.app_task_instances_migrate_actions_total.labels(step_id=step.step_id, action=action).inc() # 归类变更原因 # eg: @@ -550,15 +655,20 @@ def run_subscription_task_and_create_instance( # } # } # } - instance_id_action_reason_map: Dict[str, str] = migrate_results["migrate_reasons"] + instance_id_action_reason_map: Dict[str, Dict] = migrate_results["migrate_reasons"] for instance_id, migrate_reason in instance_id_action_reason_map.items(): instance_migrate_reasons[instance_id][step.step_id] = migrate_reason + metrics.app_task_instances_migrate_reasons_total.labels( + step_id=step.step_id, reason=migrate_reason["migrate_type"] + ).inc() logger.info( - f"make_instances_migrate_actions: \n" - f"subscription_task -> {subscription_task} \n" - f"instance_actions -> {instance_actions} \n" - f"migrate_reasons -> {instance_migrate_reasons}" + "[sub_lifecycle][run_subscription_task_and_create_instance] " + "make_instances_migrate_actions: instance_actions -> %s, migrate_reasons -> %s", + subscription.id, + subscription_task.id, + instance_actions, + instance_migrate_reasons, ) # 查询被从范围内移除的实例 @@ -574,25 +684,43 @@ def run_subscription_task_and_create_instance( service_instance_id = tools.parse_node_id(instance_id)["id"] deleted_id_not_in_scope.append({"id": service_instance_id}) + logger.info( + "[sub_lifecycle][run_subscription_task_and_create_instance] " + "try to find deleted instances -> %s", + subscription.id, + subscription_task.id, + deleted_id_not_in_scope, + ) deleted_instance_info = tools.get_instances_by_scope( { "bk_biz_id": subscription.bk_biz_id, "object_type": subscription.object_type, "node_type": models.Subscription.NodeType.INSTANCE, "nodes": deleted_id_not_in_scope, - } + }, + source="find_deleted_instances", ) # 如果被删掉的实例在 CMDB 找不到,那么就使用最近一次的 InstanceRecord 的快照数据 not_exist_instance_id = set(instance_not_in_scope) - set(deleted_instance_info) - if not_exist_instance_id: - records = models.SubscriptionInstanceRecord.objects.filter( - subscription_id=subscription.id, instance_id__in=not_exist_instance_id, is_latest=True + records = list( + models.SubscriptionInstanceRecord.objects.filter( + subscription_id=subscription.id, instance_id__in=not_exist_instance_id, is_latest=True + ) ) for record in records: deleted_instance_info[record.instance_id] = record.instance_info + logger.info( + "[sub_lifecycle][run_subscription_task_and_create_instance] " + "deleted instances not exist in cc, find from db -> %s, find num -> %s", + subscription.id, + subscription_task.id, + not_exist_instance_id, + len(records), + ) + instances.update(deleted_instance_info) create_task_result = create_task( @@ -611,7 +739,11 @@ def run_subscription_task_and_create_instance( @app.task(queue="backend", ignore_result=True) @translation.RespectsLanguage() def run_subscription_task(subscription_task: models.SubscriptionTask): - logger.info(f"debug update_subscription enter run_subscription_task[{subscription_task.id}]") + logger.info( + "[sub_lifecycle][run_subscription_task] enter", + subscription_task.subscription_id, + subscription_task.id, + ) pipeline_ids = {} if subscription_task.pipeline_id: pipeline_ids[subscription_task.pipeline_id] = 0 @@ -628,7 +760,22 @@ def run_subscription_task(subscription_task: models.SubscriptionTask): # 排序 ordered_pipelines.sort(key=lambda item: item[0]) for index, pipeline in ordered_pipelines: - 
pipeline.run(index % 255) + try: + pipeline.run(index % 255) + except Exception: + logger.exception( + "[sub_lifecycle][run_subscription_task] failed to run pipeline -> %s", + subscription_task.subscription_id, + subscription_task.id, + pipeline.id, + ) + else: + logger.info( + "[sub_lifecycle][run_subscription_task] run pipeline -> %s", + subscription_task.subscription_id, + subscription_task.id, + pipeline.id, + ) @app.task(queue="backend", ignore_result=True) diff --git a/apps/backend/subscription/tools.py b/apps/backend/subscription/tools.py index e1cb03b86..a7d6f3620 100644 --- a/apps/backend/subscription/tools.py +++ b/apps/backend/subscription/tools.py @@ -14,6 +14,8 @@ import logging import math import os +import pprint +import typing from collections import Counter, defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from functools import wraps @@ -34,13 +36,15 @@ ) from apps.backend.utils.data_renderer import nested_render_data from apps.component.esbclient import client_v2 +from apps.core.concurrent.cache import FuncCacheDecorator from apps.core.ipchooser.tools.base import HostQuerySqlHelper from apps.exceptions import ComponentCallError from apps.node_man import constants, models from apps.node_man import tools as node_man_tools +from apps.prometheus import metrics +from apps.prometheus.helper import SetupObserve, get_call_resource_labels_func from apps.utils.basic import chunk_lists, distinct_dict_list, order_dict from apps.utils.batch_request import batch_request, request_multi_thread -from apps.utils.cache import func_cache_decorator from apps.utils.time_handler import strftime_local logger = logging.getLogger("app") @@ -240,6 +244,7 @@ def create_host_key(data: Dict) -> str: ) +@SetupObserve(counter=metrics.app_common_method_requests_total, get_labels_func=get_call_resource_labels_func) def find_host_biz_relations(bk_host_ids: List[int]) -> List[Dict]: """ 查询主机所属拓扑关系 @@ -346,27 +351,26 @@ def get_service_instance_by_ids(bk_biz_id, ids): return result -def search_business(condition, start=0): - kwargs = {"fields": ["bk_biz_id", "bk_biz_name"], "page": {"start": start, "limit": constants.QUERY_BIZ_LENS}} - kwargs.update(condition) - biz_data = client_v2.cc.search_business(kwargs) - biz_count = biz_data.get("count", 0) - bizs = biz_data.get("info") or [] - - if biz_count > constants.QUERY_BIZ_LENS + start: - bizs += search_business(condition, start + constants.QUERY_BIZ_LENS) - - bizs.append({"bk_biz_id": settings.BK_CMDB_RESOURCE_POOL_BIZ_ID, "bk_biz_name": "资源池"}) - return bizs +@FuncCacheDecorator(cache_time=1 * constants.TimeUnit.MINUTE) +def fetch_biz_info_map(fields: typing.Optional[typing.List[str]] = None) -> typing.Dict[str, typing.Dict]: + """ + 查询所有业务 + :return: 主机业务关系列表 + """ + fields = fields or ["bk_biz_id", "bk_biz_name"] + biz_infos: typing.List[typing.Dict] = batch_request(client_v2.cc.search_business, {"fields": fields}) + biz_infos.append({"bk_biz_id": str(settings.BK_CMDB_RESOURCE_POOL_BIZ_ID), "bk_biz_name": "资源池"}) + biz_info_map: typing.Dict[str, typing.Dict] = {str(biz_info["bk_biz_id"]): biz_info for biz_info in biz_infos} + logger.info("[fetch_biz_info_map] fields -> %s, count -> %s", pprint.pformat(fields), len(biz_infos)) + return biz_info_map -def fetch_biz_info(condition): - all_biz = search_business(condition) - biz_map = {} - for biz in all_biz: - biz_map[biz["bk_biz_id"]] = biz - return biz_map +def fetch_biz_info(bk_biz_ids: typing.List[int]) -> typing.Dict[int, typing.Dict]: + biz_info_map: typing.Dict[str, 
typing.Dict] = fetch_biz_info_map(get_cache=True) + if not biz_info_map: + logger.error("[fetch_biz_info] biz_info_map is empty !") + return {bk_biz_id: biz_info_map.get(str(bk_biz_id)) or {} for bk_biz_id in bk_biz_ids} def get_host_detail_by_template(bk_obj_id, template_info_list: list, bk_biz_id: int = None): @@ -396,12 +400,16 @@ def get_host_detail_by_template(bk_obj_id, template_info_list: list, bk_biz_id: host_info_result = batch_request( call_func, dict(bk_set_template_ids=template_ids, bk_biz_id=bk_biz_id, fields=fields) ) - biz_info = fetch_biz_info({"condition": {"bk_biz_id": bk_biz_id}}) - cloud_id_name_map = models.Cloud.cloud_id_name_map() + biz_info = fetch_biz_info([bk_biz_id]) + cloud_id_name_map = models.Cloud.cloud_id_name_map(get_cache=True) + + if not biz_info[bk_biz_id]: + logger.warning("[get_host_detail_by_template] can not find biz_info -> %s", bk_biz_id) + for host in host_info_result: host["bk_biz_id"] = bk_biz_id - host["bk_biz_name"] = host["bk_biz_name"] = biz_info[bk_biz_id]["bk_biz_name"] - host["bk_cloud_name"] = cloud_id_name_map.get(host["bk_cloud_id"]) + host["bk_biz_name"] = host["bk_biz_name"] = biz_info[bk_biz_id].get("bk_biz_name") + host["bk_cloud_name"] = cloud_id_name_map.get(str(host["bk_cloud_id"])) return host_info_result @@ -437,6 +445,7 @@ def get_service_instances_by_template(bk_obj_id, template_info_list: list, bk_bi return service_instances +@SetupObserve(counter=metrics.app_common_method_requests_total, get_labels_func=get_call_resource_labels_func) def get_host_detail(host_info_list: list, bk_biz_id: int = None): """ 获取主机详情 @@ -524,23 +533,23 @@ def get_host_detail(host_info_list: list, bk_biz_id: int = None): # 3. 综上所述,提前返回可以减少无效执行逻辑及网络IO return [] - hosts = list_biz_hosts(bk_biz_id, cond, "list_hosts_without_biz") + hosts = list_biz_hosts(bk_biz_id, cond, "list_hosts_without_biz", source="get_host_detail:list_hosts_without_biz") bk_host_ids = [] bk_cloud_ids = [] for host in hosts: bk_host_ids.append(host["bk_host_id"]) bk_cloud_ids.append(host["bk_cloud_id"]) - host_relations = find_host_biz_relations(list(set(bk_host_ids))) + host_relations = find_host_biz_relations(list(set(bk_host_ids)), source="get_host_detail") host_biz_map = {} for host in host_relations: host_biz_map[host["bk_host_id"]] = host["bk_biz_id"] - cloud_id_name_map = models.Cloud.cloud_id_name_map() + cloud_id_name_map = models.Cloud.cloud_id_name_map(get_cache=True) # 需要将资源池移除 - all_biz_id = list(set(host_biz_map.values()) - {settings.BK_CMDB_RESOURCE_POOL_BIZ_ID}) - all_biz_info = fetch_biz_info({"condition": {"bk_biz_id": {"$in": all_biz_id}}}) + all_biz_ids = list(set(host_biz_map.values()) - {settings.BK_CMDB_RESOURCE_POOL_BIZ_ID}) + all_biz_info = fetch_biz_info(all_biz_ids) host_key_dict = {} host_id_dict = {} @@ -551,8 +560,11 @@ def get_host_detail(host_info_list: list, bk_biz_id: int = None): if _host["bk_biz_id"] != settings.BK_CMDB_RESOURCE_POOL_BIZ_ID else "资源池" ) + if not _host["bk_biz_name"]: + logger.warning("[get_host_detail] can not find biz_info -> %s", _host["bk_biz_id"]) + _host["bk_cloud_name"] = ( - cloud_id_name_map.get(_host["bk_cloud_id"], "") + cloud_id_name_map.get(str(_host["bk_cloud_id"]), "") if _host["bk_cloud_id"] != constants.DEFAULT_CLOUD else "直连区域" ) @@ -640,18 +652,20 @@ def get_host_relation(bk_biz_id, nodes): data = [] hosts = get_host_by_inst(bk_biz_id, nodes) - host_biz_relations = find_host_biz_relations([_host["bk_host_id"] for _host in hosts]) + host_biz_relations = find_host_biz_relations([_host["bk_host_id"] for _host in 
@@ -640,18 +652,20 @@ def get_host_relation(bk_biz_id, nodes):
     data = []
     hosts = get_host_by_inst(bk_biz_id, nodes)

-    host_biz_relations = find_host_biz_relations([_host["bk_host_id"] for _host in hosts])
+    host_biz_relations = find_host_biz_relations([_host["bk_host_id"] for _host in hosts], source="get_host_relation")
     relations = defaultdict(lambda: defaultdict(list))
     for item in host_biz_relations:
         relations[item["bk_host_id"]]["bk_module_ids"].append(item["bk_module_id"])
         relations[item["bk_host_id"]]["bk_set_ids"].append(item["bk_set_id"])

-    biz_info = fetch_biz_info({"condition": {"bk_biz_id": bk_biz_id}})
+    biz_info = fetch_biz_info([bk_biz_id])
+    if not biz_info[bk_biz_id]:
+        logger.warning("[get_host_relation] cannot find biz_info -> %s", bk_biz_id)

     for host in hosts:
         host["bk_biz_id"] = bk_biz_id
-        host["bk_biz_name"] = biz_info[bk_biz_id]["bk_biz_name"]
+        host["bk_biz_name"] = biz_info[bk_biz_id].get("bk_biz_name", "")
         host["module"] = relations[host["bk_host_id"]]["bk_module_ids"]
         host["set"] = relations[host["bk_host_id"]]["bk_set_ids"]
         data.append(host)
@@ -697,8 +711,28 @@ def wrapper(scope: Dict[str, Union[Dict, Any]], *args, **kwargs) -> Dict[str, Di
     return wrapper


+def get_scope_labels_func(
+    wrapped: typing.Callable,
+    instance: typing.Any,
+    args: typing.Tuple[typing.Any],
+    kwargs: typing.Dict[str, typing.Any],
+) -> typing.Dict[str, str]:
+
+    if "scope" in kwargs:
+        scope = kwargs["scope"]
+    else:
+        scope = args[0]
+
+    return {
+        "object_type": scope["object_type"],
+        "node_type": scope["node_type"],
+        "source": get_call_resource_labels_func(wrapped, instance, args, kwargs)["source"],
+    }
+
+
 @support_multi_biz
-@func_cache_decorator(cache_time=SUBSCRIPTION_SCOPE_CACHE_TIME)
+@SetupObserve(histogram=metrics.app_task_get_instances_by_scope_duration_seconds, get_labels_func=get_scope_labels_func)
+@FuncCacheDecorator(cache_time=SUBSCRIPTION_SCOPE_CACHE_TIME)
 def get_instances_by_scope(scope: Dict[str, Union[Dict, int, Any]]) -> Dict[str, Dict[str, Union[Dict, Any]]]:
     """
     获取范围内的所有主机
@@ -762,7 +796,12 @@ def get_instances_by_scope(scope: Dict[str, Union[Dict, int, Any]]) -> Dict[str,
     # 按照实例查询
     elif scope["node_type"] == models.Subscription.NodeType.INSTANCE:
         if scope["object_type"] == models.Subscription.ObjectType.HOST:
-            instances.extend([{"host": inst} for inst in get_host_detail(nodes, bk_biz_id=bk_biz_id)])
+            instances.extend(
+                [
+                    {"host": inst}
+                    for inst in get_host_detail(nodes, bk_biz_id=bk_biz_id, source="get_instances_by_scope")
+                ]
+            )
         else:
             service_instance_ids = [int(node["id"]) for node in nodes]
             instances.extend(
@@ -863,7 +902,9 @@ def add_host_info_to_instances(bk_biz_id: int, scope: Dict, instances: Dict):

     host_dict = {
         host_info["bk_host_id"]: host_info
-        for host_info in get_host_detail([instance["service"] for instance in instances], bk_biz_id=bk_biz_id)
+        for host_info in get_host_detail(
+            [instance["service"] for instance in instances], bk_biz_id=bk_biz_id, source="add_host_info_to_instances"
+        )
     }
     for instance in instances:
         instance["host"] = host_dict[instance["service"]["bk_host_id"]]
@@ -1002,6 +1043,7 @@ def get_plugin_path(plugin_name: str, target_host: models.Host, agent_config: Di
     return plugin_path


+@FuncCacheDecorator(cache_time=5 * constants.TimeUnit.MINUTE)
 def get_plugin_common_constants(plugin_name: str) -> Dict:
     """
     获取插件配置公共常量
@@ -1048,10 +1090,7 @@ def get_all_subscription_steps_context(
     plugin_path = get_plugin_path(plugin_name, target_host, agent_config)

     # 将 step.params 中 context 提取到第一层,提供给模板渲染
-    step_params = policy_step_adapter.get_matching_step_params(
-        target_host.os_type.lower(),
-        target_host.cpu_arch
-    )
+    step_params = policy_step_adapter.get_matching_step_params(target_host.os_type.lower(), target_host.cpu_arch)
     context.update(step_params.get("context", {}))
context.update(all_step_data[subscription_step.step_id]) @@ -1070,7 +1109,7 @@ def get_all_subscription_steps_context( "login_ip": target_host.login_ip, }, # 获取插件配置公共常量 - "constants": get_plugin_common_constants(plugin_name), + "constants": get_plugin_common_constants(plugin_name, get_cache=True), }, ) # 深拷贝一份,避免原数据后续被污染 @@ -1137,6 +1176,7 @@ def render_config_files_by_config_templates( process_status_info: Dict[str, Any], context: Dict, package_obj: models.Packages, + source: typing.Optional[str] = None, ): """ 根据订阅配置及步骤信息渲染配置模板 @@ -1144,6 +1184,7 @@ def render_config_files_by_config_templates( :param list[PluginConfigTemplate] config_templates: 配置文件模板 :param HostStatus process_status_info: 主机进程信息 :param dict context: 上下文信息 + :param source: 调用来源 :return: example: [ { "instance_id": config.id, @@ -1160,16 +1201,13 @@ def render_config_files_by_config_templates( content = template.render(context) except Exception as e: raise ConfigRenderFailed({"name": template.name, "msg": e}) + # 计算配置文件的MD5 md5 = hashlib.md5() md5.update(content.encode()) md5sum = md5.hexdigest() - rendered_config = { - "md5": md5sum, - "content": content, - "file_path": template.file_path, - } + rendered_config = {"md5": md5sum, "content": content, "file_path": template.file_path} if package_obj and package_obj.plugin_desc.is_official and not template.is_main: # 官方插件的部署方式为单实例多配置,在配置模板的名称上追加 group id 即可对配置文件做唯一标识 filename, extension = os.path.splitext(template.name) @@ -1180,7 +1218,23 @@ def render_config_files_by_config_templates( # 非官方插件、官方插件中的主配置文件,无需追加 group id # 适配模板名可渲染的形式 rendered_config["name"] = nested_render_data(template.name, context) + + common_labels = { + "plugin_name": template.plugin_name, + "name": template.name, + "os": template.os, + "cpu_arch": template.cpu_arch, + "source": source or "default", + } + if rendered_config["name"]: + if md5sum == template.md5: + metrics.app_plugin_render_configs_total.labels(**common_labels, type="equal_to_template").inc() + logger.warning( + "[render_config_files_by_config_templates] render config equal to template -> %s", template + ) + else: + metrics.app_plugin_render_configs_total.labels(**common_labels, type="default").inc() rendered_configs.append(rendered_config) return rendered_configs diff --git a/apps/backend/subscription/views.py b/apps/backend/subscription/views.py index 1f490176c..6308b9958 100644 --- a/apps/backend/subscription/views.py +++ b/apps/backend/subscription/views.py @@ -483,7 +483,9 @@ def instance_status(self, request): result = [] for subscription in subscriptions: subscription_result = [] - current_instances = tools.get_instances_by_scope(subscription.scope, get_cache=True) + current_instances = tools.get_instances_by_scope( + subscription.scope, get_cache=True, source="instance_status" + ) # 对于每个instance,通过group_id找到其对应的host_status for instance_id in current_instances: diff --git a/apps/backend/tests/components/collections/agent_new/test_install.py b/apps/backend/tests/components/collections/agent_new/test_install.py index 6b7fc29bc..7f731ceea 100644 --- a/apps/backend/tests/components/collections/agent_new/test_install.py +++ b/apps/backend/tests/components/collections/agent_new/test_install.py @@ -47,6 +47,8 @@ class InstallBaseTestCase(utils.AgentServiceBaseTestCase): + DEBUG = True + OS_TYPE = constants.OsType.LINUX NODE_TYPE = constants.NodeType.AGENT DOWNLOAD_PATH = "/tmp/data/bkee/public/bknodeman/download" @@ -444,6 +446,125 @@ class InstallLinuxPagentTestCase(InstallBaseTestCase): NODE_TYPE = constants.NodeType.PAGENT CLOUD_ID = 1 
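One detail worth spelling out from the render hunk above: `template.md5` (a cached property added to PluginConfigTemplate later in this patch) and `md5sum` both hash the UTF-8 encoded text, so the `equal_to_template` branch fires when rendering left the template byte-for-byte unchanged, typically because no variable was substituted. A small illustrative sketch (the template content below is made up):

    import hashlib

    def md5sum_of(text: str) -> str:
        # Same scheme as render_config_files_by_config_templates / PluginConfigTemplate.md5
        return hashlib.md5(text.encode()).hexdigest()

    template_content = "log_path: {{ log_path }}"
    rendered_content = "log_path: {{ log_path }}"  # unchanged after render: likely a missing context key

    if md5sum_of(rendered_content) == md5sum_of(template_content):
        print("render config equal to template")  # would be counted with type="equal_to_template"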
+ def init_redis_data(self): + # 初始化redis数据,用于schedule时读取解析 + for sub_inst_id in self.common_inputs["subscription_instance_ids"]: + name = REDIS_INSTALL_CALLBACK_KEY_TPL.format(sub_inst_id=sub_inst_id) + report_agent_obj: models.Host = models.Host.objects.get(bk_host_id=self.obj_factory.bk_host_ids[0]) + json_dumps_logs = [ + json.dumps(log) + for log in [ + { + "timestamp": "1580870937", + "level": "INFO", + "step": "metrics", + "log": "metrics", + "metrics": { + "name": "app_core_remote_proxy_info", + "labels": { + "hostname": "hostname", + "ip": "127.0.0.1", + "bk_cloud_id": self.CLOUD_ID, + "paramiko_version": "2.9.1", + }, + }, + "status": "-", + }, + { + "timestamp": "1580870937", + "level": "INFO", + "step": "metrics", + "log": "metrics", + "metrics": { + "name": "app_core_remote_connects_total", + "labels": { + "method": "proxy_ssh", + "username": "root", + "port": 22, + "auth_type": "password", + "os_type": report_agent_obj.os_type, + "status": "success", + }, + }, + "status": "-", + }, + { + "timestamp": "1580870937", + "level": "INFO", + "step": "metrics", + "log": "metrics", + "metrics": { + "name": "app_core_remote_connect_exceptions_total", + "labels": { + "method": "proxy_ssh", + "username": "root", + "port": 22, + "auth_type": "password", + "os_type": report_agent_obj.os_type, + "exc_type": "-", + "exc_code": "0", + }, + }, + "status": "-", + }, + { + "timestamp": "1580870937", + "level": "INFO", + "step": "metrics", + "log": "metrics", + "metrics": { + "name": "app_core_remote_execute_duration_seconds", + "labels": {"method": "proxy_ssh"}, + "data": {"cost_time": 1.0}, + }, + "status": "-", + }, + { + "timestamp": "1580870937", + "level": "INFO", + "step": "report_cpu_arch", + "log": "aarch64", + "status": "DONE", + }, + { + "timestamp": "1580870937", + "level": "INFO", + "step": "report_agent_id", + "log": f"agent-id: {report_agent_obj.bk_cloud_id}:{report_agent_obj.inner_ip}", + "status": "DONE", + }, + { + "timestamp": "1580870937", + "level": "INFO", + "step": "report_healthz", + "log": random.choice( + [ + '{"ok":false,"data":{"base":"ok","cluster":"ok","data":"ok","file":"ok"}}', + "aGVhbHRoejogeyJvayI6dHJ1ZSwiZGF0YSI6eyJiYXNlIjoib2siLCJjbHVzdGVyIjoib2si" + "LCJkYXRhIjoib2siLCJmaWxlIjoib2sifX0NCg==", + "healthz: Failed", + ] + ), + "status": "DONE", + }, + { + "timestamp": "1580870937", + "level": "INFO", + "step": "report_os_version", + "status": "DONE", + "log": "6.1.1.1", + }, + { + "timestamp": "1580870937", + "level": "INFO", + "step": "check_deploy_result", + "log": "gse agent has been deployed successfully", + "status": "DONE", + }, + ] + ] + REDIS_INST.lpush(name, *json_dumps_logs) + def init_hosts(self): self.init_alive_proxies(bk_cloud_id=self.CLOUD_ID) models.Host.objects.filter(bk_host_id__in=self.obj_factory.bk_host_ids).update( diff --git a/apps/backend/tests/subscription/utils.py b/apps/backend/tests/subscription/utils.py index 8ee6285b9..70f19767f 100644 --- a/apps/backend/tests/subscription/utils.py +++ b/apps/backend/tests/subscription/utils.py @@ -71,7 +71,7 @@ def find_host_biz_relations(cls, *args, **kwargs): @classmethod def search_business(cls, *args, **kwargs): - return {"info": [{"bk_biz_id": 2, "bk_biz_name": "TEST"}]} + return {"info": [{"bk_biz_id": 2, "bk_biz_name": "TEST"}], "count": 1} @classmethod def search_biz_inst_topo(cls, *args, **kwargs): diff --git a/apps/backend/views.py b/apps/backend/views.py index 0265ccac1..36c42f2bc 100644 --- a/apps/backend/views.py +++ b/apps/backend/views.py @@ -145,7 +145,7 @@ def report_log(request): 
     # 把日志写入redis中,由install service中的schedule方法统一读取,避免频繁callback
     name = REDIS_INSTALL_CALLBACK_KEY_TPL.format(sub_inst_id=decrypted_token["inst_id"])
     json_dumps_logs = [json.dumps(log) for log in data["logs"]]
-    # 日志会被 Service 消费并持久化,在 Redis 保留一段时间便于排查「主机 -api-> Redis -log-> DB」 上的问题
+    # 日志会被 Service 消费并持久化,在 Redis 保留一段时间便于排查「主机 -api-> Redis -log-> DB」上的问题
     LPUSH_AND_EXPIRE_FUNC(keys=[name], args=[constants.TimeUnit.DAY] + json_dumps_logs)
     return JsonResponse({})
diff --git a/apps/core/concurrent/cache.py b/apps/core/concurrent/cache.py
new file mode 100644
index 000000000..3e5b79a30
--- /dev/null
+++ b/apps/core/concurrent/cache.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+"""
+TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available.
+Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved.
+Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
+You may obtain a copy of the License at https://opensource.org/licenses/MIT
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+"""
+
+import typing
+
+import ujson as json
+import wrapt
+from django.conf import settings
+from django.core.cache import caches
+
+from apps.prometheus import metrics
+from apps.prometheus.helper import observe
+from apps.utils.cache import format_cache_key
+from env.constants import CacheBackend
+
+DEFAULT_CACHE_TIME = 60 * 15
+
+
+class FuncCacheDecorator:
+
+    cache_time: int = DEFAULT_CACHE_TIME
+
+    def __init__(self, cache_time: typing.Optional[int] = None):
+        """
+        :param cache_time: 缓存时间(秒)
+        """
+        self.cache_time = cache_time or DEFAULT_CACHE_TIME
+
+    def get_from_cache(self, using: str, key: str) -> typing.Any:
+        cache = caches[using]
+        func_result = cache.get(key, None)
+        if func_result is None:
+            return func_result
+
+        if using == CacheBackend.DB.value:
+            return json.loads(func_result)
+        return func_result
+
+    def set_to_cache(self, using: str, key: str, value: typing.Any):
+        cache = caches[using]
+        if using == CacheBackend.DB.value:
+            value = json.dumps(value)
+        cache.set(key, value, self.cache_time)
+
+    def ttl_from_cache(self, using: str, key: str) -> int:
+        ttl: int = 0
+        try:
+            ttl = caches[using].ttl(key)
+        except Exception:
+            pass
+        return ttl
+
+    @wrapt.decorator
+    def __call__(
+        self,
+        wrapped: typing.Callable,
+        instance: typing.Any,
+        args: typing.Tuple[typing.Any],
+        kwargs: typing.Dict[str, typing.Any],
+    ) -> typing.Any:
+        """
+        :param wrapped: 被装饰的函数或类方法
+        :param instance:
+            - 如果被装饰者为普通类方法,该值为类实例
+            - 如果被装饰者为 classmethod / 类方法,该值为类
+            - 如果被装饰者为类/函数/静态方法,该值为 None
+        :param args: 位置参数
+        :param kwargs: 关键字参数
+        :return:
+        """
+
+        func_result: typing.Any = None
+        func_name: str = wrapped.__name__
+        use_fast_cache: bool = False
+        get_cache: bool = kwargs.pop("get_cache", False)
+        tolerance_time: int = kwargs.pop("tolerance_time", 0)
+        cache_key: str = format_cache_key(wrapped, *args, **kwargs)
+        master_labels: typing.Dict = {"type": "master", "backend": settings.CACHE_BACKEND, "method": func_name}
+        if tolerance_time:
+            master_labels["type"] = master_labels["type"] + "_fast"
+            ttl: int = self.ttl_from_cache(using=settings.CACHE_BACKEND, key=cache_key)
+            if ttl != 0 and ttl + tolerance_time >= self.cache_time:
+ use_fast_cache = True + + metrics.app_core_cache_decorator_requests_total.labels(get_cache=get_cache, **master_labels).inc() + + if get_cache or use_fast_cache: + with observe(metrics.app_core_cache_decorator_get_duration_seconds, **master_labels): + func_result = self.get_from_cache(using=settings.CACHE_BACKEND, key=cache_key) + + if func_result is None: + # 无需从缓存中获取数据或者缓存中没有数据,则执行函数得到结果,并设置缓存 + func_result = wrapped(*args, **kwargs) + with observe(metrics.app_core_cache_decorator_set_duration_seconds, **master_labels): + self.set_to_cache(using=settings.CACHE_BACKEND, key=cache_key, value=func_result) + elif get_cache or use_fast_cache: + # cache hit + metrics.app_core_cache_decorator_hits_total.labels(**master_labels).inc() + + # 缓存预热 + if settings.CACHE_ENABLE_PREHEAT: + cache_using: str = (CacheBackend.DB.value, CacheBackend.REDIS.value)[ + settings.CACHE_BACKEND == CacheBackend.DB.value + ] + preheat_value: typing.Any = None + slave_labels: typing.Dict = {"type": "slave", "backend": cache_using, "method": func_name} + + metrics.app_core_cache_decorator_requests_total.labels(get_cache=get_cache, **slave_labels).inc() + + if get_cache: + with observe(metrics.app_core_cache_decorator_get_duration_seconds, **slave_labels): + preheat_value = self.get_from_cache(using=cache_using, key=cache_key) + + if preheat_value is None: + with observe(metrics.app_core_cache_decorator_set_duration_seconds, **slave_labels): + self.set_to_cache(using=cache_using, key=cache_key, value=func_result) + elif get_cache: + metrics.app_core_cache_decorator_hits_total.labels(**slave_labels).inc() + + return func_result diff --git a/apps/core/gray/tools.py b/apps/core/gray/tools.py index e7e70696d..729e1cc5e 100644 --- a/apps/core/gray/tools.py +++ b/apps/core/gray/tools.py @@ -12,16 +12,16 @@ from django.utils.translation import ugettext_lazy as _ +from apps.core.concurrent.cache import FuncCacheDecorator from apps.exceptions import ApiError from apps.node_man import constants as node_man_constants from apps.node_man import models as node_man_models -from apps.utils.cache import func_cache_decorator from env.constants import GseVersion class GrayTools: @classmethod - @func_cache_decorator(cache_time=20 * node_man_constants.TimeUnit.SECOND) + @FuncCacheDecorator(cache_time=20 * node_man_constants.TimeUnit.SECOND) def get_or_create_gse2_gray_scope_list(cls) -> typing.List[int]: """ 获取 GSE2.0 灰度列表 diff --git a/apps/exceptions.py b/apps/exceptions.py index 0241c97dd..1be22bd4b 100644 --- a/apps/exceptions.py +++ b/apps/exceptions.py @@ -8,6 +8,8 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
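As a usage sketch for the FuncCacheDecorator defined above (the function name below is illustrative): `get_cache` and `tolerance_time` are popped from kwargs before the cache key is built, so the wrapped function never receives them, and return values must stay JSON-serializable whenever the DB cache backend is in play.

    from apps.core.concurrent.cache import FuncCacheDecorator

    @FuncCacheDecorator(cache_time=60)
    def fetch_cloud_names() -> dict:
        # illustrative expensive lookup; returns a JSON-serializable value
        return {"0": "直连区域"}

    fetch_cloud_names()                   # execute and write the result through to the cache
    fetch_cloud_names(get_cache=True)     # read from the cache first, execute only on a miss
    fetch_cloud_names(tolerance_time=10)  # even without get_cache, reuse an entry written within ~10s

Stacked under SetupObserve, as on get_instances_by_scope above, the histogram then times cache hits and misses alike, which is what makes the hit-rate counters meaningful.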
""" +import typing + from django.utils.translation import ugettext_lazy as _ @@ -120,3 +122,14 @@ class AuthOverdueException(AppBaseException): class BackendValidationError(AppBaseException): ERROR_CODE = 100 MESSAGE = _("参数验证失败") + + +def parse_exception(exc: Exception) -> typing.Dict[str, str]: + if isinstance(exc, AppBaseException): + exc_type = "app" + exc_code = str(exc.code) + else: + exc_type = "unknown" + exc_code = exc.__class__.__name__ + + return {"exc_type": exc_type, "exc_code": exc_code} diff --git a/apps/generic.py b/apps/generic.py index 3fa097d7c..d8e3181d2 100644 --- a/apps/generic.py +++ b/apps/generic.py @@ -20,7 +20,7 @@ from rest_framework.viewsets import ModelViewSet as _ModelViewSet from apps.exceptions import AppBaseException, ErrorCode -from apps.utils import cache +from apps.utils import cache, local from apps.utils.drf import ( CsrfExemptSessionAuthentication, DataPageNumberPagination, @@ -42,9 +42,15 @@ class ApiMixin(GenericViewSet): def initialize_request(self, request, *args, **kwargs): # 实体是为文件时body省略 body = "File" if "multipart/form-data" in request.headers.get("Content-Type", "") else request.body + bk_username = ( + local.get_username_from_request_or_none(self.request) or local.get_request_username_or_local_app_code() + ) + bk_app_code = ( + local.get_appcode_from_request_or_none(self.request) or local.get_request_app_code_or_local_app_code() + ) logger.info( - "[receive request], path: {}, header: {}, body: {}".format( - request.path, request.headers.get("X-Bkapi-App"), body + "[receive request], path: {}, header: {}, body: {}, bk_app_code: {}, bk_username: {}".format( + request.path, request.headers.get("X-Bkapi-App"), body, bk_app_code, bk_username ) ) return super(ApiMixin, self).initialize_request(request, *args, **kwargs) @@ -87,8 +93,12 @@ def validated_data(self): data = self.request.data # 从 esb 获取参数 - bk_username = self.request.META.get("HTTP_BK_USERNAME") - bk_app_code = self.request.META.get("HTTP_BK_APP_CODE") + bk_username = ( + local.get_username_from_request_or_none(self.request) or local.get_request_username_or_local_app_code() + ) + bk_app_code = ( + local.get_appcode_from_request_or_none(self.request) or local.get_request_app_code_or_local_app_code() + ) data = data.copy() data.setdefault("bk_username", bk_username) diff --git a/apps/iam/handlers/resources.py b/apps/iam/handlers/resources.py index 3c9ec3dcd..67a20e8cf 100644 --- a/apps/iam/handlers/resources.py +++ b/apps/iam/handlers/resources.py @@ -119,9 +119,9 @@ def create_instance(cls, instance_id: str, attribute=None) -> Resource: @classmethod def create_instances(cls, instance_ids: Union[List[str], Set[str]], attribute=None) -> List[Resource]: - cloud_id_name_map = models.Cloud.cloud_id_name_map() + cloud_id_name_map = models.Cloud.cloud_id_name_map(get_cache=True) return [ - cls.create_instance(instance_id, {"name": cloud_id_name_map.get(int(instance_id))}) + cls.create_instance(instance_id, {"name": cloud_id_name_map.get(str(instance_id))}) for instance_id in instance_ids ] diff --git a/apps/node_man/constants.py b/apps/node_man/constants.py index e5c1d46d8..f5e32c6f7 100644 --- a/apps/node_man/constants.py +++ b/apps/node_man/constants.py @@ -412,7 +412,16 @@ def _get_member__alias_map(cls) -> Dict[Enum, str]: return {cls.NOT_ALIVE: _("未知"), cls.ALIVE: _("正常"), cls.TERMINATED: _("异常"), cls.NOT_INSTALLED: _("未安装")} -PROC_STATE_TUPLE = ("RUNNING", "UNKNOWN", "TERMINATED", "NOT_INSTALLED", "UNREGISTER", "REMOVED", "MANUAL_STOP") +PROC_STATE_TUPLE = ( + "RUNNING", + "UNKNOWN", + 
"TERMINATED", + "NOT_INSTALLED", + "UNREGISTER", + "REMOVED", + "MANUAL_STOP", + "AGENT_NO_ALIVE", +) PROC_STATE_CHOICES = tuple_choices(PROC_STATE_TUPLE) ProcStateType = choices_to_namedtuple(PROC_STATE_CHOICES) PROC_STATUS_DICT = { diff --git a/apps/node_man/handlers/debug.py b/apps/node_man/handlers/debug.py index 4cc4eb2ab..8d24967c5 100644 --- a/apps/node_man/handlers/debug.py +++ b/apps/node_man/handlers/debug.py @@ -9,11 +9,14 @@ specific language governing permissions and limitations under the License. """ from collections import defaultdict +from datetime import timedelta from typing import Dict, List from django.conf import settings from django.core.exceptions import ObjectDoesNotExist +from django.utils import timezone +from apps.node_man import constants from apps.node_man.exceptions import HostNotExists from apps.node_man.models import ( Host, @@ -177,3 +180,17 @@ def fetch_subscriptions_by_host(self, bk_host_id): ) return result + + def zombie_sub_inst_count(self, days: int): + days = max(days, 1) + days = min(days, 60) + query_kwargs = { + "update_time__range": ( + timezone.now() - timedelta(days=1), + timezone.now() - timedelta(days), + ), + "status__in": [constants.JobStatusType.PENDING, constants.JobStatusType.RUNNING], + } + + count = SubscriptionInstanceRecord.objects.filter(**query_kwargs).count() + return {"count": count} diff --git a/apps/node_man/handlers/job.py b/apps/node_man/handlers/job.py index dfaf347a6..e45ddc525 100644 --- a/apps/node_man/handlers/job.py +++ b/apps/node_man/handlers/job.py @@ -846,12 +846,12 @@ def retrieve(self, params: Dict[str, Any]): host_execute_status_list.extend(filter_hosts) # 补充业务名、管控区域名称 - cloud_id_name_map = models.Cloud.cloud_id_name_map() + cloud_id_name_map = models.Cloud.cloud_id_name_map(get_cache=True) biz_name_map = CmdbHandler.biz_id_name_without_permission() for host_execute_status in host_execute_status_list: host_execute_status.update( bk_biz_name=biz_name_map.get(host_execute_status.get("bk_biz_id")), - bk_cloud_name=cloud_id_name_map.get(host_execute_status["bk_cloud_id"]), + bk_cloud_name=cloud_id_name_map.get(str(host_execute_status["bk_cloud_id"])), ) tools.JobTools.update_job_statistics(self.data, task_result["status_counter"]) diff --git a/apps/node_man/handlers/password.py b/apps/node_man/handlers/password.py index f398a633e..aaf9e849b 100644 --- a/apps/node_man/handlers/password.py +++ b/apps/node_man/handlers/password.py @@ -11,7 +11,9 @@ import base64 import hashlib import hmac +import logging import os +import pprint import re import time from typing import Any, Dict, Tuple @@ -21,8 +23,11 @@ from Crypto.Cipher import AES from apps.node_man import constants +from apps.prometheus import metrics from apps.utils import env +logger = logging.getLogger("app") + class BasePasswordHandler(object): def get_password(self, username: str, cloud_ip_list: list, **options) -> Tuple[bool, Dict, Dict, str]: @@ -36,6 +41,12 @@ def get_password(self, username: str, cloud_ip_list: list, **options) -> Tuple[b """ raise NotImplementedError() + def hande_get_password_result( + self, is_ok: bool, success_ips: Dict, failed_ips: Dict, err_msg: str + ) -> Tuple[bool, Dict, Dict, str]: + metrics.app_core_password_requests_total.labels(handler=self.__class__.__name__, is_ok=is_ok).inc() + return is_ok, success_ips, failed_ips, err_msg + class DefaultPasswordHandler(BasePasswordHandler): """ @@ -191,16 +202,18 @@ def get_password(self, username: str, cloud_ip_list: list, **options) -> Tuple[b try: result = self.post(self.TJJ_ACTION, 
diff --git a/apps/node_man/handlers/password.py b/apps/node_man/handlers/password.py
index f398a633e..aaf9e849b 100644
--- a/apps/node_man/handlers/password.py
+++ b/apps/node_man/handlers/password.py
@@ -11,7 +11,9 @@
 import base64
 import hashlib
 import hmac
+import logging
 import os
+import pprint
 import re
 import time
 from typing import Any, Dict, Tuple
@@ -21,8 +23,11 @@
 from Crypto.Cipher import AES

 from apps.node_man import constants
+from apps.prometheus import metrics
 from apps.utils import env

+logger = logging.getLogger("app")
+

 class BasePasswordHandler(object):
     def get_password(self, username: str, cloud_ip_list: list, **options) -> Tuple[bool, Dict, Dict, str]:
@@ -36,6 +41,12 @@ def get_password(self, username: str, cloud_ip_list: list, **options) -> Tuple[b
         """
         raise NotImplementedError()

+    def handle_get_password_result(
+        self, is_ok: bool, success_ips: Dict, failed_ips: Dict, err_msg: str
+    ) -> Tuple[bool, Dict, Dict, str]:
+        metrics.app_core_password_requests_total.labels(handler=self.__class__.__name__, is_ok=is_ok).inc()
+        return is_ok, success_ips, failed_ips, err_msg
+

 class DefaultPasswordHandler(BasePasswordHandler):
     """
@@ -191,16 +202,18 @@ def get_password(self, username: str, cloud_ip_list: list, **options) -> Tuple[b
         try:
             result = self.post(self.TJJ_ACTION, kwargs)
         except Exception as e:
-            return False, {}, {}, str(e)
+            return self.handle_get_password_result(False, {}, {}, str(e))

         if not result["result"]:
-            return False, {}, {}, result["message"]
+            return self.handle_get_password_result(False, {}, {}, result["message"])

         if result["data"]["HasError"]:
-            return False, {}, {}, str(result["data"]["ResponseItems"])
+            return self.handle_get_password_result(False, {}, {}, str(result["data"]["ResponseItems"]))

         parse_response_items_result = self.parse_response_items(result["data"]["ResponseItems"])
-        return True, parse_response_items_result["success_ips"], parse_response_items_result["failed_ips"], "success"
+        return self.handle_get_password_result(
+            True, parse_response_items_result["success_ips"], parse_response_items_result["failed_ips"], "success"
+        )


 class TjjPasswordHandler(DefaultPasswordHandler):
@@ -226,10 +239,23 @@ def get_password(self, username: str, cloud_ip_list: list, **options) -> Tuple[b
             )
             result = response.json()["Result"]
         except Exception as e:
-            return False, {}, {}, str(e)
+            logger.exception(
+                "[TjjPasswordHandler] failed to get_password, Username -> %s, IpList -> %s",
+                username,
+                pprint.pformat(ip_list),
+            )
+            return self.handle_get_password_result(False, {}, {}, str(e))

         if result["HasError"]:
-            return False, {}, {}, str(result["ResponseItems"])
+            logger.error(
+                "[TjjPasswordHandler] failed to get_password, Username -> %s, IpList -> %s, err -> %s",
+                username,
+                pprint.pformat(ip_list),
+                str(result["ResponseItems"]),
+            )
+            return self.handle_get_password_result(False, {}, {}, str(result["ResponseItems"]))

         parse_response_items_result = self.parse_response_items(result["ResponseItems"])
-        return True, parse_response_items_result["success_ips"], parse_response_items_result["failed_ips"], "success"
+        return self.handle_get_password_result(
+            True, parse_response_items_result["success_ips"], parse_response_items_result["failed_ips"], "success"
+        )
diff --git a/apps/node_man/handlers/policy.py b/apps/node_man/handlers/policy.py
index e0f7ae5e9..8ad07843d 100644
--- a/apps/node_man/handlers/policy.py
+++ b/apps/node_man/handlers/policy.py
@@ -509,7 +509,7 @@ def migrate_preview(cls, query_params: Dict[str, Any]) -> List[Dict[str, Any]]:
         )

         # 补充业务名、管控区域名称
-        cloud_id_name_map = models.Cloud.cloud_id_name_map()
+        cloud_id_name_map = models.Cloud.cloud_id_name_map(get_cache=True)
         biz_name_map = CmdbHandler.biz_id_name_without_permission()

         results = []
@@ -517,7 +517,7 @@ def migrate_preview(cls, query_params: Dict[str, Any]) -> List[Dict[str, Any]]:
         for instance in instances:
             instance.update(
                 bk_biz_name=biz_name_map.get(instance.get("bk_biz_id")),
-                bk_cloud_name=cloud_id_name_map.get(instance["bk_cloud_id"]),
+                bk_cloud_name=cloud_id_name_map.get(str(instance["bk_cloud_id"])),
             )

             inst_job_type = constants.ACTION_NAME_JOB_TYPE_MAP.get(action_id)
diff --git a/apps/node_man/migrations/0076_auto_20230924_2330.py b/apps/node_man/migrations/0076_auto_20230924_2330.py
new file mode 100644
index 000000000..05992137a
--- /dev/null
+++ b/apps/node_man/migrations/0076_auto_20230924_2330.py
@@ -0,0 +1,77 @@
+# Generated by Django 3.2.4 on 2023-09-24 15:30
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("node_man", "0075_pluginconfigtemplate_variables"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="job",
+            name="from_system",
+            field=models.CharField(db_index=True, default="", max_length=45, verbose_name="所属系统"),
+        ),
+        migrations.AlterField(
+            model_name="job",
name="created_by", + field=models.CharField(db_index=True, default="", max_length=45, verbose_name="操作人"), + ), + migrations.AlterField( + model_name="job", + name="end_time", + field=models.DateTimeField(blank=True, db_index=True, null=True, verbose_name="任务结束时间"), + ), + migrations.AlterField( + model_name="job", + name="job_type", + field=models.CharField( + choices=[ + ("INSTALL_AGENT", "INSTALL_AGENT"), + ("RESTART_AGENT", "RESTART_AGENT"), + ("REINSTALL_AGENT", "REINSTALL_AGENT"), + ("UNINSTALL_AGENT", "UNINSTALL_AGENT"), + ("REMOVE_AGENT", "REMOVE_AGENT"), + ("UPGRADE_AGENT", "UPGRADE_AGENT"), + ("IMPORT_AGENT", "IMPORT_AGENT"), + ("RESTART_AGENT", "RESTART_AGENT"), + ("RELOAD_AGENT", "RELOAD_AGENT"), + ("ACTIVATE_AGENT", "ACTIVATE_AGENT"), + ("MAIN_START_PLUGIN", "MAIN_START_PLUGIN"), + ("MAIN_STOP_PLUGIN", "MAIN_STOP_PLUGIN"), + ("MAIN_RESTART_PLUGIN", "MAIN_RESTART_PLUGIN"), + ("MAIN_RELOAD_PLUGIN", "MAIN_RELOAD_PLUGIN"), + ("MAIN_DELEGATE_PLUGIN", "MAIN_DELEGATE_PLUGIN"), + ("MAIN_UNDELEGATE_PLUGIN", "MAIN_UNDELEGATE_PLUGIN"), + ("MAIN_INSTALL_PLUGIN", "MAIN_INSTALL_PLUGIN"), + ("MAIN_STOP_AND_DELETE_PLUGIN", "MAIN_STOP_AND_DELETE_PLUGIN"), + ("DEBUG_PLUGIN", "DEBUG_PLUGIN"), + ("STOP_DEBUG_PLUGIN", "STOP_DEBUG_PLUGIN"), + ("PUSH_CONFIG_PLUGIN", "PUSH_CONFIG_PLUGIN"), + ("REMOVE_CONFIG_PLUGIN", "REMOVE_CONFIG_PLUGIN"), + ("PACKING_PLUGIN", "PACKING_PLUGIN"), + ("INSTALL_PROXY", "INSTALL_PROXY"), + ("RESTART_PROXY", "RESTART_PROXY"), + ("REINSTALL_PROXY", "REINSTALL_PROXY"), + ("REPLACE_PROXY", "REPLACE_PROXY"), + ("UNINSTALL_PROXY", "UNINSTALL_PROXY"), + ("UPGRADE_PROXY", "UPGRADE_PROXY"), + ("IMPORT_PROXY", "IMPORT_PROXY"), + ("RESTART_PROXY", "RESTART_PROXY"), + ("RELOAD_PROXY", "RELOAD_PROXY"), + ], + db_index=True, + default="INSTALL_PROXY", + max_length=45, + verbose_name="作业类型", + ), + ), + migrations.AlterField( + model_name="job", + name="start_time", + field=models.DateTimeField(auto_now_add=True, db_index=True, verbose_name="创建任务时间"), + ), + ] diff --git a/apps/node_man/models.py b/apps/node_man/models.py index 8b1747289..4e9460e43 100644 --- a/apps/node_man/models.py +++ b/apps/node_man/models.py @@ -44,6 +44,7 @@ from apps.backend.subscription.errors import PipelineExecuteFailed, SubscriptionNotExist from apps.backend.subscription.render_functions import get_hosts_by_node from apps.backend.utils.data_renderer import nested_render_data +from apps.core.concurrent.cache import FuncCacheDecorator from apps.core.files.storage import get_storage from apps.exceptions import ValidationError from apps.node_man import constants @@ -61,6 +62,7 @@ export_subscription_prometheus_mixin, ) from apps.utils import basic, files, orm, translation +from apps.utils.cache import class_member_cache from common.log import logger from env.constants import GseVersion from pipeline.parser import PipelineParser @@ -718,11 +720,13 @@ class Cloud(models.Model): is_deleted = models.BooleanField(_("是否删除"), default=False) @classmethod - def cloud_id_name_map(cls) -> Dict: + @FuncCacheDecorator(cache_time=20 * constants.TimeUnit.SECOND) + def cloud_id_name_map(cls) -> Dict[str, str]: all_cloud_map = { - cloud.bk_cloud_id: cloud.bk_cloud_name for cloud in cls.objects.all().only("bk_cloud_id", "bk_cloud_name") + str(cloud.bk_cloud_id): cloud.bk_cloud_name + for cloud in cls.objects.all().only("bk_cloud_id", "bk_cloud_name") } - all_cloud_map[constants.DEFAULT_CLOUD] = str(_("直连区域")) + all_cloud_map[str(constants.DEFAULT_CLOUD)] = str(_("直连区域")) return all_cloud_map @classmethod @@ -805,14 +809,19 @@ 
class Meta: class Job(export_job_prometheus_mixin(), models.Model): """任务信息""" - created_by = models.CharField(_("操作人"), max_length=45, default="") + created_by = models.CharField(_("操作人"), max_length=45, default="", db_index=True) + from_system = models.CharField(_("所属系统"), max_length=45, default="", db_index=True) job_type = models.CharField( - _("作业类型"), max_length=45, choices=constants.JOB_CHOICES, default=constants.JobType.INSTALL_PROXY + _("作业类型"), + max_length=45, + choices=constants.JOB_CHOICES, + default=constants.JobType.INSTALL_PROXY, + db_index=True, ) subscription_id = models.IntegerField(_("订阅ID"), db_index=True) task_id_list = JSONField(_("任务ID列表"), default=list) - start_time = models.DateTimeField(_("创建任务时间"), auto_now_add=True) - end_time = models.DateTimeField(_("任务结束时间"), blank=True, null=True) + start_time = models.DateTimeField(_("创建任务时间"), auto_now_add=True, db_index=True) + end_time = models.DateTimeField(_("任务结束时间"), blank=True, null=True, db_index=True) status = models.CharField( _("任务状态"), max_length=45, choices=constants.JobStatusType.get_choices(), default=constants.JobStatusType.PENDING ) @@ -1588,6 +1597,14 @@ def __str__(self): f"selector -> [{self.plugin_name}|{self.plugin_version}|{self.os}|{self.cpu_arch}]>" ) + @property + @class_member_cache() + def md5(self) -> str: + md5 = hashlib.md5() + md5.update(self.content.encode()) + md5sum = md5.hexdigest() + return md5sum + def create_instance(self, data, creator=None, source_app_code=None): """ 返回 PluginConfigInstance 实例 diff --git a/apps/node_man/periodic_tasks/add_biz_to_gse2_gray_scope.py b/apps/node_man/periodic_tasks/add_biz_to_gse2_gray_scope.py index a600fbd81..4b955cebd 100644 --- a/apps/node_man/periodic_tasks/add_biz_to_gse2_gray_scope.py +++ b/apps/node_man/periodic_tasks/add_biz_to_gse2_gray_scope.py @@ -13,7 +13,7 @@ from celery.task import periodic_task from django.db import transaction -from apps.backend.subscription.tools import search_business +from apps.backend.subscription.tools import fetch_biz_info_map from apps.core.gray.handlers import GrayHandler from apps.node_man.constants import SYNC_BIZ_TO_GRAY_SCOPE_LIST_INTERVAL from apps.node_man.models import GlobalSettings @@ -37,7 +37,7 @@ def sync_new_biz_to_gray_scope_list(): logger.info(f"sync_new_biz_to_gray_scope_list: {task_id} No need to add new biz to GSE2_GRAY_SCOPE_LIST.") return None - cc_all_biz: List[Dict[str, int]] = search_business({}) + cc_all_biz: List[Dict[str, int]] = list(fetch_biz_info_map().values()) cc_all_biz_ids: List[int] = [biz["bk_biz_id"] for biz in cc_all_biz] new_biz_ids: List[int] = list(set(cc_all_biz_ids) - set(all_biz_ids)) diff --git a/apps/node_man/periodic_tasks/resource_watch_task.py b/apps/node_man/periodic_tasks/resource_watch_task.py index 4436057e0..4d61b09b3 100644 --- a/apps/node_man/periodic_tasks/resource_watch_task.py +++ b/apps/node_man/periodic_tasks/resource_watch_task.py @@ -22,6 +22,7 @@ from apps.component.esbclient import client_v2 from apps.node_man import constants from apps.node_man.models import GlobalSettings, Host, ResourceWatchEvent, Subscription +from apps.prometheus import metrics from apps.utils.cache import format_cache_key logger = logging.getLogger("app") @@ -200,6 +201,11 @@ def _resource_watch(cursor_key, kwargs): ] ResourceWatchEvent.objects.bulk_create(objs) + for obj in objs: + metrics.app_resource_watch_events_total.labels( + type="producer", bk_resource=obj.bk_resource, bk_event_type=obj.bk_event_type + ).inc() + logger.info(f"[{cursor_key}] receive new resource watch 
event: count -> {len(objs)}") # 记录最新cursor @@ -277,12 +283,16 @@ def _get_event_str(_event): continue events_after_convergence = HostEventPreprocessHelper.event_convergence(events) + logger.info(f"[{config_key}] length of events_after_convergence -> {len(events_after_convergence)}") for event in events_after_convergence: event_str = _get_event_str(event) event_bk_biz_id = event["bk_detail"].get("bk_biz_id") + metrics.app_resource_watch_events_total.labels(type="convergence", bk_resource="-", bk_event_type="-").inc() logger.info(f"[{config_key}] event being consumed -> {event_str}") try: + if event_bk_biz_id: + metrics.app_resource_watch_biz_events_total.labels(bk_biz_id=event_bk_biz_id).inc() if event_bk_biz_id: # 触发同步CMDB trigger_sync_cmdb_host(bk_biz_id=event_bk_biz_id) @@ -301,6 +311,10 @@ def _get_event_str(_event): # 删除事件记录 ResourceWatchEvent.objects.filter(bk_cursor__in=[event["bk_cursor"] for event in events]).delete() + for event in events: + metrics.app_resource_watch_events_total.labels( + type="consumer", bk_resource=event["bk_resource"], bk_event_type=event["bk_event_type"] + ).inc() def func_debounce_decorator(func): @@ -374,6 +388,9 @@ def trigger_sync_cmdb_host(bk_biz_id, debounce_time=0): """ from apps.node_man.periodic_tasks.sync_cmdb_host import sync_cmdb_host_periodic_task + metrics.app_resource_watch_trigger_total.labels( + method="sync_cmdb_host", bk_biz_id=bk_biz_id, debounce_time=debounce_time + ).inc() sync_cmdb_host_periodic_task.apply_async(kwargs={"bk_biz_id": bk_biz_id}, countdown=debounce_time) logger.info(f"[trigger_sync_cmdb_host] bk_biz_id -> {bk_biz_id} will be run after {debounce_time} s") @@ -406,6 +423,10 @@ def trigger_nodeman_subscription(bk_biz_id, debounce_time=0): if not subscription_ids: logger.info("[trigger_nodeman_subscription] bk_biz_id->({}) no subscriptions to run".format(bk_biz_id)) + metrics.app_resource_watch_trigger_total.labels( + method="subscription", bk_biz_id=bk_biz_id, debounce_time=debounce_time + ).inc() + update_subscription_instances_chunk.apply_async( kwargs={"subscription_ids": subscription_ids}, countdown=debounce_time ) diff --git a/apps/node_man/tests/test_handlers/test_policy.py b/apps/node_man/tests/test_handlers/test_policy.py index 88ff05064..cf2021184 100644 --- a/apps/node_man/tests/test_handlers/test_policy.py +++ b/apps/node_man/tests/test_handlers/test_policy.py @@ -34,7 +34,7 @@ from apps.utils.unittest.testcase import CustomBaseTestCase -def get_instances_by_scope(scope): +def get_instances_by_scope(scope, **kwargs): host_id = scope["nodes"][0]["bk_host_id"] host = Host.objects.filter(bk_host_id=host_id) instance_key = f"host|instance|host|{host_id}" diff --git a/apps/node_man/tests/test_pericdic_tasks/test_add_biz_to_gse2_gray_scope.py b/apps/node_man/tests/test_pericdic_tasks/test_add_biz_to_gse2_gray_scope.py index fe9d121fd..932623ca2 100644 --- a/apps/node_man/tests/test_pericdic_tasks/test_add_biz_to_gse2_gray_scope.py +++ b/apps/node_man/tests/test_pericdic_tasks/test_add_biz_to_gse2_gray_scope.py @@ -39,8 +39,8 @@ def init_db(self): ) @mock.patch( - "apps.node_man.periodic_tasks.add_biz_to_gse2_gray_scope.search_business", - return_value=[{"bk_biz_id": 10, "bk_biz_name": ""}], + "apps.node_man.periodic_tasks.add_biz_to_gse2_gray_scope.fetch_biz_info_map", + return_value={10: {"bk_biz_id": 10, "bk_biz_name": ""}}, ) def test_add_biz_to_gse2_gray_scope(self, *args, **kwargs): # 未开启同步 diff --git a/apps/node_man/tools/job.py b/apps/node_man/tools/job.py index 6b6ce88b5..1549627bd 100644 --- 
a/apps/node_man/tools/job.py +++ b/apps/node_man/tools/job.py @@ -17,11 +17,14 @@ from django.utils import timezone from django.utils.translation import ugettext_lazy as _ +from apps.backend.subscription import tools from apps.node_man import constants, models from apps.utils import basic -from apps.utils.local import get_request_username +from apps.utils.local import ( + get_request_app_code_or_local_app_code, + get_request_username, +) from common.api import NodeApi -from apps.backend.subscription import tools class JobTools: @@ -259,7 +262,7 @@ def get_instance_ids_by_ips(cls, inner_ip_list: List[str]) -> List[str]: instance_id_list = [] for host in list( - models.Host.objects.filter(host_query).values("inner_ip", "inner_ipv6", "bk_cloud_id", "bk_host_id") + models.Host.objects.filter(host_query).values("inner_ip", "inner_ipv6", "bk_cloud_id", "bk_host_id") ): instance_id_list.extend( [ @@ -378,6 +381,7 @@ def create_job( statistics=statistics or {}, error_hosts=error_hosts or [], created_by=get_request_username(), + from_system=get_request_app_code_or_local_app_code(), ) return {"job_id": job.id, "job_url": cls.get_job_url(job.id)} diff --git a/apps/prometheus/helper.py b/apps/prometheus/helper.py new file mode 100644 index 000000000..046e37373 --- /dev/null +++ b/apps/prometheus/helper.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- +""" +TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available. +Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved. +Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at https://opensource.org/licenses/MIT +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. 
+""" + + +import inspect +import time +import typing +from contextlib import contextmanager + +import wrapt +from prometheus_client import Counter, Gauge, Histogram + +from apps.utils import local, sync + +HOST_NAME = local.get_hostname() + +GetLabelsFuncT = typing.Callable[ + [typing.Callable, typing.Any, typing.Tuple[typing.Any], typing.Dict[str, typing.Any]], typing.Dict[str, typing.Any] +] + + +def get_call_resource_labels_func( + wrapped: typing.Callable, + instance: typing.Any, + args: typing.Tuple[typing.Any], + kwargs: typing.Dict[str, typing.Any], +) -> typing.Dict[str, str]: + source = kwargs.pop("source", "default") + return {"method": wrapped.__name__, "source": source} + + +@contextmanager +def observe(histogram: Histogram, **labels): + start = time.perf_counter() + yield + histogram.labels(**labels).observe(time.perf_counter() - start) + + +class SetupObserve: + gauge: typing.Optional[Gauge] = None + counter: typing.Optional[Counter] = None + histogram: typing.Optional[Histogram] = None + labels: typing.Optional[typing.Dict[str, typing.Any]] = None + get_labels_func: typing.Optional[GetLabelsFuncT] = None + include_exception_histogram: bool = True + + @staticmethod + def default_get_labels_func( + wrapped: typing.Callable, + instance: typing.Any, + args: typing.Tuple[typing.Any], + kwargs: typing.Dict[str, typing.Any], + ) -> typing.Dict[str, str]: + return {} + + def __init__( + self, + gauge: typing.Optional[Gauge] = None, + counter: typing.Optional[Counter] = None, + histogram: typing.Optional[Histogram] = None, + get_labels_func: typing.Optional[GetLabelsFuncT] = None, + labels: typing.Optional[typing.Dict[str, typing.Any]] = None, + include_exception_histogram: bool = True, + ): + """ + :param 测量指标数组 + :param get_labels_func: 获取标签的方法 + :param labels: 标签 + """ + self.gauge = gauge + self.counter = counter + self.histogram = histogram + self.labels = labels + self.get_labels_func = get_labels_func or self.default_get_labels_func + self.include_exception_histogram = include_exception_histogram + + async def wrapped_async_executor( + self, + wrapped: typing.Callable, + instance: typing.Any, + args: typing.Tuple[typing.Any], + kwargs: typing.Dict[str, typing.Any], + ) -> typing.Any: + """ + 实例任务执行器,协程模式 + :param wrapped: 被装饰的函数或类方法 + :param instance: 参考 __init__ + :param args: 位置参数 + :param kwargs: 关键字参数 + :return: + """ + + labels = await sync.sync_to_async(self.get_labels)(wrapped, instance, args, kwargs) + self.gauge_inc(labels) + start = time.perf_counter() + try: + result = await wrapped(*args, **kwargs) + except Exception: + raise + else: + if not self.include_exception_histogram: + self.histogram_observe(labels, start) + return result + finally: + self.gauge_dec(labels) + if self.include_exception_histogram: + self.histogram_observe(labels, start) + + def wrapped_executor( + self, + wrapped: typing.Callable, + instance: typing.Any, + args: typing.Tuple[typing.Any], + kwargs: typing.Dict[str, typing.Any], + ): + """ + 实例任务执行器 + :param wrapped: 被装饰的函数或类方法 + :param instance: 基础Pipeline服务 + :param args: 位置参数 + :param kwargs: 关键字参数 + :return: + """ + labels = self.get_labels(wrapped, instance, args, kwargs) + self.gauge_inc(labels) + self.counter_inc(labels) + start = time.perf_counter() + try: + result = wrapped(*args, **kwargs) + except Exception: + raise + else: + if not self.include_exception_histogram: + self.histogram_observe(labels, start) + return result + finally: + self.gauge_dec(labels) + if self.include_exception_histogram: + self.histogram_observe(labels, 
start) + + def get_labels( + self, + wrapped: typing.Callable, + instance: typing.Any, + args: typing.Tuple[typing.Any], + kwargs: typing.Dict[str, typing.Any], + ) -> typing.Dict[str, str]: + return self.labels or self.get_labels_func(wrapped, instance, args, kwargs) or {} + + def gauge_inc(self, labels: typing.Dict[str, str]): + if not self.gauge: + return + if labels: + self.gauge.labels(**labels).inc(1) + else: + self.gauge.inc(1) + + def counter_inc(self, labels: typing.Dict[str, str]): + if not self.counter: + return + if labels: + self.counter.labels(**labels).inc(1) + else: + self.counter.inc(1) + + def gauge_dec(self, labels: typing.Dict[str, str]): + if not self.gauge: + return + + if labels: + self.gauge.labels(**labels).dec(1) + else: + self.gauge.dec(1) + + def histogram_observe(self, labels: typing.Dict[str, str], start: float): + if not self.histogram: + return + if labels: + self.histogram.labels(**labels).observe(time.perf_counter() - start) + else: + self.histogram.observe(time.perf_counter() - start) + + @wrapt.decorator + def __call__( + self, + wrapped: typing.Callable, + instance: typing.Any, + args: typing.Tuple[typing.Any], + kwargs: typing.Dict[str, typing.Any], + ) -> typing.Any: + """ + :param wrapped: 被装饰的函数或类方法 + :param instance: + - 如果被装饰者为普通类方法,该值为类实例 + - 如果被装饰者为 classmethod / 类方法,该值为类 + - 如果被装饰者为类/函数/静态方法,该值为 None + :param args: 位置参数 + :param kwargs: 关键字参数 + :return: + """ + if inspect.iscoroutinefunction(wrapped): + # 交给上层通过 await 方式执行 + return self.wrapped_async_executor(wrapped, instance, args, kwargs) + else: + return self.wrapped_executor(wrapped, instance, args, kwargs) diff --git a/apps/prometheus/metrics.py b/apps/prometheus/metrics.py new file mode 100644 index 000000000..021f4cc9c --- /dev/null +++ b/apps/prometheus/metrics.py @@ -0,0 +1,271 @@ +# -*- coding: utf-8 -*- +""" +TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available. +Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved. +Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at https://opensource.org/licenses/MIT +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. 
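With SetupObserve now fully defined, a hedged sketch of decorating both a plain function and a coroutine (toy metric names; the real call sites in this patch pass metrics from apps.prometheus.metrics, and note that only the synchronous path increments a counter):

    import asyncio

    from prometheus_client import Counter, Histogram

    from apps.prometheus.helper import SetupObserve

    demo_calls_total = Counter("demo_calls_total", "Toy counter.", ["method"])
    demo_duration_seconds = Histogram("demo_duration_seconds", "Toy histogram.", ["method"])

    def method_labels(wrapped, instance, args, kwargs):
        return {"method": wrapped.__name__}

    @SetupObserve(counter=demo_calls_total, histogram=demo_duration_seconds, get_labels_func=method_labels)
    def sync_task():
        return "ok"

    @SetupObserve(histogram=demo_duration_seconds, get_labels_func=method_labels)
    async def async_task():
        await asyncio.sleep(0)
        return "ok"

    sync_task()                # counted and timed via wrapped_executor
    asyncio.run(async_task())  # timed via wrapped_async_executor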
+""" + +import os + +from django_prometheus.conf import NAMESPACE +from prometheus_client import Counter, Gauge, Histogram + + +def decode_buckets(buckets_list): + return [float(x) for x in buckets_list.split(",")] + + +def get_histogram_buckets_from_env(env_name): + if env_name in os.environ: + buckets = decode_buckets(os.environ.get(env_name)) + else: + buckets = ( + 0.005, + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 15.0, + 20.0, + 25.0, + 50.0, + 75.0, + 100.0, + float("inf"), + ) + return buckets + + +jobs_by_op_type_operate_step = Counter( + "django_app_jobs_by_operate_step", + "Count of jobs by operate, step.", + ["operate", "step"], + namespace=NAMESPACE, +) + + +subscriptions_by_object_node_category = Counter( + "django_app_subscriptions_by_object_node_category", + "Count of subscriptions by object, node, category.", + ["object", "node", "category"], + namespace=NAMESPACE, +) + + +app_task_jobs_total = Counter( + name="app_task_jobs_total", + documentation="Cumulative count of jobs.", + labelnames=["operate", "step", "from_system"], + namespace=NAMESPACE, +) + +app_task_subscriptions_total = Counter( + name="app_task_subscriptions_total", + documentation="Cumulative count of subscriptions.", + labelnames=["object", "node", "category"], + namespace=NAMESPACE, +) + + +app_task_instances_migrate_actions_total = Counter( + name="app_task_instances_migrate_actions_total", + documentation="Cumulative count of instances migrate actions per step_id, per action.", + labelnames=["step_id", "action"], + namespace=NAMESPACE, +) + +app_task_instances_migrate_reasons_total = Counter( + name="app_task_instances_migrate_reasons_total", + documentation="Cumulative count of instances migrate reasons per step_id, per reason.", + labelnames=["step_id", "reason"], + namespace=NAMESPACE, +) + +app_task_get_instances_by_scope_duration_seconds = Histogram( + name="app_task_get_instances_by_scope_duration_seconds", + documentation="Histogram of the time (in seconds) each get instances per source", + buckets=get_histogram_buckets_from_env("BKAPP_MONITOR_METRICS_ENGINE_BUCKETS"), + labelnames=["node_type", "object_type", "source"], +) + +app_task_engine_running_executes_info = Gauge( + name="app_task_engine_running_executes", + documentation="Number of engine running executes per code.", + labelnames=["code"], +) + +app_task_engine_running_schedules_info = Gauge( + name="app_task_engine_running_schedules", + documentation="Number of engine running schedules per code.", + labelnames=["code"], +) + +app_task_engine_execute_duration_seconds = Histogram( + name="app_task_engine_execute_duration_seconds", + documentation="Histogram of the time (in seconds) each engine execute per code.", + buckets=get_histogram_buckets_from_env("BKAPP_MONITOR_METRICS_ENGINE_BUCKETS"), + labelnames=["code"], +) + +app_task_engine_schedule_duration_seconds = Histogram( + name="app_task_engine_schedule_duration_seconds", + documentation="Histogram of the time (in seconds) each engine schedule per code.", + buckets=get_histogram_buckets_from_env("BKAPP_MONITOR_METRICS_ENGINE_BUCKETS"), + labelnames=["code"], +) + +app_task_engine_service_run_exceptions_total = Counter( + name="app_task_engine_service_run_exceptions_total", + documentation="Cumulative count of engine service run exceptions " "per code, per exc_type, per exc_code.", + labelnames=["code", "exc_type", "exc_code"], +) + +app_task_engine_sub_inst_statuses_total = Counter( + 
name="app_task_engine_sub_inst_statuses_total", + documentation="Cumulative count of engine subscription instance statuses per status.", + labelnames=["status"], +) + +app_task_engine_sub_inst_step_statuses_total = Counter( + name="app_task_engine_sub_inst_step_statuses_total", + documentation="Cumulative count of engine subscription instance step statuses " + "per step_id, step_type, step_num, step_index, gse_version, action, code, status.", + labelnames=["step_id", "step_type", "step_num", "step_index", "gse_version", "action", "code", "status"], +) + +app_task_engine_get_common_data_duration_seconds = Histogram( + name="app_task_engine_get_common_data_duration_seconds", + documentation="Histogram of the time (in seconds) each get common data per step_type.", + buckets=get_histogram_buckets_from_env("BKAPP_MONITOR_METRICS_ENGINE_BUCKETS"), + labelnames=["step_type"], +) + + +app_task_engine_set_sub_inst_statuses_duration_seconds = Histogram( + name="app_task_engine_set_sub_inst_statuses_duration_seconds", + documentation="Histogram of the time (in seconds) each set subscription instance statuses.", + buckets=get_histogram_buckets_from_env("BKAPP_MONITOR_METRICS_ENGINE_BUCKETS"), +) + +app_task_engine_set_sub_inst_act_statuses_duration_seconds = Histogram( + name="app_task_engine_set_sub_inst_act_statuses_duration_seconds", + documentation="Histogram of the time (in seconds) each set subscription instance activity statuses.", + buckets=get_histogram_buckets_from_env("BKAPP_MONITOR_METRICS_ENGINE_BUCKETS"), +) + +app_plugin_render_configs_total = Counter( + name="app_plugin_render_configs_total", + documentation="Cumulative count of render plugin configs " + "per plugin_name, per name, per os, per cpu_arch, per type.", + labelnames=["plugin_name", "name", "os", "cpu_arch", "source", "type"], +) + +app_core_remote_connects_total = Counter( + name="app_core_remote_connects_total", + documentation="Cumulative count of remote connects per method," + " per username, per port, per auth_type, per os_type, per status", + labelnames=["method", "username", "port", "auth_type", "os_type", "status"], +) + +app_core_remote_connect_exceptions_total = Counter( + name="app_core_remote_connect_exceptions_total", + documentation="Cumulative count of remote connect exceptions per method," + " per username, per port, per auth_type, per os_type, per exc_type, per exc_code.", + labelnames=["method", "username", "port", "auth_type", "os_type", "exc_type", "exc_code"], +) + +app_core_remote_execute_duration_seconds = Histogram( + name="app_core_remote_execute_duration_seconds", + documentation="Histogram of the time (in seconds) each remote execute per method.", + buckets=get_histogram_buckets_from_env("BKAPP_MONITOR_METRICS_CORE_BUCKETS"), + labelnames=["method"], +) + +app_core_remote_batch_execute_duration_seconds = Histogram( + name="app_core_remote_batch_execute_duration_seconds", + documentation="Histogram of the time (in seconds) each remote batch execute per method.", + buckets=get_histogram_buckets_from_env("BKAPP_MONITOR_METRICS_CORE_BUCKETS"), + labelnames=["method"], +) + +app_core_remote_proxy_info = Gauge( + name="app_core_remote_proxy_info", + documentation="A metric with a constants '1' value labeled by proxy_name, proxy_ip, bk_cloud_id, paramiko_version", + labelnames=["proxy_name", "proxy_ip", "bk_cloud_id", "paramiko_version"], +) + +app_core_remote_install_exceptions_total = Counter( + name="app_core_remote_install_exceptions_total", + documentation="Cumulative count of remote install 
exceptions per step, per os_type, per node_type", + labelnames=["step", "os_type", "node_type"], +) + +app_core_cache_decorator_requests_total = Counter( + name="app_core_cache_decorator_requests_total", + documentation="Cumulative count of cache decorator requests per type, per backend, per method, per get_cache.", + labelnames=["type", "backend", "method", "get_cache"], +) + +app_core_cache_decorator_hits_total = Counter( + name="app_core_cache_decorator_hits_total", + documentation="Cumulative count of cache decorator hits per type, per backend, per method.", + labelnames=["type", "backend", "method"], +) + +app_core_cache_decorator_get_duration_seconds = Histogram( + name="app_core_cache_decorator_get_duration_seconds", + documentation="Histogram of the time (in seconds) each decorator get cache per type, per backend, per method.", + buckets=get_histogram_buckets_from_env("BKAPP_MONITOR_METRICS_CORE_BUCKETS"), + labelnames=["type", "backend", "method"], +) + +app_core_cache_decorator_set_duration_seconds = Histogram( + name="app_core_cache_decorator_set_duration_seconds", + documentation="Histogram of the time (in seconds) each decorator set cache per type, per backend, per method.", + buckets=get_histogram_buckets_from_env("BKAPP_MONITOR_METRICS_CORE_BUCKETS"), + labelnames=["type", "backend", "method"], +) + +app_core_password_requests_total = Counter( + name="app_core_password_requests_total", + documentation="Cumulative count of password requests per handler, per is_ok.", + labelnames=["handler", "is_ok"], +) + + +app_common_method_requests_total = Counter( + name="app_common_method_requests_total", + documentation="Cumulative count of method requests per method, per source.", + labelnames=["method", "source"], +) + +app_resource_watch_events_total = Counter( + name="app_resource_watch_events_total", + documentation="Cumulative count of resource watch events per type, per bk_resource, per bk_event_type.", + labelnames=["type", "bk_resource", "bk_event_type"], +) + +app_resource_watch_trigger_total = Counter( + name="app_resource_watch_trigger_total", + documentation="Cumulative count of resource watch trigger per method, per bk_biz_id, per debounce_time.", + labelnames=["method", "bk_biz_id", "debounce_time"], +) + +app_resource_watch_biz_events_total = Counter( + name="app_resource_watch_biz_events_total", + documentation="Cumulative count of resource watch biz events per bk_biz_id.", + labelnames=["bk_biz_id"], +) diff --git a/apps/prometheus/middlewares.py b/apps/prometheus/middlewares.py index 370581c36..781787eb3 100644 --- a/apps/prometheus/middlewares.py +++ b/apps/prometheus/middlewares.py @@ -15,6 +15,8 @@ from django_prometheus.middleware import Metrics from prometheus_client import Counter +from apps.utils.local import get_appcode_from_request_or_none + class NodeManMetrics(Metrics): def register(self): @@ -38,7 +40,7 @@ def label_metric(self, metric, request, response=None, **labels): return metric.labels(**labels) if labels else metric def _app_code(self, request): - return request.META.get("HTTP_BK_APP_CODE", settings.APP_CODE) + return get_appcode_from_request_or_none(request) or settings.APP_CODE def process_view(self, request, view_func, *view_args, **view_kwargs): if hasattr(request, "resolver_match"): diff --git a/apps/prometheus/models.py b/apps/prometheus/models.py index 78dfb8a44..3a01556c7 100644 --- a/apps/prometheus/models.py +++ b/apps/prometheus/models.py @@ -11,23 +11,7 @@ import typing -from django_prometheus.conf import NAMESPACE -from 
prometheus_client import Counter - -jobs_by_op_type_operate_step = Counter( - "django_app_jobs_by_operate_step", - "Count of jobs by operate, step.", - ["operate", "step"], - namespace=NAMESPACE, -) - - -subscriptions_by_object_node_category = Counter( - "django_app_subscriptions_by_object_node_category", - "Count of subscriptions by object, node, category.", - ["object", "node", "category"], - namespace=NAMESPACE, -) +from . import metrics def export_job_prometheus_mixin(): @@ -35,6 +19,7 @@ def export_job_prometheus_mixin(): class Mixin: job_type: str = None + from_system: str = None task_id_list: typing.List[int] = None _origin_task_id_list: typing.List[int] = None @@ -51,7 +36,8 @@ def unpacking_job_type(self) -> typing.Tuple[str, str]: def inc(self): operate, step = self.unpacking_job_type() - jobs_by_op_type_operate_step.labels(operate, step).inc() + metrics.jobs_by_op_type_operate_step.labels(operate, step).inc() + metrics.app_task_jobs_total.labels(operate, step, self.from_system or "default").inc() def _do_insert(self, *args, **kwargs): self.inc() @@ -69,7 +55,10 @@ class Mixin: category: typing.Optional[str] = None def _do_insert(self, *args, **kwargs): - subscriptions_by_object_node_category.labels( + metrics.app_task_subscriptions_total.labels( + self.object_type, self.node_type, self.category or "subscription" + ).inc() + metrics.subscriptions_by_object_node_category.labels( self.object_type, self.node_type, self.category or "subscription" ).inc() return super()._do_insert(*args, **kwargs) diff --git a/apps/prometheus/reporter.py b/apps/prometheus/reporter.py new file mode 100644 index 000000000..7516b9248 --- /dev/null +++ b/apps/prometheus/reporter.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +""" +TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available. +Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved. +Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at https://opensource.org/licenses/MIT +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. 
+""" +import os +import time +from typing import Dict + +from bk_monitor_report import MonitorReporter as Reporter +from celery.utils.nodenames import gethostname, host_format +from prometheus_client import generate_latest +from prometheus_client.parser import text_string_to_metric_families + +# logger = logging.getLogger("bk-monitor-report") + + +class MonitorReporter(Reporter): + def __init__( + self, + data_id: int, + access_token: str, + target: str, + url: str, + report_interval: int = 60, + chunk_size: int = 500, + proc_type: str = "", + instance_tmpl: str = "", + ): + super().__init__(data_id, access_token, target, url, report_interval, chunk_size) + self.proc_type = proc_type + self.instance_tmpl = instance_tmpl + + def generate_addition_labels(self) -> Dict[str, str]: + addition_labels: Dict[str, str] = {"hostname": gethostname()} + if self.proc_type == "celery": + # 进程可用变量:https://docs.celeryq.dev/en/stable/userguide/workers.html + # 启动参数:https://docs.celeryq.dev/en/stable/reference/cli.html#cmdoption-celery-worker-n + addition_labels["nodeman_instance"] = host_format(self.instance_tmpl) + else: + addition_labels["nodeman_instance"] = host_format(self.instance_tmpl, P=str(os.getpid())) + return addition_labels + + def generate_chunked_report_data(self): + timestamp = round(time.time() * 1000) + + addition_labels = self.generate_addition_labels() + data = {"data_id": self.data_id, "access_token": self.access_token, "data": []} + size = 0 + metrics_text = generate_latest(self.registry).decode("utf-8") + for family in text_string_to_metric_families(metrics_text): + for sample in family.samples: + labels = sample.labels or {} + # 补充维度 + labels.update(addition_labels) + data["data"].append( + { + "metrics": {sample.name: sample.value}, + "target": self.target, + "dimension": labels, + "timestamp": timestamp, + } + ) + size += 1 + if size % self.chunk_size == 0: + yield data + data = {"data_id": self.data_id, "access_token": self.access_token, "data": []} + + if data["data"]: + yield data diff --git a/apps/utils/cache.py b/apps/utils/cache.py index 81b1eb7b4..0ab5d736d 100644 --- a/apps/utils/cache.py +++ b/apps/utils/cache.py @@ -8,17 +8,61 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
""" +import json from functools import wraps from typing import Callable, Optional -import ujson as json -from django.core.cache import cache +from django.core.serializers.json import DjangoJSONEncoder +from django_redis.pool import ConnectionFactory as Factory +from django_redis.pool import SentinelConnectionFactory as SentinelFactory +from django_redis.serializers.base import BaseSerializer from apps.utils.md5 import count_md5 DEFAULT_CACHE_TIME = 60 * 15 +class JSONSerializer(BaseSerializer): + """ + 自定义JSON序列化器用于redis序列化 + django-redis的默认JSON序列化器假定`decode_responses`被禁用。 + """ + + def dumps(self, value): + return json.dumps(value, cls=DjangoJSONEncoder) + + def loads(self, value): + return json.loads(value) + + +class ConnectionFactoryMixin: + """自定义ConnectionFactory以注入decode_responses参数""" + + def make_connection_params(self, url): + kwargs = super().make_connection_params(url) + kwargs["decode_responses"] = True + return kwargs + + +class ConnectionFactory(ConnectionFactoryMixin, Factory): + pass + + +class SentinelConnectionFactory(ConnectionFactoryMixin, SentinelFactory): + pass + + +def django_cache_key_maker(key: str, key_prefix: str, version: str) -> str: + """ + 自定义缓存键生成函数 + :param key: + :param key_prefix: + :param version: + :return: + """ + return f"{key_prefix}:v2:{key}" + + def class_member_cache(name: Optional[str] = None): """ 类成员缓存 @@ -48,31 +92,3 @@ def format_cache_key(func: Callable, *args, **kwargs): """计算缓存的key,通过函数名加上参数md5值得到""" kwargs.update({"args": args}) return f"{func.__name__}_{count_md5(kwargs)}" - - -def func_cache_decorator(cache_time: int = DEFAULT_CACHE_TIME): - """ - 函数缓存装饰器 - :param cache_time: 缓存时间 - """ - - def decorate(func): - @wraps(func) - def wrapper(*args, **kwargs): - get_cache = kwargs.pop("get_cache", False) - cache_key = format_cache_key(func, *args, **kwargs) - func_result = None - if get_cache: - func_result = cache.get(cache_key, None) - - # 若无需从缓存中获取数据或者缓存中没有数据,则执行函数得到结果,并设置缓存 - if func_result is None: - func_result = func(*args, **kwargs) - cache.set(cache_key, json.dumps(func_result), cache_time) - else: - func_result = json.loads(func_result) - return func_result - - return wrapper - - return decorate diff --git a/apps/utils/exc.py b/apps/utils/exc.py index 4353ad23e..4899d1743 100644 --- a/apps/utils/exc.py +++ b/apps/utils/exc.py @@ -23,6 +23,7 @@ class ExceptionHandler: """ exc_handler: Callable[[Callable, Any, Tuple[Any], Dict[str, Any], Exception], Any] = None + success_handler: Callable[[Callable, Any, Tuple[Any], Dict[str, Any]], None] = None @staticmethod def default_exc_handler( @@ -40,6 +41,18 @@ def default_exc_handler( # 抛出原异常 raise + @staticmethod + def default_success_handler(wrapped: Callable, instance: Any, args: Tuple[Any], kwargs: Dict[str, Any]) -> None: + """ + 默认无异常处理器 + :param wrapped: 被装饰的函数或类方法 + :param instance: 参考 __init__ + :param args: 位置参数 + :param kwargs: 关键字参数 + :return: + """ + pass + async def wrapped_async_executor( self, wrapped: Callable, instance: Any, args: Tuple[Any], kwargs: Dict[str, Any] ) -> Any: @@ -52,12 +65,18 @@ async def wrapped_async_executor( :return: """ try: - return await wrapped(*args, **kwargs) + result = await wrapped(*args, **kwargs) except Exception as exc: if inspect.iscoroutinefunction(self.exc_handler): return await self.exc_handler(wrapped, instance, args, kwargs, exc) else: return await sync.sync_to_async(self.exc_handler)(wrapped, instance, args, kwargs, exc) + else: + if inspect.iscoroutinefunction(self.success_handler): + await self.success_handler(wrapped, instance, 
diff --git a/apps/utils/exc.py b/apps/utils/exc.py
index 4353ad23e..4899d1743 100644
--- a/apps/utils/exc.py
+++ b/apps/utils/exc.py
@@ -23,6 +23,7 @@ class ExceptionHandler:
     """
 
     exc_handler: Callable[[Callable, Any, Tuple[Any], Dict[str, Any], Exception], Any] = None
+    success_handler: Callable[[Callable, Any, Tuple[Any], Dict[str, Any]], None] = None
 
     @staticmethod
     def default_exc_handler(
@@ -40,6 +41,18 @@ def default_exc_handler(
         # 抛出原异常
         raise
 
+    @staticmethod
+    def default_success_handler(wrapped: Callable, instance: Any, args: Tuple[Any], kwargs: Dict[str, Any]) -> None:
+        """
+        Default no-op success handler
+        :param wrapped: the decorated function or method
+        :param instance: see __init__
+        :param args: positional arguments
+        :param kwargs: keyword arguments
+        :return:
+        """
+        pass
+
     async def wrapped_async_executor(
         self, wrapped: Callable, instance: Any, args: Tuple[Any], kwargs: Dict[str, Any]
     ) -> Any:
@@ -52,12 +65,18 @@ async def wrapped_async_executor(
         :return:
         """
         try:
-            return await wrapped(*args, **kwargs)
+            result = await wrapped(*args, **kwargs)
         except Exception as exc:
             if inspect.iscoroutinefunction(self.exc_handler):
                 return await self.exc_handler(wrapped, instance, args, kwargs, exc)
             else:
                 return await sync.sync_to_async(self.exc_handler)(wrapped, instance, args, kwargs, exc)
+        else:
+            if inspect.iscoroutinefunction(self.success_handler):
+                await self.success_handler(wrapped, instance, args, kwargs)
+            else:
+                await sync.sync_to_async(self.success_handler)(wrapped, instance, args, kwargs)
+            return result
 
     def wrapped_executor(self, wrapped: Callable, instance: Any, args: Tuple[Any], kwargs: Dict[str, Any]):
         """
@@ -74,12 +93,16 @@ def wrapped_executor(self, wrapped: Callable, instance: Any, args: Tuple[Any], k
             return self.exc_handler(wrapped, instance, args, kwargs, exc)
 
     def __init__(
-        self, exc_handler: Optional[Callable[[Callable, Any, Tuple[Any], Dict[str, Any], Exception], Any]] = None
+        self,
+        exc_handler: Optional[Callable[[Callable, Any, Tuple[Any], Dict[str, Any], Exception], Any]] = None,
+        success_handler: Optional[Callable[[Callable, Any, Tuple[Any], Dict[str, Any]], None]] = None,
     ):
         """
         :param exc_handler: 异常处理器
+        :param success_handler: handler invoked when no exception is raised
         """
         self.exc_handler = exc_handler or self.default_exc_handler
+        self.success_handler = success_handler or self.default_success_handler
 
     @wrapt.decorator
     def __call__(
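
For reference, ExceptionHandler is applied per the wrapt decorator convention; a minimal usage sketch (both handler implementations here are illustrative, not part of this change):

    from typing import Any, Callable, Dict, List, Optional, Tuple

    from apps.utils.exc import ExceptionHandler


    def swallow_and_log(
        wrapped: Callable, instance: Any, args: Tuple[Any], kwargs: Dict[str, Any], exc: Exception
    ) -> Optional[List]:
        # Illustrative: degrade to an empty result instead of re-raising.
        print(f"{wrapped.__name__} failed: {exc}")
        return []


    def report_success(wrapped: Callable, instance: Any, args: Tuple[Any], kwargs: Dict[str, Any]) -> None:
        # Illustrative: a natural place to bump a success counter.
        print(f"{wrapped.__name__} succeeded")


    @ExceptionHandler(exc_handler=swallow_and_log, success_handler=report_success)
    def query_sub_inst(sub_inst_id: int) -> List[int]:
        return [sub_inst_id]
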
"admin" else: req = get_request() auth_info = build_auth_args(req) @@ -87,4 +86,5 @@ def add_esb_info_before_request(params): params["uin"] = params["bk_username"] params["app_code"] = settings.APP_CODE params["app_secret"] = settings.SECRET_KEY + params.pop("_request", None) return params diff --git a/common/context_processors.py b/common/context_processors.py index d26741d7f..6a96afbba 100644 --- a/common/context_processors.py +++ b/common/context_processors.py @@ -16,9 +16,9 @@ from django.utils.translation import ugettext as _ from version_log.utils import get_latest_version +from apps.core.concurrent.cache import FuncCacheDecorator from apps.node_man import constants, models from apps.node_man.handlers.iam import IamHandler -from apps.utils.cache import func_cache_decorator from apps.utils.local import get_request_username """ @@ -45,7 +45,7 @@ def get_title(): return _("节点管理 | 腾讯蓝鲸智云") -@func_cache_decorator(cache_time=60 * constants.TimeUnit.SECOND) +@FuncCacheDecorator(cache_time=60 * constants.TimeUnit.SECOND) def get_ap_version_mutex(): return models.GlobalSettings.get_config( key=models.GlobalSettings.KeyEnum.ENABLE_AP_VERSION_MUTEX.value, diff --git a/config/default.py b/config/default.py index ada361c51..6159f1382 100644 --- a/config/default.py +++ b/config/default.py @@ -30,6 +30,7 @@ from pipeline.celery.settings import CELERY_ROUTES as PIPELINE_CELERY_ROUTES from .patchers import logging +from .patchers.monitor_reporter import monitor_report_config # =============================================================================== # 运行时,用于区分环境差异 @@ -482,23 +483,6 @@ }, ] -# ============================================================================== -# Cache -# ============================================================================== -CACHES.update( - { - "db": { - "BACKEND": "django.core.cache.backends.db.DatabaseCache", - "LOCATION": "django_cache", - "OPTIONS": {"MAX_ENTRIES": 10000, "CULL_FREQUENCY": 10}, - } - } -) - -CACHES["default"] = CACHES["db"] - -CACHE_KEY_TMPL = APP_CODE + ":scope:{scope}:body:{body}" - # ============================================================================== # 文件存储 # ============================================================================== @@ -636,6 +620,96 @@ def get_standard_redis_mode(cls, config_redis_mode: str, default: Optional[str] return cls.get_config_mode__redis_mode_map().get(config_redis_mode, default) +# ============================================================================== +# Cache +# ============================================================================== +CACHES.update( + { + "db": { + "BACKEND": "django.core.cache.backends.db.DatabaseCache", + "LOCATION": "django_cache", + "OPTIONS": {"MAX_ENTRIES": 10000, "CULL_FREQUENCY": 10}, + } + } +) + +CACHE_KEY_TMPL = APP_CODE + ":scope:{scope}:body:{body}" + +CONFIG_REDIS_MODE = os.getenv("REDIS_MODE", ConfigRedisMode.SENTINEL.value) +REDIS_MODE = RedisMode.get_standard_redis_mode(CONFIG_REDIS_MODE, default=RedisMode.REPLICATION.value) + +REDIS_PASSWORD = os.getenv("REDIS_PASSWORD") +REDIS_MASTER_NAME = os.getenv("REDIS_MASTER_NAME") +REDIS_SENTINEL_PASSWORD = os.getenv("REDIS_SENTINEL_PASSWORD") + +DJANGO_REDIS_COMMON_OPTIONS = { + "CLIENT_CLASS": "django_redis.client.DefaultClient", + "REDIS_CLIENT_CLASS": "redis.client.StrictRedis", + "SERIALIZER": "apps.utils.cache.JSONSerializer", +} + +if REDIS_MODE == "replication": + # redis 集群sentinel模式 + REDIS_HOST = os.getenv("REDIS_SENTINEL_HOST") + REDIS_PORT = os.getenv("REDIS_SENTINEL_PORT") + # # celery 
redbeat config + REDBEAT_REDIS_URL = "redis-sentinel://redis-sentinel:{port}/0".format(port=REDIS_PORT or 26379) + REDBEAT_REDIS_OPTIONS = { + "sentinels": [(REDIS_HOST, REDIS_PORT)], + "password": REDIS_PASSWORD, + "service_name": REDIS_MASTER_NAME or "mymaster", + "socket_timeout": 0.1, + "retry_period": 60, + "sentinel_kwargs": {"password": REDIS_SENTINEL_PASSWORD}, + } + DJANGO_REDIS_CONNECTION_FACTORY = "apps.utils.cache.SentinelConnectionFactory" + CACHES["redis"] = { + "BACKEND": "django_redis.cache.RedisCache", + "LOCATION": f"redis://{REDBEAT_REDIS_OPTIONS['service_name']}:{REDIS_PORT}/0", + "KEY_PREFIX": "nodeman", + "KEY_FUNCTION": "apps.utils.cache.django_cache_key_maker", + "OPTIONS": { + "PASSWORD": REDIS_PASSWORD, + "SENTINELS": REDBEAT_REDIS_OPTIONS["sentinels"], + "SENTINEL_KWARGS": REDBEAT_REDIS_OPTIONS["sentinel_kwargs"], + "CONNECTION_POOL_CLASS": "redis.sentinel.SentinelConnectionPool", + **DJANGO_REDIS_COMMON_OPTIONS, + }, + } +else: + REDIS_HOST = os.getenv("REDIS_HOST") + REDIS_PORT = os.getenv("REDIS_PORT") + # # celery redbeat config + REDBEAT_REDIS_URL = "redis://:{passwd}@{host}:{port}/0".format( + passwd=REDIS_PASSWORD, host=REDIS_HOST, port=REDIS_PORT or 6379 + ) + DJANGO_REDIS_CONNECTION_FACTORY = "apps.utils.cache.ConnectionFactory" + CACHES["redis"] = { + "BACKEND": "django_redis.cache.RedisCache", + "LOCATION": REDBEAT_REDIS_URL, + "KEY_PREFIX": "nodeman", + "KEY_FUNCTION": "apps.utils.cache.django_cache_key_maker", + "OPTIONS": {**DJANGO_REDIS_COMMON_OPTIONS}, + } + +REDIS = { + "host": REDIS_HOST, + "port": REDIS_PORT, + "password": REDIS_PASSWORD, + "service_name": REDIS_MASTER_NAME, + "sentinel_password": REDIS_SENTINEL_PASSWORD, + "mode": REDIS_MODE, # 哨兵模式,可选 single, cluster, replication +} + +CACHE_BACKEND = env.CACHE_BACKEND +CACHE_ENABLE_PREHEAT = env.CACHE_ENABLE_PREHEAT +CACHES["default"] = CACHES[CACHE_BACKEND] + + +# ============================================================================== +# 后台配置 +# ============================================================================== + if BK_BACKEND_CONFIG: DISABLED_APPS = [] @@ -672,44 +746,6 @@ def get_standard_redis_mode(cls, config_redis_mode: str, default: Optional[str] # BROKER_URL BROKER_URL = BK_NODEMAN_CELERY_RESULT_BACKEND_BROKER_URL - CONFIG_REDIS_MODE = os.getenv("REDIS_MODE", ConfigRedisMode.SENTINEL.value) - REDIS_MODE = RedisMode.get_standard_redis_mode(CONFIG_REDIS_MODE, default=RedisMode.REPLICATION.value) - - REDIS_PASSWORD = os.getenv("REDIS_PASSWORD") - REDIS_MASTER_NAME = os.getenv("REDIS_MASTER_NAME") - REDIS_SENTINEL_PASSWORD = os.getenv("REDIS_SENTINEL_PASSWORD") - - if REDIS_MODE == "replication": - # redis 集群sentinel模式 - REDIS_HOST = os.getenv("REDIS_SENTINEL_HOST") - REDIS_PORT = os.getenv("REDIS_SENTINEL_PORT") - # # celery redbeat config - REDBEAT_REDIS_URL = "redis-sentinel://redis-sentinel:{port}/0".format(port=REDIS_PORT or 26379) - REDBEAT_REDIS_OPTIONS = { - "sentinels": [(REDIS_HOST, REDIS_PORT)], - "password": REDIS_PASSWORD, - "service_name": REDIS_MASTER_NAME or "mymaster", - "socket_timeout": 0.1, - "retry_period": 60, - "sentinel_kwargs": {"password": REDIS_SENTINEL_PASSWORD}, - } - else: - REDIS_HOST = os.getenv("REDIS_HOST") - REDIS_PORT = os.getenv("REDIS_PORT") - # # celery redbeat config - REDBEAT_REDIS_URL = "redis://:{passwd}@{host}:{port}/0".format( - passwd=REDIS_PASSWORD, host=REDIS_HOST, port=REDIS_PORT or 6379 - ) - - REDIS = { - "host": REDIS_HOST, - "port": REDIS_PORT, - "password": REDIS_PASSWORD, - "service_name": REDIS_MASTER_NAME, - 
"sentinel_password": REDIS_SENTINEL_PASSWORD, - "mode": REDIS_MODE, # 哨兵模式,可选 single, cluster, replication - } - REDBEAT_KEY_PREFIX = "nodeman" @@ -772,6 +808,14 @@ def get_standard_redis_mode(cls, config_redis_mode: str, default: Optional[str] VERSION_LOG = {"MD_FILES_DIR": os.path.join(PROJECT_ROOT, "release"), "LANGUAGE_MAPPINGS": {"en": "en"}} +# ============================================================================== +# 可观测 +# ============================================================================== + +# 自定义上报监控配置 +if env.BKAPP_MONITOR_REPORTER_ENABLE: + monitor_report_config() + # remove disabled apps if locals().get("DISABLED_APPS"): INSTALLED_APPS = locals().get("INSTALLED_APPS", []) diff --git a/config/patchers/__init__.py b/config/patchers/__init__.py new file mode 100644 index 000000000..29ed269e0 --- /dev/null +++ b/config/patchers/__init__.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +""" +TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available. +Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved. +Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at https://opensource.org/licenses/MIT +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. +""" diff --git a/config/patchers/monitor_reporter.py b/config/patchers/monitor_reporter.py new file mode 100644 index 000000000..f454f8359 --- /dev/null +++ b/config/patchers/monitor_reporter.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- +""" +TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available. +Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved. +Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at https://opensource.org/licenses/MIT +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. 
+""" +import sys + +import env + + +def monitor_report_config(): + boot_cmd = " ".join(sys.argv) + print(boot_cmd) + if "celery -A apps.backend worker" in boot_cmd: + try: + q_conf_index = sys.argv.index("-Q") + except ValueError as e: + sys.stdout.write( + "[!]can't found -Q option in command: %s, skip celery monitor report config: %s\n" % (boot_cmd, e) + ) + return + + try: + queues = sys.argv[q_conf_index + 1] + except IndexError as e: + sys.stdout.write( + "[!]can't found -Q value in command: %s, skip celery monitor report config: %s\n" % (boot_cmd, e) + ) + return + + # 只对存在以下队列的情况进行上报 + monitor_queues = ["backend", "backend_additional_task", "default", "service_schedule", "pipeline_priority"] + if not any([monitor_queue in queues for monitor_queue in monitor_queues]): + sys.stdout.write("[!]can't found er queue in command: %s, skip celery monitor report config\n" % boot_cmd) + return + + proc_type = "celery" + try: + n_conf_index = sys.argv.index("-n") + except ValueError as e: + # 没有 get 到,说明是单 workers 场景 + instance_tmpl = "celery@%h-%i" + sys.stdout.write("[!]can't found -n option in command: %s, use default -> celery@xxx: %s\n" % (boot_cmd, e)) + else: + # %i 区分单 workers 多进程的情况 + # -n 区分单主机多 workers 的情况 + instance_tmpl = sys.argv[n_conf_index + 1] + "-%i" + + from bk_monitor_report.contrib.celery import MonitorReportStep # noqa + + from apps.backend.celery import app as celery_app # noqa + from apps.prometheus.reporter import MonitorReporter # noqa + + reporter = MonitorReporter( + data_id=env.BKAPP_MONITOR_REPORTER_DATA_ID, # 监控 Data ID + access_token=env.BKAPP_MONITOR_REPORTER_ACCESS_TOKEN, # 自定义上报 Token + target=env.BKAPP_MONITOR_REPORTER_TARGET, # 上报唯一标志符 + url=env.BKAPP_MONITOR_REPORTER_URL, # 上报地址 + report_interval=env.BKAPP_MONITOR_REPORTER_REPORT_INTERVAL, # 上报周期,秒 + chunk_size=env.BKAPP_MONITOR_REPORTER_CHUNK_SIZE, # 上报指标分块大小 + proc_type=proc_type, + instance_tmpl=instance_tmpl, + ) + + # 针对多进程worker需要做特殊梳理,在worker进程中进行reporter start + prefork_config_check = [("-P", "-P prefork"), ("--pool", "--pool=prefork")] + if any([config[0] in boot_cmd and config[1] not in boot_cmd for config in prefork_config_check]): + MonitorReportStep.setup_reporter(reporter) + celery_app.steps["worker"].add(MonitorReportStep) + else: + # prefork 场景下,每个进程都会有一个 Reporter + from celery.signals import worker_process_init # noqa + + worker_process_init.connect(reporter.start, weak=False) + + sys.stdout.write( + "[Monitor reporter] init success, proc_type -> %s, instance_tmpl -> %s \n" % (proc_type, instance_tmpl) + ) + + sys.stdout.write("[Monitor reporter] init success\n") + + else: + from apps.prometheus.reporter import MonitorReporter # noqa + + match_proc_name = None + proc_names = [ + "gunicorn", + "runserver", + "sync_host_event", + "sync_host_relation_event", + "sync_process_event", + "apply_resource_watched_events", + ] + + for proc_name in proc_names: + if proc_name in boot_cmd: + match_proc_name = proc_name + break + + if not match_proc_name: + sys.stdout.write("[!]unknown boot cmd: %s, skip monitor report config\n" % boot_cmd) + return + else: + sys.stdout.write("[Monitor reporter] match_proc_name %s \n" % match_proc_name) + + if match_proc_name in ["gunicorn", "runserver"]: + # gunicorn -w 参数会派生出 n 个进程,每个进程都有一个 Reporter + # Worker 模型:https://docs.gunicorn.org/en/latest/design.html?highlight=gthread#server-model + proc_type = "web" + instance_tmpl = str(match_proc_name) + "@%h-%P" + else: + # 单进程运行,无需 pid + proc_type = "sync" + instance_tmpl = str(match_proc_name) + "@%h" + + reporter = 
diff --git a/env/__init__.py b/env/__init__.py
index cbe6e1986..be69654fb 100644
--- a/env/__init__.py
+++ b/env/__init__.py
@@ -22,6 +22,8 @@
     "BKPAAS_BK_CRYPTO_TYPE",
     "LOG_TYPE",
     "LOG_LEVEL",
+    "CACHE_BACKEND",
+    "CACHE_ENABLE_PREHEAT",
     "BK_LOG_DIR",
     "GSE_VERSION",
     "BKAPP_ENABLE_OTEL_TRACE",
@@ -29,6 +31,13 @@
     "BKAPP_OTEL_SAMPLER",
     "BKAPP_OTEL_BK_DATA_TOKEN",
     "BKAPP_OTEL_GRPC_URL",
+    "BKAPP_MONITOR_REPORTER_ENABLE",
+    "BKAPP_MONITOR_REPORTER_DATA_ID",
+    "BKAPP_MONITOR_REPORTER_ACCESS_TOKEN",
+    "BKAPP_MONITOR_REPORTER_TARGET",
+    "BKAPP_MONITOR_REPORTER_URL",
+    "BKAPP_MONITOR_REPORTER_REPORT_INTERVAL",
+    "BKAPP_MONITOR_REPORTER_CHUNK_SIZE",
     "BKAPP_NAV_OPEN_SOURCE_URL",
     "BKAPP_NAV_HELPER_URL",
     "BK_CC_HOST",
@@ -77,6 +86,14 @@
 
 BK_LOG_DIR = get_type_env(key="BK_LOG_DIR", default="./../bk_nodeman/logs", _type=str)
 
+# ===============================================================================
+# Cache
+# ===============================================================================
+# Cache backend, defaults to `db`; options: `db`, `redis` - only takes effect when `REDIS_HOST` is set, otherwise `db` is still used
+CACHE_BACKEND = get_type_env(key="CACHE_BACKEND", default=constants.CacheBackend.DB.value, _type=str)
+# Whether to preheat critical caches; usually enabled before switching backends - make sure the SaaS module is also configured with Redis
+CACHE_ENABLE_PREHEAT = get_type_env(key="CACHE_ENABLE_PREHEAT", default=False, _type=bool)
+
 # ===============================================================================
 # 蓝鲸管控平台
 # ===============================================================================
@@ -113,6 +130,24 @@
 BKAPP_OTEL_BK_DATA_TOKEN = get_type_env(key="BKAPP_OTEL_BK_DATA_TOKEN", _type=str)
 BKAPP_OTEL_GRPC_URL = get_type_env(key="BKAPP_OTEL_GRPC_URL", _type=str)
 
+# Whether to enable custom reporting
+BKAPP_MONITOR_REPORTER_ENABLE = get_type_env(key="BKAPP_MONITOR_REPORTER_ENABLE", default=False, _type=bool)
+# Monitor data ID
+BKAPP_MONITOR_REPORTER_DATA_ID = get_type_env(key="BKAPP_MONITOR_REPORTER_DATA_ID", default=0, _type=int)
+# Custom report token
+BKAPP_MONITOR_REPORTER_ACCESS_TOKEN = get_type_env(key="BKAPP_MONITOR_REPORTER_ACCESS_TOKEN", default="", _type=str)
+# Unique report target identifier
+BKAPP_MONITOR_REPORTER_TARGET = get_type_env(key="BKAPP_MONITOR_REPORTER_TARGET", default="prod", _type=str)
+# Report URL
+BKAPP_MONITOR_REPORTER_URL = get_type_env(key="BKAPP_MONITOR_REPORTER_URL", default="", _type=str)
+# Report interval
+BKAPP_MONITOR_REPORTER_REPORT_INTERVAL = get_type_env(
+    key="BKAPP_MONITOR_REPORTER_REPORT_INTERVAL", default=10, _type=int
+)
+# Chunk size
+BKAPP_MONITOR_REPORTER_CHUNK_SIZE = get_type_env(key="BKAPP_MONITOR_REPORTER_CHUNK_SIZE", default=200, _type=int)
+
+
 # ===============================================================================
 # 第三方依赖
 # ===============================================================================
diff --git a/env/constants.py b/env/constants.py
index 547f1a434..c0cf19e8b 100644
--- a/env/constants.py
+++ b/env/constants.py
@@ -51,3 +51,12 @@ class BkCryptoType(EnhanceEnum):
 
     @classmethod
     def _get_member__alias_map(cls) -> Dict[Enum, str]:
         return 
{cls.SHANGMI: "国密算法", cls.CLASSIC: "经典算法"}
+
+
+class CacheBackend(EnhanceEnum):
+    DB = "db"
+    REDIS = "redis"
+
+    @classmethod
+    def _get_member__alias_map(cls) -> Dict[Enum, str]:
+        return {cls.DB: "MySQL", cls.REDIS: "Redis(如果 Redis 未配置,使用 MySQL)"}
diff --git a/script_tools/setup_pagent2.py b/script_tools/setup_pagent2.py
index efa7c2a8e..4ba2a094b 100644
--- a/script_tools/setup_pagent2.py
+++ b/script_tools/setup_pagent2.py
@@ -16,6 +16,7 @@
 import sys
 import time
 import traceback
+import typing
 from functools import partial
 from io import StringIO
 from pathlib import Path
@@ -132,6 +133,7 @@ def emit(self, record):
             "timestamp": round(time.time()),
             "level": record.levelname,
             "step": record.step,
+            "metrics": record.metrics,
             "log": f"({status}) {record.message}",
             "status": status,
             "prefix": "[proxy]",
@@ -155,13 +157,21 @@ def _log(self, level, msg, *args, extra=None, **kwargs):
 
         step: str = extra.pop("step", "N/A")
         is_report: str = extra.pop("is_report", True)
-        kwargs = {"step": step, "is_report": is_report}
+        metrics: typing.Dict[str, typing.Any] = extra.pop("metrics", {})
+        kwargs = {"step": step, "is_report": is_report, "metrics": metrics}
         kwargs.update(extra)
 
         super()._log(level, msg, args, extra=kwargs)
 
-    def logging(self, step: str, msg: str, level: int = logging.INFO, is_report: bool = True):
-        self._log(level, msg, extra={"step": step, "is_report": is_report})
+    def logging(
+        self,
+        step: str,
+        msg: str,
+        metrics: typing.Optional[typing.Dict[str, typing.Any]] = None,
+        level: int = logging.INFO,
+        is_report: bool = True,
+    ):
+        self._log(level, msg, extra={"step": step, "is_report": is_report, "metrics": metrics or {}})
 
 
 console_handler = logging.StreamHandler()
@@ -291,10 +301,11 @@ def execute_batch_solution(
 
         try:
             res = execute_cmd(cmd, login_ip, account, identity, is_no_output=content["name"] == "run_cmd")
-        except Exception as exc:
+        except Exception:
             # 过程中只要有一条命令执行失败,视为执行方案失败
-            logger.logging("execute_batch_solution", f"execute {cmd} failed, err_msg -> {exc}", level=logging.ERROR)
-            return
+            logger.logging("execute_batch_solution", f"execute {cmd} failed", level=logging.WARNING)
+            # Propagate the exception to the outermost layer
+            raise
 
         print(res)
 
@@ -340,36 +351,6 @@ def execute_shell_solution(
             raise ProcessError(f"Command returned non-zero: {run_output}")
         logger.logging("send_cmd", str(run_output), is_report=False)
 
-    # cmds: List[str] = []
-    # shell_pkg: str = ("bash", "ksh")[os_type == "aix"]
-    # for step in execution_solution["steps"]:
-    #     # 暂不支持 dependencies 等其他步骤类型
-    #     if step["type"] == "commands":
-    #         for content in step["contents"]:
-    #             cmds.append(content["text"])
-    #
-    # # 串联执行
-    # command: str = "{shell_pkg} -c 'exec 2>&1 && {multi_cmds_str} '\n".format(
-    #     shell_pkg=shell_pkg, multi_cmds_str=" && ".join(cmds)
-    # )
-    # # 根据用户名判断是否采用sudo
-    # if account not in ["root", "Administrator", "administrator"]:
-    #     command = "sudo %s" % command
-    #
-    # with ParamikoConn(
-    #     host=login_ip,
-    #     port=port,
-    #     username=account,
-    #     password=identity,
-    #     client_key_strings=client_key_strings,
-    #     connect_timeout=15,
-    # ) as conn:
-    #     logger.logging("send_cmd", command, is_report=False)
-    #     run_output: RunOutput = conn.run(command, check=True, timeout=60)
-    #     if run_output.exit_status != 0:
-    #         raise ProcessError(f"Command returned non-zero: {run_output}")
-    #     logger.logging("send_cmd", str(run_output), is_report=False)
-
 
 def is_port_listen(ip: str, port: int) -> bool:
     s = socket.socket((socket.AF_INET, socket.AF_INET6)[is_v6(ip)], socket.SOCK_STREAM)
@@ -441,6 +422,26 @@ def download_file(url: str, 
dest_dir: str): raise DownloadFileError(err_msg) from exc +def use_shell() -> bool: + os_type: str = args.host_os_type + port = int(args.host_port) + if os_type not in ["windows"] or (os_type in ["windows"] and port != 445): + return True + else: + return False + + +def get_common_labels() -> typing.Dict[str, typing.Any]: + os_type: str = args.host_os_type or "unknown" + return { + "method": ("proxy_wmiexe", "proxy_ssh")[use_shell()], + "username": args.host_account, + "port": int(args.host_port), + "auth_type": args.host_auth_type, + "os_type": os_type.upper(), + } + + def main() -> None: login_ip = args.host_login_ip @@ -479,32 +480,39 @@ def main() -> None: execution_solution=host_solution, ) + app_core_remote_connects_total_labels = {**get_common_labels(), "status": "success"} + logger.logging( + "metrics", + f"app_core_remote_connects_total_labels -> {app_core_remote_connects_total_labels}", + metrics={"name": "app_core_remote_connects_total", "labels": app_core_remote_connects_total_labels}, + ) + BytesOrStr = Union[str, bytes] class RemoteBaseException(Exception): - pass + code = 0 class RunCmdError(RemoteBaseException): - pass + code = 1 class PermissionDeniedError(RemoteBaseException): - pass + code = 2 class DisconnectError(RemoteBaseException): - pass + code = 3 class RemoteTimeoutError(RemoteBaseException): - pass + code = 4 class ProcessError(RemoteBaseException): - pass + code = 5 class RunOutput: @@ -736,11 +744,69 @@ def _run( if __name__ == "__main__": + _paramiko_version: str = "-" + try: + _paramiko_version = str(paramiko.__version__) + except Exception: + logger.logging("proxy", "Failed to get paramiko version", is_report=False, level=logging.WARNING) + + _app_core_remote_proxy_info_labels = { + "proxy_name": socket.gethostname(), + "proxy_ip": args.lan_eth_ip, + "bk_cloud_id": args.host_cloud, + "paramiko_version": _paramiko_version, + } + logger.logging( + "metrics", + f"app_core_remote_proxy_info_labels -> {_app_core_remote_proxy_info_labels}", + metrics={"name": "app_core_remote_proxy_info", "labels": _app_core_remote_proxy_info_labels}, + ) + logger.logging("proxy", "setup_pagent2 will start running now.", is_report=False) + _start = time.perf_counter() + try: main() except Exception as _e: + _app_core_remote_connects_total_labels = {**get_common_labels(), "status": "failed"} + logger.logging( + "metrics", + f"app_core_remote_connects_total_labels -> {_app_core_remote_connects_total_labels}", + metrics={"name": "app_core_remote_connects_total", "labels": _app_core_remote_connects_total_labels}, + ) + + if isinstance(_e, RemoteBaseException): + exc_type = "app" + exc_code = str(_e.code) + else: + exc_type = "unknown" + exc_code = _e.__class__.__name__ + + _app_core_remote_connect_exceptions_total_labels = { + **get_common_labels(), + "exc_type": exc_type, + "exc_code": exc_code, + } + logger.logging( + "metrics", + f"app_core_remote_connect_exceptions_total_labels -> {_app_core_remote_connect_exceptions_total_labels}", + metrics={ + "name": "app_core_remote_connect_exceptions_total", + "labels": _app_core_remote_connect_exceptions_total_labels, + }, + ) logger.logging("proxy_fail", str(_e), level=logging.ERROR) logger.logging("proxy_fail", traceback.format_exc(), level=logging.ERROR, is_report=False) else: - logger.logging("proxy", "setup_pagent2 succeeded.", is_report=False) + _app_core_remote_execute_duration_seconds_labels = {"method": ("proxy_wmiexe", "proxy_ssh")[use_shell()]} + cost_time = time.perf_counter() - _start + logger.logging( + "metrics", + 
f"app_core_remote_execute_duration_seconds_labels -> {_app_core_remote_execute_duration_seconds_labels}", + metrics={ + "name": "app_core_remote_execute_duration_seconds", + "labels": _app_core_remote_execute_duration_seconds_labels, + "data": {"cost_time": cost_time}, + }, + ) + logger.logging("proxy", f"setup_pagent2 succeeded: cost_time -> {cost_time}", is_report=False) diff --git a/support-files/kubernetes/helm/bk-nodeman/templates/_helpers.tpl b/support-files/kubernetes/helm/bk-nodeman/templates/_helpers.tpl index 08ea39f76..47158dc67 100644 --- a/support-files/kubernetes/helm/bk-nodeman/templates/_helpers.tpl +++ b/support-files/kubernetes/helm/bk-nodeman/templates/_helpers.tpl @@ -240,6 +240,8 @@ envFrom: name: "{{ printf "%s-%s" (include "bk-nodeman.fullname" .) "db-env-configmap" }}" - configMapRef: name: "{{ printf "%s-%s" (include "bk-nodeman.fullname" .) "env-configmap" }}" + - configMapRef: + name: "{{ printf "%s-%s" (include "bk-nodeman.fullname" .) "redis-env-configmap" }}" {{- if .Values.extraEnvVarsCM }} - configMapRef: name: "{{ .Values.extraEnvVarsCM }}" diff --git a/support-files/kubernetes/helm/bk-nodeman/templates/configmaps/env-configmap.yaml b/support-files/kubernetes/helm/bk-nodeman/templates/configmaps/env-configmap.yaml index 133240c75..2e405629f 100644 --- a/support-files/kubernetes/helm/bk-nodeman/templates/configmaps/env-configmap.yaml +++ b/support-files/kubernetes/helm/bk-nodeman/templates/configmaps/env-configmap.yaml @@ -28,6 +28,9 @@ data: LOG_LEVEL: "{{ .Values.config.logLevel }}" BK_LOG_DIR: "{{ .Values.config.bkLogDir }}" + CACHE_BACKEND: "{{ .Values.config.cacheBackend }}" + CACHE_ENABLE_PREHEAT: "{{ .Values.config.cacheEnablePreheat }}" + BK_CMDB_RESOURCE_POOL_BIZ_ID: "{{ .Values.config.bkCmdbResourcePoolBizId }}" DEFAULT_SUPPLIER_ACCOUNT: "{{ .Values.config.defaultSupplierAccount }}" @@ -99,6 +102,25 @@ data: BKAPP_OTEL_GRPC_URL: "{{ .Values.config.bkAppOtelGrpcUrl }}" {{- end }} + {{- if .Values.config.bkAppEnableOtelTrace }} + BKAPP_ENABLE_OTEL_TRACE: "{{ .Values.config.bkAppEnableOtelTrace }}" + BKAPP_OTEL_SERVICE_NAME: {{ printf "%s-%s" (include "bk-nodeman.fullname" .) 
"saas" }} + BKAPP_OTEL_INSTRUMENT_DB_API: "{{ .Values.config.bkAppOtelInstrumentDbApi }}" + BKAPP_OTEL_SAMPLER: "{{ .Values.config.bkAppOtelSampler }}" + BKAPP_OTEL_BK_DATA_TOKEN: "{{ .Values.config.bkAppOtelBkDataToken }}" + BKAPP_OTEL_GRPC_URL: "{{ .Values.config.bkAppOtelGrpcUrl }}" + {{- end }} + + {{- if .Values.config.bkAppMonitorReporterEnable }} + BKAPP_MONITOR_REPORTER_ENABLE: "{{ .Values.config.bkAppMonitorReporterEnable }}" + BKAPP_MONITOR_REPORTER_DATA_ID: "{{ int .Values.config.bkAppMonitorReporterDataId }}" + BKAPP_MONITOR_REPORTER_ACCESS_TOKEN: "{{ .Values.config.bkAppMonitorReporterAccessToken }}" + BKAPP_MONITOR_REPORTER_TARGET: "{{ .Values.config.bkAppMonitorReporterTarget }}" + BKAPP_MONITOR_REPORTER_URL: "{{ .Values.config.bkAppMonitorReporterUrl }}" + BKAPP_MONITOR_REPORTER_REPORT_INTERVAL: "{{ .Values.config.bkAppMonitorReporterInterval }}" + BKAPP_MONITOR_REPORTER_CHUNK_SIZE: "{{ .Values.config.bkAppMonitorReporterChunkSize }}" + {{- end }} + CONCURRENT_NUMBER: "{{ .Values.config.concurrentNumber }}" SAAS_API_PORT: "{{ .Values.saas.api.service.port }}" diff --git a/support-files/kubernetes/helm/bk-nodeman/values.yaml b/support-files/kubernetes/helm/bk-nodeman/values.yaml index 97ad05391..5cfa1c28e 100644 --- a/support-files/kubernetes/helm/bk-nodeman/values.yaml +++ b/support-files/kubernetes/helm/bk-nodeman/values.yaml @@ -326,6 +326,12 @@ config: ## 日志所在目录 bkLogDir: "/data/bkee/logs/bknodeman" + ## 缓存 + ## 缓存后端,默认值为 `db`,可选项:`db`、`redis` - 仅存在 `REDIS_HOST` 变量时生效,否则仍默认使用 `db` + cacheBackend: "db" + ## 是否预热关键缓存,一般在切换缓存前需要开启 + cacheEnablePreheat: false + ## 蓝鲸配置平台 ## ## 资源池 ID @@ -430,6 +436,21 @@ config: bkAppOtelBkDataToken: "" bkAppOtelGrpcUrl: "" + ## 是否启用自定义上报 + bkAppMonitorReporterEnable: false + ## 监控 Data ID + bkAppMonitorReporterDataId: 0 + ## 自定义上报 Token + bkAppMonitorReporterAccessToken: "" + ## 上报唯一标志符 + bkAppMonitorReporterTarget: "prod" + ## 上报地址 + bkAppMonitorReporterUrl: "" + ## 上报间隔 + bkAppMonitorReporterInterval: 10 + ## 块大小 + bkAppMonitorReporterChunkSize: 200 + ## 导航栏开源社区地址 bkAppNavOpenSourceUrl: "https://github.com/TencentBlueKing/bk-nodeman" ## 导航栏技术支持地址 diff --git a/support-files/templates/#etc#supervisor-bknodeman-nodeman.conf b/support-files/templates/#etc#supervisor-bknodeman-nodeman.conf index a0887b370..c29505063 100644 --- a/support-files/templates/#etc#supervisor-bknodeman-nodeman.conf +++ b/support-files/templates/#etc#supervisor-bknodeman-nodeman.conf @@ -79,7 +79,7 @@ redirect_stderr=true directory=__BK_HOME__/bknodeman/nodeman [program:nodeman_celery_default] -command=/bin/bash -c "sleep 10 && source bin/environ.sh && exec celery -A apps.backend worker -Q default --autoscale=8,2 --maxtasksperchild=50 -O fair --time-limit=1800" +command=/bin/bash -c "sleep 10 && source bin/environ.sh && exec celery -A apps.backend worker -Q default -n default_%(process_num)02d@%%h --autoscale=8,2 --maxtasksperchild=50 -O fair --time-limit=1800" numprocs=1 process_name = %(program_name)s_%(process_num)02d autostart=true @@ -105,7 +105,7 @@ redirect_stderr=true directory=__BK_HOME__/bknodeman/nodeman [program:nodeman_celery_backend_additional] -command=/bin/bash -c "sleep 10 && source bin/environ.sh && exec celery -A apps.backend worker -Q backend_additional_task -c 10 -O fair --time-limit=1800 --maxtasksperchild=50" +command=/bin/bash -c "sleep 10 && source bin/environ.sh && exec celery -A apps.backend worker -Q backend_additional_task -n baworker_%(process_num)02d@%%h -c 10 -O fair --time-limit=1800 --maxtasksperchild=50" numprocs=1 process_name = 
%(program_name)s_%(process_num)02d autostart=true diff --git a/support-files/templates/nodeman#bin#environ.sh b/support-files/templates/nodeman#bin#environ.sh index 6a862473f..849dd3199 100755 --- a/support-files/templates/nodeman#bin#environ.sh +++ b/support-files/templates/nodeman#bin#environ.sh @@ -59,6 +59,10 @@ export MYSQL_PASSWORD="__BK_NODEMAN_MYSQL_PASSWORD__" export MYSQL_HOST="__BK_NODEMAN_MYSQL_HOST__" export MYSQL_PORT="__BK_NODEMAN_MYSQL_PORT__" +# 缓存 +export CACHE_BACKEND="__BK_NODEMAN_CACHE_BACKEND__" +export CACHE_ENABLE_PREHEAT="__BK_NODEMAN_CACHE_ENABLE_PREHEAT__" + # Redis # standalone: 单实例 # sentinel: 哨兵