From 8aa34bbd8938096a038b2604a6927a1b3a39f3f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B0=AD=E5=BD=AA?=
Date: Sat, 2 Nov 2024 10:20:39 +0800
Subject: [PATCH] docs: update docs based on version 1.60.0

---
 datakit.template.yaml                      |  2 +-
 internal/datakit/dkconf.go                 |  2 +-
 internal/export/doc/en/changelog.md        | 40 ++++++++++
 internal/export/doc/en/datakit-conf.md     | 85 ++++++++++++++--------
 internal/export/doc/en/datakit-install.md  | 30 +++++++-
 internal/export/doc/en/datakit-metrics.md  |  9 ++-
 internal/export/doc/en/datakit-monitor.md  | 69 ++++++++++--------
 internal/export/doc/en/datakit-operator.md |  8 +-
 internal/export/doc/en/development.md      |  1 -
 internal/export/doc/zh/changelog.md        |  7 +-
 internal/export/doc/zh/datakit-conf.md     | 42 ++---------
 internal/export/doc/zh/datakit-install.md  | 30 +++++++-
 internal/export/doc/zh/datakit-metrics.md  | 15 ++--
 internal/export/doc/zh/datakit-monitor.md  | 60 ++++++++-------
 internal/export/doc/zh/datakit-operator.md | 16 ++--
 internal/export/doc/zh/development.md      |  1 -
 internal/export/non_input_docs.go          | 11 +--
 17 files changed, 265 insertions(+), 163 deletions(-)

diff --git a/datakit.template.yaml b/datakit.template.yaml
index 7a66dc3953..cde9ab7fdb 100644
--- a/datakit.template.yaml
+++ b/datakit.template.yaml
@@ -194,7 +194,7 @@ spec:
         #   value: iploc
         # # ---iploc-end
         image: pubrepo.guance.com/datakit/datakit:{{.Version}}
-        imagePullPolicy: Always
+        imagePullPolicy: IfNotPresent
         name: datakit
         ports:
         - containerPort: 9529
diff --git a/internal/datakit/dkconf.go b/internal/datakit/dkconf.go
index 29734a0455..f053033a01 100644
--- a/internal/datakit/dkconf.go
+++ b/internal/datakit/dkconf.go
@@ -153,7 +153,7 @@ ulimit = 64000
   # Datakit will upload data points if cached(in memory) points
   # reached(>=) the max_cache_count or the flush_interval triggered.
   max_cache_count = 1000
-  flush_workers = 0 # default to (cpu_core * 2 + 1)
+  flush_workers = 0 # default to (cpu_core * 2)
   flush_interval = "10s"

   # Queue size of feed.
diff --git a/internal/export/doc/en/changelog.md b/internal/export/doc/en/changelog.md
index 1c65221e95..4cf5445391 100644
--- a/internal/export/doc/en/changelog.md
+++ b/internal/export/doc/en/changelog.md
@@ -1,5 +1,45 @@
 # Changelog

## 1.60.0 (2024/10/18) {#cl-1.60.0}

This release is an iterative update, with the following main changes:

### New Features {#cl-1.60.0-new}

- Added a new Prometheus v2 collector, whose parsing performance is significantly improved over the v1 version (#2427).
- [APM Automatic Instrumentation](datakit-install.md#apm-instrumentation): during Datakit installation, setting a specific flag enables APM to be injected automatically into the corresponding Java/Python applications once they are restarted (#2139).
- RUM Session Replay now supports the blacklist rules configured in the Guance console (#2424).
- The Datakit [`/v1/write/:category` interface](apis.md#api-v1-write) now supports multiple compression formats (HTTP `Content-Encoding`) (#2368).

### Bug Fixes {#cl-1.60.0-fix}

- Fixed a crash issue in the HTTP service caused by the Gin timeout middleware (#2423).
- Fixed a timestamp unit issue in the New Relic collector (#2417).
- Fixed a crash issue caused by the Pipeline function `point_window()` (#2416).

### Performance Improvements {#cl-1.60.0-opt}

- Many performance optimizations have been made in this version (#2414):

  - The experimental feature point-pool is now enabled by default.
+ - Improved Prometheus exporter data collection performance and reduced memory consumption. + - Enabled [HTTP API rate limiting](datakit-conf.md#set-http-api-limit) by default to prevent sudden traffic from consuming too much memory. + - Added a [WAL disk queue](datakit-conf.md#dataway-wal) to handle memory occupation that may be caused by upload blocking. The new disk queue *will cache data that fails to upload by default*. + - Refined Datakit's own memory usage metrics, adding memory occupation across multiple dimensions. + - Added a WAL panel display in the `datakit monitor -V` command. + - Improved KubernetesPrometheus collection performance (#2426). + - Improved container log collection performance (#2425). + - Removed debug-related fields within logging to optimize network traffic and storage. + +### Compatibility Adjustments {#cl-1.60.0-brk} + +- Due to some performance adjustments, there are compatibility differences in the following areas: + + - The maximum size of a single HTTP body upload has been adjusted to 1MB. At the same time, the maximum size of a single log has also been reduced to 1MB. This adjustment is to reduce the amount of pooled memory used by Datakit under low load conditions. + - The original failed retry disk queue has been deprecated (this feature was not enabled by default). The new version will enable a new failed retry disk queue by default. + +--- + ## 1.39.0 (2024/09/25) {#cl-1.39.0} This release is an iterative update with the following changes: diff --git a/internal/export/doc/en/datakit-conf.md b/internal/export/doc/en/datakit-conf.md index aab31ab4c6..21faeeddbd 100644 --- a/internal/export/doc/en/datakit-conf.md +++ b/internal/export/doc/en/datakit-conf.md @@ -202,10 +202,10 @@ We can also enable `content_encoding = "v2"`([:octicons-tag-24: Version-1.32.0]( ```toml [io] - feed_chan_size = 4096 # length of data processing queue (a job typically has multiple points) - max_cache_count = 512 # data bulk sending points, beyond which sending is triggered in the cache + feed_chan_size = 1 # length of compact queue + max_cache_count = 1000 # data bulk sending points, beyond which sending is triggered in the cache flush_interval = "10s" # threshold for sending data at least once every 10s - flush_workers = 8 # upload workers, default CPU-core * 2 + 1 + flush_workers = 0 # upload workers, default is the limited CPU-core * 2 ``` See [corresponding description in k8s](datakit-daemonset-deploy.md#env-io) for blocking mode @@ -215,34 +215,6 @@ We can also enable `content_encoding = "v2"`([:octicons-tag-24: Version-1.32.0]( See [here](datakit-daemonset-deploy.md#env-io) -#### IO Disk Cache {#io-disk-cache} - -[:octicons-tag-24: Version-1.5.8](changelog.md#cl-1.5.8) · [:octicons-beaker-24: Experimental](index.md#experimental) - -When DataKit fails to send data, disk cache can be turned on in order not to lose critical data. The purpose of disk cache is to temporarily store the data on disk when upload Dataway failed, and then fetch data from disk and upload again later. - - -=== "`datakit.conf`" - - ```toml - [io] - enable_cache = true # turn on disk caching - cache_all = false # cache all categories(default metric,object and dial-testing data point not cached) - cache_max_size_gb = 5 # specify a disk size of 5GB - ``` - -=== "Kubernetes" - - See [here](datakit-daemonset-deploy.md#env-io) - - ---- - -???+ attention - - The `cache_max_size_gb` used to control max disk capacity of each data category. 
For there are 10 categories, if each on configured with 5GB, the max disk usage may reach to 50GB. - - ### Resource Limit {#resource-limit} Because the amount of data processed on the DataKit cannot be estimated, if the resources consumed by the DataKit are not physically limited, it may consume a large amount of resources of the node where it is located. Here we can limit it with the help of cgroup in Linux or job object in Windows, which has the following configuration in *datakit.conf*: @@ -319,6 +291,57 @@ Dataway got following settings to be configured: See [here](datakit-daemonset-deploy.md#env-dataway) for configuration under Kubernetes. +#### WAL Queue Configuration {#dataway-wal} + +[:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) + +In the `[dataway.wal]` section, we can adjust the configuration of the WAL queue: + +```toml + [dataway.wal] + max_capacity_gb = 2.0 # 2GB reserved disk space for each category (M/L/O/T/...) + workers = 0 # flush workers on WAL (default to CPU limited cores) + mem_cap = 0 # in-memory queue capacity (default to CPU limited cores) + fail_cache_clean_interval = "30s" # duration for cleaning failed uploaded data +``` + +The disk files are located in the *cache/dw-wal* directory under the Datakit installation directory: + +```shell +/usr/local/datakit/cache/dw-wal/ +├── custom_object +│   └── data +├── dialtesting +│   └── data +├── dynamic_dw +│   └── data +├── fc +│   └── data +├── keyevent +│   └── data +├── logging +│   ├── data +│   └── data.00000000000000000000000000000000 +├── metric +│   └── data +├── network +│   └── data +├── object +│   └── data +├── profiling +│   └── data +├── rum +│   └── data +├── security +│   └── data +└── tracing + └── data + +13 directories, 14 files +``` + +Here, except for the *fc* directory, which is the failure retry queue, the other directories correspond to different data types. When data upload fails, these data will be cached in the *fc* directory, and Datakit will periodically upload them later. + ### Dataway Sinker {#dataway-sink} See [here](../deployment/dataway-sink.md) diff --git a/internal/export/doc/en/datakit-install.md b/internal/export/doc/en/datakit-install.md index e640b54b60..5b1d197e4d 100644 --- a/internal/export/doc/en/datakit-install.md +++ b/internal/export/doc/en/datakit-install.md @@ -315,6 +315,35 @@ Only Linux and Windows ([:octicons-tag-24: Version-1.15.0](changelog.md#cl-1.15. - `DK_LIMIT_CPUMAX`: Maximum CPU power, default 30.0 - `DK_LIMIT_MEMMAX`: Limit memory (including swap), default 4096 (4GB) +### APM Instrumentation {#apm-instrumentation} + +[:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) · [:octicons-beaker-24: Experimental](index.md#experimental) + +By specifying `DK_APM_INSTRUMENTATION_ENABLED=host` in the installation command, you can automatically inject APM for Java/Python applications: + +```shell +DK_APM_INSTRUMENTATION_ENABLED=host \ + DK_DATAWAY=https://openway.guance.com?token= \ + bash -c "$(curl -L https://static.guance.com/datakit/install.sh)" +``` + +After Datakit is installed, reopen a shell and restart the corresponding Java/Python applications. 
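For example, if an application runs as a systemd service, a plain restart from the new shell is enough for the injection to take effect. The sketch below is only an illustration; the service names are hypothetical:

```shell
# Hypothetical example: restart the target applications after installing Datakit.
# The injector only affects processes started after the installation.
systemctl restart my-java-app.service    # a Java service (Java 8+)
systemctl restart my-python-app.service  # a Python service (Python 3.7+)
```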
+ +To enable or disable this feature, modify the value of the `instrumentation_enabled` configuration under `[apm_inject]` in the `datakit.conf` file: + +- Value `"host"`, enable +- Value `""` or `"disable"`, disable + +Operating environment requirements: + +- Linux system + - CPU architecture: x86_64 or arm64 + - C standard library: glibc 2.4 and above, or musl + - Java 8 and above + - Python 3.7 and above + +In Kubernetes, you can inject APM through the [Datakit Operator](datakit-operator.md#datakit-operator-inject-lib). + ### Other Installation Options {#env-others} | Environment Variable Name | Sample | Description | @@ -332,7 +361,6 @@ Only Linux and Windows ([:octicons-tag-24: Version-1.15.0](changelog.md#cl-1.15. | `DK_VERBOSE` | `on` | Enable more verbose info during install(only for Linux/Mac)[:octicons-tag-24: Version-1.19.0](changelog.md#cl-1.19.0) | | `DK_CRYPTO_AES_KEY` | `0123456789abcdfg` | Use the encrypted password decryption key to protect plaintext passwords in the collector. [:octicons-tag-24: Version-1.31.0](changelog.md#cl-1.31.0) | | `DK_CRYPTO_AES_KEY_FILE` | `/usr/local/datakit/enc4dk` | Another way to configure the secret key takes priority over the previous one. Put the key into the file and configure the configuration file path through environment variables. | -| `DK_APM_INSTRUMENTATION_ENABLED` | `host`, `disable` | Enable APM automatic injection for newly started Java and Python applications on the host. | ## FAQ {#faq} diff --git a/internal/export/doc/en/datakit-metrics.md b/internal/export/doc/en/datakit-metrics.md index 381c813ffa..38dc810404 100644 --- a/internal/export/doc/en/datakit-metrics.md +++ b/internal/export/doc/en/datakit-metrics.md @@ -51,6 +51,7 @@ We can also playing other metrics too(change the `grep` string), all available m |*internal/httpcli*|SUMMARY|`datakit_httpcli_dns_cost_seconds`|`from`|HTTP DNS cost| |*internal/httpcli*|SUMMARY|`datakit_httpcli_tls_handshake_seconds`|`from`|HTTP TLS handshake cost| |*internal/httpcli*|SUMMARY|`datakit_httpcli_http_connect_cost_seconds`|`from`|HTTP connect cost| +|*internal/io/dataway*|GAUGE|`datakit_io_dataway_wal_mem_len`|`category`|Dataway WAL's memory queue length| |*internal/io/dataway*|SUMMARY|`datakit_io_flush_failcache_bytes`|`category`|IO flush fail-cache bytes(in gzip) summary| |*internal/io/dataway*|SUMMARY|`datakit_io_build_body_cost_seconds`|`category,encoding,stage`|Build point HTTP body cost| |*internal/io/dataway*|SUMMARY|`datakit_io_build_body_batches`|`category,encoding`|Batch HTTP body batches| @@ -58,12 +59,12 @@ We can also playing other metrics too(change the `grep` string), all available m |*internal/io/dataway*|SUMMARY|`datakit_io_build_body_batch_points`|`category,encoding`|Batch HTTP body points| |*internal/io/dataway*|SUMMARY|`datakit_io_dataway_wal_flush`|`category,gzip,queue`|Dataway WAL worker flushed bytes| |*internal/io/dataway*|COUNTER|`datakit_io_dataway_point_total`|`category,status`|Dataway uploaded points, partitioned by category and send status(HTTP status)| +|*internal/io/dataway*|COUNTER|`datakit_io_wal_point_total`|`category,status`|WAL queued points| |*internal/io/dataway*|COUNTER|`datakit_io_dataway_point_bytes_total`|`category,enc,status`|Dataway uploaded points bytes, partitioned by category and pint send status(HTTP status)| |*internal/io/dataway*|COUNTER|`datakit_io_dataway_http_drop_point_total`|`category,error`|Dataway write drop points| |*internal/io/dataway*|SUMMARY|`datakit_io_dataway_api_latency_seconds`|`api,status`|Dataway HTTP request latency 
partitioned by HTTP API(method@url) and HTTP status| |*internal/io/dataway*|COUNTER|`datakit_io_http_retry_total`|`api,status`|Dataway HTTP retried count| |*internal/io/dataway*|SUMMARY|`datakit_io_grouped_request`|`category`|Grouped requests under sinker| -|*internal/io/dataway*|GAUGE|`datakit_io_dataway_wal_mem_len`|`category`|Dataway WAL's memory queue length| |*internal/io/filter*|COUNTER|`datakit_filter_update_total`|`N/A`|Filters(remote) updated count| |*internal/io/filter*|GAUGE|`datakit_filter_last_update_timestamp_seconds`|`N/A`|Filter last update time| |*internal/io/filter*|COUNTER|`datakit_filter_point_total`|`category,filters,source`|Filter points of filters| @@ -143,6 +144,7 @@ We can also playing other metrics too(change the `grep` string), all available m |*internal/plugins/inputs/promremote*|SUMMARY|`datakit_input_promremote_collect_points`|`source`|Total number of promremote collection points| |*internal/plugins/inputs/promremote*|SUMMARY|`datakit_input_promremote_time_diff_in_second`|`source`|Time diff with local time| |*internal/plugins/inputs/promremote*|COUNTER|`datakit_input_promremote_no_time_points_total`|`source`|Total number of promremote collection no time points| +|*internal/plugins/inputs/promv2*|SUMMARY|`datakit_input_promv2_scrape_points`|`source,remote`|The number of points scrape from endpoint| |*internal/plugins/inputs/proxy/bench/client*|GAUGE|`api_elapsed_seconds`|`N/A`|Proxied API elapsed seconds| |*internal/plugins/inputs/proxy/bench/client*|COUNTER|`api_post_bytes_total`|`api,status`|Proxied API post bytes total| |*internal/plugins/inputs/proxy/bench/client*|SUMMARY|`api_latency_seconds`|`api,status`|Proxied API latency| @@ -169,14 +171,13 @@ We can also playing other metrics too(change the `grep` string), all available m |*internal/prom*|GAUGE|`datakit_input_prom_stream_size`|`mode,source`|Stream size| |*internal/statsd*|SUMMARY|`datakit_input_statsd_collect_points`|`N/A`|Total number of statsd collection points| |*internal/statsd*|SUMMARY|`datakit_input_statsd_accept_bytes`|`N/A`|Accept bytes from network| -|*internal/tailer*|COUNTER|`datakit_input_logging_socket_feed_message_count_total`|`network`|socket feed to IO message count| -|*internal/tailer*|SUMMARY|`datakit_input_logging_socket_log_length`|`network`|record the length of each log line| -|*internal/tailer*|COUNTER|`datakit_tailer_collect_multiline_state_total`|`source,filepath,multilinestate`|Tailer multiline state total| |*internal/tailer*|COUNTER|`datakit_tailer_file_rotate_total`|`source,filepath`|Tailer rotate total| |*internal/tailer*|COUNTER|`datakit_tailer_buffer_force_flush_total`|`source,filepath`|Tailer force flush total| |*internal/tailer*|COUNTER|`datakit_tailer_parse_fail_total`|`source,filepath,mode`|Tailer parse fail total| |*internal/tailer*|GAUGE|`datakit_tailer_open_file_num`|`mode`|Tailer open file total| |*internal/tailer*|COUNTER|`datakit_input_logging_socket_connect_status_total`|`network,status`|connect and close count for net.conn| +|*internal/tailer*|COUNTER|`datakit_input_logging_socket_feed_message_count_total`|`network`|socket feed to IO message count| +|*internal/tailer*|SUMMARY|`datakit_input_logging_socket_log_length`|`network`|record the length of each log line| |*internal/trace*|COUNTER|`datakit_input_tracing_total`|`input,service`|The total links number of Trace processed by the trace module| |*internal/trace*|COUNTER|`datakit_input_sampler_total`|`input,service`|The sampler number of Trace processed by the trace module| 
|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_dropped_data`|`path,reason`|Dropped data during Put() when capacity reached.| diff --git a/internal/export/doc/en/datakit-monitor.md b/internal/export/doc/en/datakit-monitor.md index 367782befc..fc977e8689 100644 --- a/internal/export/doc/en/datakit-monitor.md +++ b/internal/export/doc/en/datakit-monitor.md @@ -1,12 +1,12 @@ -# View Monitor for DataKit +# View Monitor for Datakit --- -DataKit provides relatively complete output of basic observable information. By looking at the monitor output of DataKit, we can clearly know the current operation of DataKit. +Datakit provides relatively complete output of basic observable information. By looking at the monitor output of Datakit, we can clearly know the current operation of Datakit. ## View Monitor {#view} -Execute the following command to get the running status of the native DataKit. +Execute the following command to get the running status of the native Datakit. ```shell datakit monitor @@ -16,7 +16,7 @@ datakit monitor You can see more monitor options through the `datakit help monitor`. -The DataKit Basic Monitor page information is shown in the following figure: +The Datakit Basic Monitor page information is shown in the following figure: ![`onitor-basic-v1`](https://static.guance.com/images/datakit/monitor-basic-v1.png) @@ -24,28 +24,29 @@ The elements in this diagram can be manipulated by mouse or keyboard. Blocks sel The information of each UI block in the above figure is: -- `Basic Info` is used to show the basic information of DataKit, such as version number, host name, runtime and so on. From here, we can have a basic understanding of the current situation of DataKit. Now select a few fields to explain separately: - - `Version`: Current version number of DataKit - - `Uptime`: Startup time of DataKit - - `Branch`: The current code branch of DataKit, which is generally master - - `Build`:DataKit release date - - `Resource Limit`: Show the resource limit configuration of the current DataKit, where mem refers to the maximum memory limit and cpu refers to the usage limit (If cgroup not set, the value is `-`) - - `Hostname`: Current hostname - - `OS/Arch`: Current software and hardware platforms of DataKit - - `Elected`: Election info(See [Election](election.md#status)) - - `From`: The DataKit address of the current Monitor, such as `http://localhost:9529/metrics` - -- `Runtime Info` Runtime Info is used to show the basic running consumption of Datakit (mainly related to memory and Goroutine): - - - `Goroutines`: The number of Goroutines currently running - - `Mem`: The actual number of bytes of memory currently consumed by the DataKit process (*excluding externally running collectors*) - - `System`: Virtual memory currently consumed by the DataKit process (*excluding externally running collectors*) - - `GC Paused`: Time elapsed and count of Golang GC (garbage collection) since DataKit started - -???+ info +- `Basic Info` is used to display basic information about Datakit, such as the version number, hostname, and runtime duration. From here, we can get a basic understanding of the current status of Datakit. Here are a few fields highlighted for individual explanation: + - `Uptime`: The startup time of Datakit. + - `Version`: The current version number of Datakit. + - `Build`: The release date of Datakit. + - `Branch`: The current code branch of Datakit, which is usually `master`. 
  - `Build Tag`: The compilation options for Datakit; for the [Lite version](datakit-install.md#lite-install), this is `lite`.
  - `OS/Arch`: The current hardware and software platform of Datakit.
  - `Hostname`: The current hostname.
  - `Resource Limit`: Displays the current resource limit configuration of Datakit, where `mem` refers to the maximum memory limit and `cpu` refers to the usage limit range (if displayed as `-`, no cgroup is currently set).
  - `Elected`: Shows the election status, see [here](election.md#status) for details.
  - `From`: The address of the Datakit currently being monitored, such as `http://localhost:9529/metrics`.
  - `Proxy`: The proxy server currently in use.

- `Runtime Info` is used to display the basic runtime consumption of Datakit (mainly memory, CPU and the Golang runtime), including:

  - `Goroutines`: The number of Goroutines currently running.
  - `Total/Heap`: The memory allocated by the Golang runtime (`sys-alloc`) and the heap memory currently in use (`heap-alloc`) [^go-mem].
  - `RSS/VMS`: The RSS memory usage and the VMS (virtual memory size).
  - `GC Paused`: The cumulative time spent in, and the number of, GC (garbage collection) runs since Datakit started.
  - `OpenFiles`: The number of files currently open (on some platforms this may display as `-1`, meaning the feature is not supported there).

[^go-mem]: Note that the memory usage shown here covers only the Golang runtime and does not include the memory used by externally running collectors.

     For Runtime Info here, see [Golang doc](https://pkg.go.dev/runtime#ReadMemStats){:target="_blank"}

- `Enabled Inputs` displays a list of open collectors:

  - `Input`: Refer to the collector(input) name, which is fixed and cannot be modified
@@ -71,13 +72,19 @@ If the verbose option (`-V`) is specified when Monitor is run, additional inform

![`monitor-verbose-v1`](https://static.guance.com/images/datakit/monitor-verbose-v1.png)

- `Goroutine Groups` shows the existing Goroutine Groups in the Datakit (the number of Goroutines in each group is <= the `Goroutines` count in the panel above).
- `HTTP APIs`: HTTP API request info
- `Filter`: Pull of blacklist filtering rules
- `Filter Rules`: Filtering of each type of blacklist
- `Pipeline Info`: Pipeline running info
- `WAL Info`: WAL queue usage [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)

    The WAL queue consists of two parts: a small in-memory queue and a disk queue of 2GB by default. Here, `mem` refers to the number of points processed by the in-memory queue, `disk` refers to the number of points processed by the disk queue, and `drop` refers to the number of points discarded by the disk queue (for example, when the disk queue is full). The total is the overall number of points.

- `Point Upload Info`: displays the operation of the data upload channel [^point-upload-info-on-160].
- `DataWay APIs`: displays Dataway API invocations.

[^point-upload-info-on-160]: This panel was reworked in [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0); previous versions display slightly different information here.

## FAQ {#faq}

@@ -134,17 +141,17 @@ datakit monitor --refresh 1s
 ```

### :material-chat-question: How to Monitor other DataKits? 
{#remote-monitor} -Sometimes, the DataKit installed does not use the default 9529 port, and this time, an error like the following will occur: +Sometimes, the Datakit installed does not use the default 9529 port, and this time, an error like the following will occur: ```shell request stats failed: Get "http://localhost:9528/stats": dial tcp ... ``` -We can view its monitor data by specifying the DataKit address: +We can view its monitor data by specifying the Datakit address: ```shell datakit monitor --to localhost:19528 -# We can also view the monitor of another remote DataKit +# We can also view the monitor of another remote Datakit datakit monitor --to :9528 ``` diff --git a/internal/export/doc/en/datakit-operator.md b/internal/export/doc/en/datakit-operator.md index 791f62a3d8..91bb5307cb 100644 --- a/internal/export/doc/en/datakit-operator.md +++ b/internal/export/doc/en/datakit-operator.md @@ -253,8 +253,8 @@ The following functions are currently supported: - Injection of `ddtrace` agent and environment - Mounting of `logfwd` sidecar and enabling log collection -- 注入 [`async-profiler`](https://github.com/async-profiler/async-profiler){:target="_blank"} *:octicons-beaker-24: Experimental* 采集 JVM 程序的 profile 数据 -- 注入 [`py-spy`](https://github.com/benfred/py-spy){:target="_blank"} *:octicons-beaker-24: Experimental* 采集 Python 应用的 profile 数据 +- Inject [`async-profiler`](https://github.com/async-profiler/async-profiler){:target="_blank"} for JVM profiling [:octicons-beaker-24: Experimental](index.md#experimental) +- Inject [`py-spy`](https://github.com/benfred/py-spy){:target="_blank"} for Python profiling [:octicons-beaker-24: Experimental](index.md#experimental) ???+ info @@ -263,7 +263,7 @@ The following functions are currently supported: -### Injection of ddtrace Agent and Relevant Environment Variables {#datakit-operator-inject-lib} +### DDTrace Agent {#datakit-operator-inject-lib} #### Usage {#datakit-operator-inject-lib-usage} @@ -321,7 +321,7 @@ datakit-lib-init ``` -### Injecting Logfwd Program and Enabling Log Collection {#datakit-operator-inject-logfwd} +### logfwd {#datakit-operator-inject-logfwd} #### Prerequisites {#datakit-operator-inject-logfwd-prerequisites} diff --git a/internal/export/doc/en/development.md b/internal/export/doc/en/development.md index bf599b2faa..3da8ccac27 100644 --- a/internal/export/doc/en/development.md +++ b/internal/export/doc/en/development.md @@ -211,7 +211,6 @@ DataKit runs in the specified directory (/usr/local/DataKit under Linux) as ==se - `http_api.listen` change the address - Change the token in `dataway.urls` - Change the logging directory/level if necessary - - No more 2. Start the DataKit, taking Linux as an example: `DK_DEBUG_WORKDIR=~/datakit ./dist/datakit-linux-amd64/datakit` 3. 
You can add a new alias to your local bash so that you can just run `ddk` each time you compile the DataKit (that is, Debugging-DataKit)

diff --git a/internal/export/doc/zh/changelog.md b/internal/export/doc/zh/changelog.md
index 71eaf22ce1..7a04bbc5d9 100644
--- a/internal/export/doc/zh/changelog.md
+++ b/internal/export/doc/zh/changelog.md
@@ -7,9 +7,9 @@
 ### 新加功能 {#cl-1.60.0-new}
 
 - 新加 prom v2 版本采集器,相比 v1 版本,它的解析性能有大幅度优化(#2427)
-- 安装 Datakit 过程中,通过设置特定的开关后,重启对应的应用(Java/Python)即可自动注入 APM(#2139)
+- [APM Automatic Instrumentation](datakit-install.md#apm-instrumentation):安装 Datakit 过程中,通过设置特定的开关后,重启对应的应用(Java/Python)即可自动注入 APM(#2139)
 - RUM Session Replay 数据支持联动中心配置的黑名单规则(#2424)
-- Datakit `/v1/write/:category` 接口增加多种压缩格式支持(#2368)
+- Datakit [`/v1/write/:category` 接口](apis.md#api-v1-write)增加多种压缩格式支持(HTTP `Content-Encoding`)(#2368)
 
 ### 问题修复 {#cl-1.60.0-fix}
 
@@ -24,13 +24,12 @@
 
   - 实验性功能 point-pool 默认启用
   - 优化 Prometheus exporter 数据采集性能,减少内存消耗
   - 默认开启 [HTTP API 限流](datakit-conf.md#set-http-api-limit),避免突发流量消耗太多内存
   - 增加 [WAL 磁盘队列](datakit-conf.md#dataway-wal),以处理上传阻塞可能导致的内存占用。新增的磁盘队列*默认会缓存上传失败的数据*。
   - 细化 Datakit 自身内存使用指标,指标中增加多个维度的内存占用
   - `datakit monitor -V` 命令中增加 WAL 面板展示
   - 优化 KubernetesPrometheus 采集性能(#2426)
   - 优化容器日志采集性能(#2425)
   - 移除日志调试有关字段,以优化网络流量和存储
-  - 调整 Datakit 自身指标暴露,修复关于自身内存指标暴露的一些错误实现
 
 ### 兼容调整 {#cl-1.60.0-brk}
 
diff --git a/internal/export/doc/zh/datakit-conf.md b/internal/export/doc/zh/datakit-conf.md
index 3935fa05b2..c08092c5cb 100644
--- a/internal/export/doc/zh/datakit-conf.md
+++ b/internal/export/doc/zh/datakit-conf.md
@@ -170,8 +170,7 @@ DataKit 默认日志等级为 `info`。编辑 `datakit.conf`,可修改日志
 
 ### Point 缓存 {#point-pool}
 
-[:octicons-tag-24: Version-1.28.0](changelog.md#cl-1.28.0) ·
-[:octicons-beaker-24: Experimental](index.md#experimental)
+[:octicons-tag-24: Version-1.28.0](changelog.md#cl-1.28.0)
 
 为了优化 Datakit 高负载情况下的内存占用,可以开启 Point Pool 来缓解:
 
@@ -187,7 +186,9 @@ DataKit 默认日志等级为 `info`。编辑 `datakit.conf`,可修改日志
 
 ???+ attention
 
-    在低负载(Datakit 内存占用 100MB 左右)的情况下,开启 Point-Pool 会增加 Datakit 自身的内存占用,但也不至于太多。所谓的高负载,一般指占用内存在 2GB+ 的场景。同时开启后也能改善 Datakit 自身的 CPU 消耗。
+    - 在低负载(Datakit 内存占用 100MB 左右)的情况下,开启 point pool 会增加 Datakit 自身的内存占用。所谓的高负载,一般指占用内存在 2GB+ 的场景。同时开启后也能改善 Datakit 自身的 CPU 消耗
+
+    - [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) 已经默认开启了该功能。如果要禁用 point pool,可以将上面的 `enable` 置为 `false`
 
 ### IO 模块调参 {#io-tuning}
 
@@ -202,10 +203,10 @@ DataKit 默认日志等级为 `info`。编辑 `datakit.conf`,可修改日志
 
     ```toml
     [io]
-      feed_chan_size = 4096 # 数据处理队列(一个 job 一般都有多个 point)长度
-      max_cache_count = 512 # 数据批量发送点数的阈值,缓存中超过该值即触发发送
+      feed_chan_size = 1 # 数据处理队列长度
+      max_cache_count = 1000 # 数据批量发送点数的阈值,缓存中超过该值即触发发送
       flush_interval = "10s" # 数据发送的间隔阈值,每隔 10s 至少发送一次
-      flush_workers = 8 # 数据上传 worker 数(默认 CPU-core * 2 + 1)
+      flush_workers = 0 # 数据上传 worker 数(默认配额 CPU 核心 * 2)
     ```
 
 阻塞模式参见 [k8s 中的对应说明](datakit-daemonset-deploy.md#env-io)
 
@@ -215,33 +216,6 @@ DataKit 默认日志等级为 `info`。编辑 `datakit.conf`,可修改日志
 
     参见[这里](datakit-daemonset-deploy.md#env-io)
 
-#### IO 磁盘缓存 {#io-disk-cache}
-
-[:octicons-tag-24: Version-1.5.8](changelog.md#cl-1.5.8) · [:octicons-beaker-24: Experimental](index.md#experimental)
-
-当 DataKit 发送数据失败后,为了不丢失关键数据,可以开启磁盘缓存。磁盘缓存的目的在于将发送失败的数据暂时存入磁盘,待条件允许时,再将数据发送出去。
-
-
-=== "*datakit.conf*"
-
-    ```toml
-    [io]
-    enable_cache = true # 开启磁盘缓存
-    cache_all = false # 是否全类缓存(默认情况下,指标/对象/拨测数据不缓存)
-    cache_max_size_gb = 5 # 指定每个分类磁盘大小为 5GB
-    ```
-
-=== "Kubernetes"
-
-    参见[这里](datakit-daemonset-deploy.md#env-io)
-
----
-
-???+ attention
-
-    这里的 
`cache_max_size_gb` 指每个分类(Category)的缓存大小,总共 10 个分类的话,如果每个指定 5GB,理论上会占用 50GB 左右的空间。 - - ### 资源限制 {#resource-limit} 由于 DataKit 上处理的数据量无法估计,如果不对 DataKit 消耗的资源做物理限制,将有可能消耗所在节点大量资源。这里我们可以借助 Linux 的 cgroup 和 Windows 的 job object 来限制,在 *datakit.conf* 中有如下配置: @@ -367,7 +341,7 @@ Kubernetes 下部署相关配置参见[这里](datakit-daemonset-deploy.md#env-d 13 directories, 14 files ``` -此处,除了 *fc* 是失败重传队列,其它目录分别对应一种数据类型。 +此处,除了 *fc* 是失败重传队列,其它目录分别对应一种数据类型。当数据上传失败,这些数据会缓存到 *fc* 目录下,后续 Datakit 会间歇性将它们上传上去。 ### Sinker 配置 {#dataway-sink} diff --git a/internal/export/doc/zh/datakit-install.md b/internal/export/doc/zh/datakit-install.md index 0927283b67..ed55ea0e7d 100644 --- a/internal/export/doc/zh/datakit-install.md +++ b/internal/export/doc/zh/datakit-install.md @@ -338,6 +338,35 @@ NAME1="value1" NAME2="value2" - `DK_LIMIT_CPUMAX`:支持 CPU 的最大功率,默认 30.0 - `DK_LIMIT_MEMMAX`:限制内存(含 swap)最大用量,默认 4096(4GB) +### APM Instrumentation {#apm-instrumentation} + +[:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) · [:octicons-beaker-24: Experimental](index.md#experimental) + +在安装命令中,指定 `DK_APM_INSTRUMENTATION_ENABLED=host` 即可针对 Java/Python 等应用自动注入 APM: + +```shell +DK_APM_INSTRUMENTATION_ENABLED=host \ + DK_DATAWAY=https://openway.guance.com?token= \ + bash -c "$(curl -L https://static.guance.com/datakit/install.sh)" +``` + +Datakit 安装完成后,重新开启一个 shell,并重启对应的 Java/Python 应用即可。 + +开启和关闭该功能,修改 `datakit.conf` 文件中 `[apm_inject]` 下的 `instrumentation_enabled` 配置的值: + +- 值 `"host"`,开启 +- 值 `""` 或者 `"disable"`,关闭 + +运行环境要求: + +- Linux 系统 + - CPU 架构:x86_64 或 arm64 + - C 标准库:glibc 2.4 及以上版本,或 musl + - Java 8 及以上版本 + - Python 3.7 及以上版本 + +在 Kubernetes 中,可以通过 [Datakit Operator 来注入 APM](datakit-operator.md#datakit-operator-inject-lib)。 + ### 其它安装选项 {#env-others} | 环境变量名 | 取值示例 | 说明 | @@ -355,7 +384,6 @@ NAME1="value1" NAME2="value2" | `DK_VERBOSE` | `on` | 打开安装过程中的 verbose 选项(仅 Linux/Mac 支持),将输出更多调试信息[:octicons-tag-24: Version-1.19.0](changelog.md#cl-1.19.0) | | `DK_CRYPTO_AES_KEY` | `0123456789abcdfg` | 使用加密后的密码解密秘钥,用于采集器中明文密码的保护 [:octicons-tag-24: Version-1.31.0](changelog.md#cl-1.31.0) | | `DK_CRYPTO_AES_KEY_FILE` | `/usr/local/datakit/enc4dk` | 秘钥的另一种配置方式,优先于上一种。将秘钥放到该文件中,并将配置文件路径通过环境变量方式配置即可。 | -| `DK_APM_INSTRUMENTATION_ENABLED` | `host`, `disable` | 对主机上的新启动的 Java 和 Python 应用开启 APM 自动注入功能 | ## FAQ {#faq} diff --git a/internal/export/doc/zh/datakit-metrics.md b/internal/export/doc/zh/datakit-metrics.md index 8552ab11c5..50fe187898 100644 --- a/internal/export/doc/zh/datakit-metrics.md +++ b/internal/export/doc/zh/datakit-metrics.md @@ -51,6 +51,7 @@ datakit_cpu_usage 4.9920266849857144 |*internal/httpcli*|SUMMARY|`datakit_httpcli_dns_cost_seconds`|`from`|HTTP DNS cost| |*internal/httpcli*|SUMMARY|`datakit_httpcli_tls_handshake_seconds`|`from`|HTTP TLS handshake cost| |*internal/httpcli*|SUMMARY|`datakit_httpcli_http_connect_cost_seconds`|`from`|HTTP connect cost| +|*internal/io/dataway*|GAUGE|`datakit_io_dataway_wal_mem_len`|`category`|Dataway WAL's memory queue length| |*internal/io/dataway*|SUMMARY|`datakit_io_flush_failcache_bytes`|`category`|IO flush fail-cache bytes(in gzip) summary| |*internal/io/dataway*|SUMMARY|`datakit_io_build_body_cost_seconds`|`category,encoding,stage`|Build point HTTP body cost| |*internal/io/dataway*|SUMMARY|`datakit_io_build_body_batches`|`category,encoding`|Batch HTTP body batches| @@ -58,12 +59,12 @@ datakit_cpu_usage 4.9920266849857144 |*internal/io/dataway*|SUMMARY|`datakit_io_build_body_batch_points`|`category,encoding`|Batch HTTP body points| 
|*internal/io/dataway*|SUMMARY|`datakit_io_dataway_wal_flush`|`category,gzip,queue`|Dataway WAL worker flushed bytes| |*internal/io/dataway*|COUNTER|`datakit_io_dataway_point_total`|`category,status`|Dataway uploaded points, partitioned by category and send status(HTTP status)| +|*internal/io/dataway*|COUNTER|`datakit_io_wal_point_total`|`category,status`|WAL queued points| |*internal/io/dataway*|COUNTER|`datakit_io_dataway_point_bytes_total`|`category,enc,status`|Dataway uploaded points bytes, partitioned by category and pint send status(HTTP status)| |*internal/io/dataway*|COUNTER|`datakit_io_dataway_http_drop_point_total`|`category,error`|Dataway write drop points| |*internal/io/dataway*|SUMMARY|`datakit_io_dataway_api_latency_seconds`|`api,status`|Dataway HTTP request latency partitioned by HTTP API(method@url) and HTTP status| |*internal/io/dataway*|COUNTER|`datakit_io_http_retry_total`|`api,status`|Dataway HTTP retried count| |*internal/io/dataway*|SUMMARY|`datakit_io_grouped_request`|`category`|Grouped requests under sinker| -|*internal/io/dataway*|GAUGE|`datakit_io_dataway_wal_mem_len`|`category`|Dataway WAL's memory queue length| |*internal/io/filter*|COUNTER|`datakit_filter_update_total`|`N/A`|Filters(remote) updated count| |*internal/io/filter*|GAUGE|`datakit_filter_last_update_timestamp_seconds`|`N/A`|Filter last update time| |*internal/io/filter*|COUNTER|`datakit_filter_point_total`|`category,filters,source`|Filter points of filters| @@ -85,9 +86,9 @@ datakit_cpu_usage 4.9920266849857144 |*internal/metrics*|COUNTER|`datakit_error_total`|`source,category`|Total errors, only count on error source, not include error message| |*internal/metrics*|GAUGE|`datakit_goroutines`|`N/A`|Goroutine count within Datakit| |*internal/metrics*|GAUGE|`datakit_mem_stat`|`type`|Datakit memory system bytes| -|*internal/metrics*|GAUGE|`datakit_heap_alloc_bytes`|`N/A`|Datakit memory heap bytes(Deprecated by datakit_golang_mem_usage)| -|*internal/metrics*|GAUGE|`datakit_sys_alloc_bytes`|`N/A`|Datakit memory system bytes(Deprecated by datakit_golang_mem_usage)| -|*internal/metrics*|GAUGE|`datakit_golang_mem_usage`|`type`|Datakit golang memory_usage| +|*internal/metrics*|GAUGE|`datakit_heap_alloc_bytes`|`N/A`|Datakit memory heap bytes(Deprecated by `datakit_golang_mem_usage`)| +|*internal/metrics*|GAUGE|`datakit_sys_alloc_bytes`|`N/A`|Datakit memory system bytes(Deprecated by `datakit_golang_mem_usage`)| +|*internal/metrics*|GAUGE|`datakit_golang_mem_usage`|`type`|Datakit golang memory usage stats| |*internal/metrics*|GAUGE|`datakit_cpu_usage`|`N/A`|Datakit CPU usage(%)| |*internal/metrics*|GAUGE|`datakit_open_files`|`N/A`|Datakit open files(only available on Linux)| |*internal/metrics*|GAUGE|`datakit_cpu_cores`|`N/A`|Datakit CPU cores| @@ -143,6 +144,7 @@ datakit_cpu_usage 4.9920266849857144 |*internal/plugins/inputs/promremote*|SUMMARY|`datakit_input_promremote_collect_points`|`source`|Total number of promremote collection points| |*internal/plugins/inputs/promremote*|SUMMARY|`datakit_input_promremote_time_diff_in_second`|`source`|Time diff with local time| |*internal/plugins/inputs/promremote*|COUNTER|`datakit_input_promremote_no_time_points_total`|`source`|Total number of promremote collection no time points| +|*internal/plugins/inputs/promv2*|SUMMARY|`datakit_input_promv2_scrape_points`|`source,remote`|The number of points scrape from endpoint| |*internal/plugins/inputs/proxy/bench/client*|GAUGE|`api_elapsed_seconds`|`N/A`|Proxied API elapsed seconds| 
|*internal/plugins/inputs/proxy/bench/client*|COUNTER|`api_post_bytes_total`|`api,status`|Proxied API post bytes total| |*internal/plugins/inputs/proxy/bench/client*|SUMMARY|`api_latency_seconds`|`api,status`|Proxied API latency| @@ -169,14 +171,13 @@ datakit_cpu_usage 4.9920266849857144 |*internal/prom*|GAUGE|`datakit_input_prom_stream_size`|`mode,source`|Stream size| |*internal/statsd*|SUMMARY|`datakit_input_statsd_collect_points`|`N/A`|Total number of statsd collection points| |*internal/statsd*|SUMMARY|`datakit_input_statsd_accept_bytes`|`N/A`|Accept bytes from network| -|*internal/tailer*|COUNTER|`datakit_input_logging_socket_feed_message_count_total`|`network`|socket feed to IO message count| -|*internal/tailer*|SUMMARY|`datakit_input_logging_socket_log_length`|`network`|record the length of each log line| -|*internal/tailer*|COUNTER|`datakit_tailer_collect_multiline_state_total`|`source,filepath,multilinestate`|Tailer multiline state total| |*internal/tailer*|COUNTER|`datakit_tailer_file_rotate_total`|`source,filepath`|Tailer rotate total| |*internal/tailer*|COUNTER|`datakit_tailer_buffer_force_flush_total`|`source,filepath`|Tailer force flush total| |*internal/tailer*|COUNTER|`datakit_tailer_parse_fail_total`|`source,filepath,mode`|Tailer parse fail total| |*internal/tailer*|GAUGE|`datakit_tailer_open_file_num`|`mode`|Tailer open file total| |*internal/tailer*|COUNTER|`datakit_input_logging_socket_connect_status_total`|`network,status`|connect and close count for net.conn| +|*internal/tailer*|COUNTER|`datakit_input_logging_socket_feed_message_count_total`|`network`|socket feed to IO message count| +|*internal/tailer*|SUMMARY|`datakit_input_logging_socket_log_length`|`network`|record the length of each log line| |*internal/trace*|COUNTER|`datakit_input_tracing_total`|`input,service`|The total links number of Trace processed by the trace module| |*internal/trace*|COUNTER|`datakit_input_sampler_total`|`input,service`|The sampler number of Trace processed by the trace module| |*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_dropped_data`|`path,reason`|Dropped data during Put() when capacity reached.| diff --git a/internal/export/doc/zh/datakit-monitor.md b/internal/export/doc/zh/datakit-monitor.md index b08294e3de..2d0bc15535 100644 --- a/internal/export/doc/zh/datakit-monitor.md +++ b/internal/export/doc/zh/datakit-monitor.md @@ -1,12 +1,12 @@ -# 查看 DataKit 的 Monitor +# 查看 Datakit 的 Monitor --- -DataKit 提供了相对完善的基本可观测信息输出,通过查看 DataKit 的 monitor 输出,我们能清晰的知道当前 DataKit 的运行情况。 +Datakit 提供了相对完善的基本可观测信息输出,通过查看 Datakit 的 monitor 输出,我们能清晰的知道当前 Datakit 的运行情况。 ## 查看 Monitor {#view} -执行如下命令即可获取本机 DataKit 的运行情况。 +执行如下命令即可获取本机 Datakit 的运行情况。 ``` shell datakit monitor @@ -18,7 +18,7 @@ datakit monitor 可通过 `datakit help monitor` 查看更多 monitor 选项。 -DataKit 基本 Monitor 页面信息如下图所示: +Datakit 基本 Monitor 页面信息如下图所示: ![not-set](https://static.guance.com/images/datakit/monitor-basic-v1.png) @@ -26,30 +26,28 @@ DataKit 基本 Monitor 页面信息如下图所示: 上图中的每个 UI 块的信息分别是: -- `Basic Info` 用来展示 DataKit 的基本信息,如版本号、主机名、运行时长等信息。从这里我们可以对 DataKit 当前的情况有个基本了解。现挑选几个字段出来单独说明: - - `Uptime`:DataKit 的启动时间 - - `Branch`:DataKit 当前的代码分支,一般情况下都是 master - - `Build`:DataKit 的发布时间 - - `Resource Limit`:展示当前 DataKit 的资源限制配置,其中 `mem` 指最大内存限制,`cpu` 指使用率限制范围 (如果展示为 `-` 表示当前 cgroup 未设置) +- `Basic Info` 用来展示 Datakit 的基本信息,如版本号、主机名、运行时长等信息。从这里我们可以对 Datakit 当前的情况有个基本了解。现挑选几个字段出来单独说明: + - `Uptime`:Datakit 的启动时间 + - `Version`:Datakit 当前的版本号 + - `Build`:Datakit 的发布时间 + - `Branch`:Datakit 当前的代码分支,一般情况下都是 master + - `Build Tag`:Datakit 
的编译选项,[精简版](datakit-install.md#lite-install)此处是 `lite` + - `OS/Arch`:当前 Datakit 的软硬件平台 - `Hostname`:当前主机名 - - `OS/Arch`:当前 DataKit 的软硬件平台 - - `Version`:DataKit 当前的版本号 + - `Resource Limit`:展示当前 Datakit 的资源限制配置,其中 `mem` 指最大内存限制,`cpu` 指使用率限制范围 (如果展示为 `-` 表示当前 cgroup 未设置) - `Elected`:展示选举情况,详见[这里](election.md#status) - - `From`:当前被 Monitor 的 DataKit 地址,如 `http://localhost:9529/metrics` + - `From`:当前被 Monitor 的 Datakit 地址,如 `http://localhost:9529/metrics` + - `Proxy`:当前使用的代理服务器 -- `Runtime Info` 用来展示 Datakit 的基本运行消耗(主要是内存以及 Goroutine 有关),其中: +- `Runtime Info` 用来展示 Datakit 的基本运行消耗(主要是内存、CPU 以及 Golang runtime),其中: - `Goroutines`:当前正在运行的 Goroutine 个数 - - `Mem`:DataKit 进程当前实际消耗的内存字节数(*不含外部运行的采集器*) - - `System`:DataKit 进程当前消耗的虚拟内存(*不含外部运行的采集器*) - - `GC Paused`:自 DataKit 启动以来,GC(垃圾回收)所消耗的时间以及次数 + - `Total/Heap`:Golang vm 占用内存以及正在使用中的内存(*不含外部运行的采集器*)[^go-mem] + - `RSS/VMS`:RSS 内存占用以及 VMS(*不含外部运行的采集器*) + - `GC Paused`:自 Datakit 启动以来,GC(垃圾回收)所消耗的时间以及次数 - `OpenFiles`:当前打开的文件个数(部分平台可能显示为 `-1`,表示不支持该功能) - -???+ info - - 关于这里的 Runtime Info,参见 [Golang 官方文档](https://pkg.go.dev/runtime#ReadMemStats){:target="_blank"} - +[^go-mem]: 关于这里的 Runtime Info,参见 [Golang 官方文档](https://pkg.go.dev/runtime#ReadMemStats){:target="_blank"} - `Enabled Inputs` 展示开启的采集器列表,其中 @@ -76,14 +74,20 @@ DataKit 基本 Monitor 页面信息如下图所示: ![not-set](https://static.guance.com/images/datakit/monitor-verbose-v1.png) -- `Goroutine Groups` 展示 DataKit 中已有的 Goroutine 分组(该分组中的 Goroutine 个数 <= 上面面板中的 `Goroutines` 个数) -- `HTTP APIs` 展示 DataKit 中 API 调用情况 -- `Filter` 展示 DataKit 中黑名单过滤规则拉取情况 +- `Goroutine Groups` 展示 Datakit 中已有的 Goroutine 分组(该分组中的 Goroutine 个数 <= 上面面板中的 `Goroutines` 个数) +- `HTTP APIs` 展示 Datakit 中 API 调用情况 +- `Filter` 展示 Datakit 中黑名单过滤规则拉取情况 - `Filter Rules` 展示每类黑名单的过滤情况 - `Pipeline Info` 展示 Pipeline 运行情况 -- `IO Info` 展示数据上传通道的运行情况 +- `WAL Info` WAL 队列的使用情况 [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) + + WAL 队列由两部分组成,少量的内存队列以及默认 2GB 的磁盘队列。此处,`mem` 指内存队列处理的点数,`disk` 指磁盘队列处理的点数,`drop` 指磁盘队列丢弃的点数(比如磁盘队列满了)。Total 指总点数。 + +- `Point Upload Info` 展示数据上传通道的运行情况 [^point-upload-info-on-160] - `DataWay APIs` 展示 Dataway API 的调用情况 +[^point-upload-info-on-160]: [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) 对这里有更新,之前的版本在这里的显示稍有差异。 + ## FAQ {#faq} @@ -138,10 +142,10 @@ datakit monitor --refresh 1s -### :material-chat-question: 如何 Monitor 其它 DataKit? {#remote-monitor} +### :material-chat-question: 如何 Monitor 其它 Datakit? {#remote-monitor} -有时候,安装的 DataKit 并不是使用默认的 9529 端口,这时候就会出现类似如下的错误: +有时候,安装的 Datakit 并不是使用默认的 9529 端口,这时候就会出现类似如下的错误: ```shell request stats failed: Get "http://localhost:9528/stats": dial tcp ... @@ -152,6 +156,6 @@ request stats failed: Get "http://localhost:9528/stats": dial tcp ... 
```shell datakit monitor --to localhost:19528 -# 也能查看另一个远程 DataKit 的 monitor +# 也能查看另一个远程 Datakit 的 monitor datakit monitor --to :9528 ``` diff --git a/internal/export/doc/zh/datakit-operator.md b/internal/export/doc/zh/datakit-operator.md index c5f906a95b..c7a3ff1c31 100644 --- a/internal/export/doc/zh/datakit-operator.md +++ b/internal/export/doc/zh/datakit-operator.md @@ -244,7 +244,7 @@ Datakit Operator 配置是 JSON 格式,在 Kubernetes 中单独以 ConfigMap - `logfwd.options.reuse_exist_volume` 允许在注入 logfwd 时,复用相同路径的 volume,避免因为存在同样路径的 volume 而注入报错。注意,路径末尾有斜线和无斜线的意义不同,所以这两个不是相同路径,不能复用。 -## 使用 Datakit Operator 注入文件和程序 {#datakit-operator-inject-sidecar} +## Datakit Operator 注入 {#datakit-operator-inject-sidecar} 在大型 Kubernetes 集群中,批量修改配置是比较麻烦的事情。Datakit-Operator 会根据 Annotation 配置,决定是否对其修改或注入。 @@ -252,8 +252,8 @@ Datakit Operator 配置是 JSON 格式,在 Kubernetes 中单独以 ConfigMap - 注入 `ddtrace` agent 和 environment 的功能 - 挂载 `logfwd` sidecar 并开启日志采集的功能 -- 注入 [`async-profiler`](https://github.com/async-profiler/async-profiler){:target="_blank"} *:octicons-beaker-24: Experimental* 采集 JVM 程序的 profile 数据 -- 注入 [`py-spy`](https://github.com/benfred/py-spy){:target="_blank"} *:octicons-beaker-24: Experimental* 采集 Python 应用的 profile 数据 +- 注入 [`async-profiler`](https://github.com/async-profiler/async-profiler){:target="_blank"} 采集 JVM 程序的 profile 数据 [:octicons-beaker-24: Experimental](index.md#experimental) +- 注入 [`py-spy`](https://github.com/benfred/py-spy){:target="_blank"} 采集 Python 应用的 profile 数据 [:octicons-beaker-24: Experimental](index.md#experimental) ???+ info @@ -261,7 +261,7 @@ Datakit Operator 配置是 JSON 格式,在 Kubernetes 中单独以 ConfigMap 只支持 v1 版本的 `deployments/daemonsets/cronjobs/jobs/statefulsets` 这五类 Kind,且因为 Datakit-Operator 实际对 PodTemplate 操作,所以不支持 Pod。 在本文中,以 `Deployment` 代替描述这五类 Kind。 -### 注入 ddtrace agent 和相关的环境变量 {#datakit-operator-inject-lib} +### DDtrace Agent {#datakit-operator-inject-lib} #### 使用说明 {#datakit-operator-inject-lib-usage} @@ -320,7 +320,7 @@ $ kubectl get pod nginx-deployment-7bd8dd85f-fzmt2 -o=jsonpath={.spec.initContai datakit-lib-init ``` -### 注入 logfwd 程序并开启日志采集 {#datakit-operator-inject-logfwd} +### logfwd {#datakit-operator-inject-logfwd} #### 前置条件 {#datakit-operator-inject-logfwd-prerequisites} @@ -423,7 +423,7 @@ log-container datakit-logfwd 最终可以在观测云日志平台查看日志是否采集。 -### 注入 `async-profiler` 工具采集 JVM 应用性能数据 {#inject-async-profiler} +### async-profiler {#inject-async-profiler} #### 前置条件 {#async-profiler-prerequisites} @@ -528,9 +528,7 @@ $ kubectl describe pod movies-java-784f4bb8c7-59g6s | grep datakit-profiler ``` - - -### 注入 `py-spy` 工具采集 Python 应用性能数据 {#inject-py-spy} +### py-spy {#inject-py-spy} #### 前置条件 {#py-spy-prerequisites} diff --git a/internal/export/doc/zh/development.md b/internal/export/doc/zh/development.md index 1f31b49076..441ba672eb 100644 --- a/internal/export/doc/zh/development.md +++ b/internal/export/doc/zh/development.md @@ -406,7 +406,6 @@ DataKit 新功能发布,大家最好做全套测试,包括安装、升级等 - `http_api.listen` 地址改一下 - `dataway.urls` 里面的 token 改一下 - 如有必要,logging 目录/level 都改一下 - - 没有了 - 启动 DataKit,以 Linux 为例:`DK_DEBUG_WORKDIR=~/datakit ./dist/datakit-linux-amd64/datakit` - 可在本地 bash 中新加个 alias,这样每次编译完 DataKit 后,直接运行 `ddk` 即可(即 Debugging-DataKit) diff --git a/internal/export/non_input_docs.go b/internal/export/non_input_docs.go index 27e0c0d127..81ef04aa31 100644 --- a/internal/export/non_input_docs.go +++ b/internal/export/non_input_docs.go @@ -205,15 +205,15 @@ func envDataway() []*inputs.ENVInfo { { ENVName: "ENV_DATAWAY_WAL_WORKERS", Type: doc.Int, - Desc: "Set WAL workers, default 
to limited CPU cores [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+			Desc:   "Set WAL workers, default to limited CPU cores X 2 [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
 			DescZh: "设置 WAL worker 个数,默认为 CPU 配额核心数 X 2 [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
 		},

 		{
 			ENVName: "ENV_DATAWAY_WAL_MEM_CAPACITY",
 			Type:    doc.Int,
-			Desc:    "Set WAL memory queue length, default to limited CPU cores [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
-			DescZh:  "设置 WAL 内存队列长度,默认为 CPU 配额核心数 [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+			Desc:    "Set WAL memory queue length, default to limited CPU cores X 2 [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+			DescZh:  "设置 WAL 内存队列长度,默认为 CPU 配额核心数 X 2 [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
 		},

 		{
@@ -653,8 +653,9 @@ func envPointPool() []*inputs.ENVInfo {
 		{
 			ENVName: "ENV_POINT_POOL_RESERVED_CAPACITY",
 			Type:    doc.Int,
-			Desc:    "Specify pool capacity(default 4096)",
-			DescZh:  "指定 point pool 大小(默认 4096)",
+			Default: "4096",
+			Desc:    "Specify the pool size that is immune to GC. If Datakit is collecting a very large volume of data, increase this reserved pool size (such as 40960) to reduce GC overhead.",
+			DescZh:  "指定 point pool 常驻大小,即不会被 GC 回收的 point 保有数。如果所在 Datakit 的采集量非常大,可以酌情将该数值调大一点(如 40960),以降低 GC 开销",
 		},
 	}
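The three environment variables above size the WAL workers, the WAL memory queue, and the GC-immune point pool. As a hedged illustration of how they might be applied to an existing Kubernetes deployment (the DaemonSet name and namespace below are assumptions, and the values are examples rather than recommendations):

```shell
# Sketch only: override the WAL and point-pool env vars on the Datakit
# DaemonSet; adjust object name/namespace and values to your cluster and load.
kubectl set env daemonset/datakit -n datakit \
  ENV_DATAWAY_WAL_WORKERS=8 \
  ENV_DATAWAY_WAL_MEM_CAPACITY=8 \
  ENV_POINT_POOL_RESERVED_CAPACITY=40960
```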