diff --git a/Makefile b/Makefile index 61045e528e..d556c0e4cd 100644 --- a/Makefile +++ b/Makefile @@ -476,7 +476,7 @@ code_stat: # promlinter: show prometheuse metrics defined in Datakit. # go install github.com/yeya24/promlinter/cmd/promlinter@latest show_metrics: - @promlinter list . --add-help -o md --with-vendor + @promlinter list . --add-help -o md --with-vendor --add-module clean: @rm -rf build/* diff --git a/cmd/awslambda/main.go b/cmd/awslambda/main.go index 6f8b4bd0e1..21fba18b33 100644 --- a/cmd/awslambda/main.go +++ b/cmd/awslambda/main.go @@ -75,7 +75,7 @@ func loadLambdaDefaultConf() { config.Cfg.HTTPAPI.Listen = "0.0.0.0:9529" config.Cfg.DefaultEnabledInputs = []string{"awslambda", "ddtrace", "opentelemetry", "statsd"} config.Cfg.Dataway.MaxRawBodySize = dataway.MinimalRawBodySize - config.Cfg.IO.FlushWorkers = 1 + config.Cfg.IO.CompactWorkers = 1 } func run() { @@ -129,27 +129,12 @@ func startIO() { opts := []dkio.IOOption{ dkio.WithFeederOutputer(dkio.NewAwsLambdaOutput()), dkio.WithDataway(config.Cfg.Dataway), - dkio.WithMaxCacheCount(c.MaxCacheCount), - dkio.WithDiskCache(c.EnableCache), - dkio.WithDiskCacheSize(c.CacheSizeGB), + dkio.WithCompactAt(c.MaxCacheCount), dkio.WithFilters(c.Filters), - dkio.WithCacheAll(c.CacheAll), - dkio.WithFlushWorkers(c.FlushWorkers), + dkio.WithCompactWorkers(c.CompactWorkers), dkio.WithRecorder(config.Cfg.Recorder), - dkio.WithConsumer(false), - } - - du, err := time.ParseDuration(c.FlushInterval) - if err != nil { - } else { - opts = append(opts, dkio.WithFlushInterval(du)) - } - - du, err = time.ParseDuration(c.CacheCleanInterval) - if err != nil { - l.Warnf("parse CacheCleanInterval failed: %s, use default 5s", err) - } else { - opts = append(opts, dkio.WithDiskCacheCleanInterval(du)) + dkio.WithCompactInterval(c.CompactInterval), + dkio.WithCompactor(false), } dkio.Start(opts...) diff --git a/cmd/datakit/main.go b/cmd/datakit/main.go index 044bfe9605..155b3cfca8 100644 --- a/cmd/datakit/main.go +++ b/cmd/datakit/main.go @@ -29,6 +29,7 @@ import ( "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/gitrepo" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/httpapi" dkio "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io" + "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/dataway" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/metrics" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/ntp" plRemote "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/pipeline/remote" @@ -219,30 +220,29 @@ func startIO() { opts := []dkio.IOOption{ dkio.WithFeederOutputer(dkio.NewDatawayOutput(c.FeedChanSize)), dkio.WithDataway(config.Cfg.Dataway), - dkio.WithMaxCacheCount(c.MaxCacheCount), - dkio.WithDiskCache(c.EnableCache), - dkio.WithDiskCacheSize(c.CacheSizeGB), + dkio.WithCompactAt(c.MaxCacheCount), dkio.WithFilters(c.Filters), - dkio.WithCacheAll(c.CacheAll), - dkio.WithFlushWorkers(c.FlushWorkers), + dkio.WithCompactWorkers(c.CompactWorkers), dkio.WithRecorder(config.Cfg.Recorder), dkio.WithAvailableCPUs(datakit.AvailableCPUs), } - du, err := time.ParseDuration(c.FlushInterval) - if err != nil { - } else { - opts = append(opts, dkio.WithFlushInterval(du)) - } + dkio.Start(opts...) 
+} - du, err = time.ParseDuration(c.CacheCleanInterval) - if err != nil { - l.Warnf("parse CacheCleanInterval failed: %s, use default 5s", err) - } else { - opts = append(opts, dkio.WithDiskCacheCleanInterval(du)) +func startDatawayWorkers() { + dw := config.Cfg.Dataway + + // setup extra options on @dw + if dw.WAL.Workers == 0 { + n := datakit.AvailableCPUs * 2 + l.Infof("set %d flush WAL workers", n) + dataway.WithWALWorkers(n)(dw) } - dkio.Start(opts...) + if err := dw.StartFlushWorkers(); err != nil { + l.Errorf("StartFlushWorkers failed: %s", err) + } } func gc(du time.Duration) { @@ -280,10 +280,12 @@ func doRun() error { } cpuLimit := getCurrentCPULimits() + l.Infof("get limited cpu cores: %f", cpuLimit) if cpuLimit > 1.0 { datakit.AvailableCPUs = int(cpuLimit) } // else datakit.AvailableCPUs default to 1 + startDatawayWorkers() startIO() // start NTP syncer on dataway. diff --git a/cmd/installer/installer/dkconf.go b/cmd/installer/installer/dkconf.go index 017f0e11df..aca3bb3088 100644 --- a/cmd/installer/installer/dkconf.go +++ b/cmd/installer/installer/dkconf.go @@ -70,7 +70,7 @@ var ( EnableInputs, CloudProvider, Proxy, - Dataway string + DatawayURLs string HTTPPublicAPIs string @@ -313,15 +313,15 @@ func preEnableHostobjectInput(cloud string) []byte { func getDataway() (*dataway.Dataway, error) { dw := dataway.NewDefaultDataway() - if Dataway != "" { - dw.URLs = strings.Split(Dataway, ",") + if DatawayURLs != "" { + urls := strings.Split(DatawayURLs, ",") if Proxy != "" { l.Debugf("set proxy to %s", Proxy) dw.HTTPProxy = Proxy } - if err := dw.Init(); err != nil { + if err := dw.Init(dataway.WithURLs(urls...)); err != nil { return nil, err } else { tokens := dw.GetTokens() diff --git a/cmd/installer/installer/install.go b/cmd/installer/installer/install.go index a62143a83c..1868d48a1f 100644 --- a/cmd/installer/installer/install.go +++ b/cmd/installer/installer/install.go @@ -51,14 +51,14 @@ func Install(svc service.Service, userName string) { mc.DatakitUser = userName // prepare dataway info and check token format - if len(Dataway) != 0 { + if len(DatawayURLs) != 0 { mc.Dataway, err = getDataway() if err != nil { l.Errorf("getDataway failed: %s", err.Error()) l.Fatal(err) } - l.Infof("Set dataway to %s", Dataway) + l.Infof("Set dataway to %s", DatawayURLs) mc.Dataway.GlobalCustomerKeys = dataway.ParseGlobalCustomerKeys(SinkerGlobalCustomerKeys) mc.Dataway.EnableSinker = (EnableSinker != "") diff --git a/cmd/installer/installer/upgrade.go b/cmd/installer/installer/upgrade.go index 367572e78c..0824ec4ad8 100644 --- a/cmd/installer/installer/upgrade.go +++ b/cmd/installer/installer/upgrade.go @@ -12,6 +12,7 @@ import ( "github.com/GuanceCloud/cliutils/logger" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/config" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/datakit" + "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/dataway" ) var l = logger.DefaultSLogger("upgrade") @@ -71,6 +72,13 @@ func upgradeMainConfig(c *config.Config) *config.Config { c.Dataway.DeprecatedHTTPTimeout = "" // always remove the config } + + if c.Dataway.MaxRawBodySize >= dataway.DeprecatedDefaultMaxRawBodySize { + l.Infof("to save memory, set max-raw-body-size from %d to %d", + c.Dataway.MaxRawBodySize, dataway.DefaultMaxRawBodySize) + + c.Dataway.MaxRawBodySize = dataway.DefaultMaxRawBodySize + } } l.Infof("Set log to %s", c.Logging.Log) @@ -126,9 +134,9 @@ func upgradeMainConfig(c *config.Config) *config.Config { c.IO.MaxCacheCount = 1000 } - if c.IntervalDeprecated != "" { - 
c.IO.FlushInterval = c.IntervalDeprecated - c.IntervalDeprecated = "" + if c.IntervalDeprecated != time.Duration(0) { + c.IO.CompactInterval = c.IntervalDeprecated + c.IntervalDeprecated = time.Duration(0) } if c.IO.FeedChanSize > 1 { diff --git a/cmd/installer/installer/upgrade_test.go b/cmd/installer/installer/upgrade_test.go index 81e5c6b2c1..de918fa4d0 100644 --- a/cmd/installer/installer/upgrade_test.go +++ b/cmd/installer/installer/upgrade_test.go @@ -13,6 +13,7 @@ import ( "github.com/stretchr/testify/assert" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/config" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/election" + "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/dataway" ) func Test_setupDefaultInputs(t *T.T) { @@ -252,7 +253,7 @@ func TestUpgradeMainConfig(t *T.T) { old: func() *config.Config { c := config.DefaultConfig() c.IOCacheCountDeprecated = 10 - c.IntervalDeprecated = "100s" + c.IntervalDeprecated = 100 * time.Second return c }(), @@ -260,7 +261,7 @@ func TestUpgradeMainConfig(t *T.T) { expect: func() *config.Config { c := config.DefaultConfig() c.IO.MaxCacheCount = 1000 // auto reset to 10000 - c.IO.FlushInterval = "100s" + c.IO.CompactInterval = 100 * time.Second return c }(), @@ -280,6 +281,21 @@ func TestUpgradeMainConfig(t *T.T) { return c }(), }, + + { + name: "set-default-raw-body-size", + old: func() *config.Config { + c := config.DefaultConfig() + c.Dataway.MaxRawBodySize = dataway.DeprecatedDefaultMaxRawBodySize + + return c + }(), + + expect: func() *config.Config { + c := config.DefaultConfig() + return c + }(), + }, } for _, tc := range cases { diff --git a/cmd/installer/main.go b/cmd/installer/main.go index 946eac7a10..9aada1e1ac 100644 --- a/cmd/installer/main.go +++ b/cmd/installer/main.go @@ -131,7 +131,7 @@ func init() { // flag.StringVar(&flagAPMInstrumentationLibraries, "apm-instrumentation-libraries", "datadog|java,python", // "install and use the APM library of the specified provider") - flag.StringVar(&installer.Dataway, "dataway", "", "DataWay host(https://guance.openway.com?token=xxx)") + flag.StringVar(&installer.DatawayURLs, "dataway", "", "DataWay host(https://guance.openway.com?token=xxx)") flag.StringVar(&installer.Proxy, "proxy", "", "http proxy http://ip:port for datakit") flag.StringVar(&installer.DatakitName, "name", "", "specify DataKit name, example: prod-env-datakit") flag.StringVar(&installer.EnableInputs, "enable-inputs", "", "default enable inputs(comma splited, example:cpu,mem,disk)") @@ -542,12 +542,12 @@ __downloadOK: setupUserGroup(userName, userName) if flagInstallOnly != 0 { - l.Warnf("Only install service %q, NOT started", dkservice.Name) + l.Warnf("Only install service %q, NOT started", dkservice.Name()) } else { if err = service.Control(svc, "start"); err != nil { - l.Warnf("Start service %q failed: %s", dkservice.Name, err.Error()) + l.Warnf("Start service %q failed: %s", dkservice.Name(), err.Error()) } else { - l.Infof("Starting service %q ok", dkservice.Name) + l.Infof("Starting service %q ok", dkservice.Name()) } } diff --git a/go.mod b/go.mod index 757e49c395..fd713ddd99 100644 --- a/go.mod +++ b/go.mod @@ -348,7 +348,7 @@ require ( require ( github.com/DataDog/ebpf-manager v0.2.16 - github.com/GuanceCloud/cliutils v1.1.22-0.20240930074036-255c78c086fd + github.com/GuanceCloud/cliutils v1.1.22-0.20241018104846-17e816f0e123 github.com/GuanceCloud/kubernetes v0.0.0-20230801080916-ca299820872b github.com/GuanceCloud/zipstream v0.1.0 // indirect github.com/andrewkroh/sys 
v0.0.0-20151128191922-287798fe3e43 diff --git a/go.sum b/go.sum index cb85a381fb..1eb10eb729 100644 --- a/go.sum +++ b/go.sum @@ -145,6 +145,8 @@ github.com/GuanceCloud/client_model v0.0.0-20230418154757-93bd4e878a5e h1:i34dA4 github.com/GuanceCloud/client_model v0.0.0-20230418154757-93bd4e878a5e/go.mod h1:PMnE48aPzuRu83FmWZugC0O3d54ZupJd/MmiaYxz8sM= github.com/GuanceCloud/cliutils v1.1.22-0.20240930074036-255c78c086fd h1:KxbB1a1NybivPLnI+xVcR0WPPXlI1+jCyCmPMJ5LnpE= github.com/GuanceCloud/cliutils v1.1.22-0.20240930074036-255c78c086fd/go.mod h1:5bIAZ9yA6l7W8MMUKw0+SIZJRpmEwxM6ZYLy4vweTgU= +github.com/GuanceCloud/cliutils v1.1.22-0.20241018104846-17e816f0e123 h1:CigTx24h5Lc/49Zghr8d70jrMeoEJ9tJXUE/79uzbQ0= +github.com/GuanceCloud/cliutils v1.1.22-0.20241018104846-17e816f0e123/go.mod h1:5bIAZ9yA6l7W8MMUKw0+SIZJRpmEwxM6ZYLy4vweTgU= github.com/GuanceCloud/confd v0.1.101 h1:yjHgfl6YzAlTbFOFMTE4ERpFJzIyovOW7ZFc2/ZssL0= github.com/GuanceCloud/confd v0.1.101/go.mod h1:o0opIwOX+yNwV9nh56x5ymFMJ+YBD8JuPxBJ7a1mEmo= github.com/GuanceCloud/dockertest/v3 v3.9.4 h1:ScSNhfA2HSNLfrYoNd1KSRxkrymlKiBE60g4f6eUoOk= diff --git a/internal/cmds/debug_bugreport.go b/internal/cmds/debug_bugreport.go index 8e67fea695..b1e5e35317 100644 --- a/internal/cmds/debug_bugreport.go +++ b/internal/cmds/debug_bugreport.go @@ -658,6 +658,12 @@ func (info *datakitInfo) compressDir() (string, error) { date := time.Now().UnixMilli() fileName := fmt.Sprintf("info-%d", date) zipPath := fmt.Sprintf("%s.zip", fileName) + + if *flagDebugBugreportTag != "" { + fileName = fmt.Sprintf("%s-info-%d", *flagDebugBugreportTag, date) + zipPath = fmt.Sprintf("%s.zip", fileName) + } + // Open a file to write the compressed data to zipFile, err := os.Create(filepath.Clean(zipPath)) if err != nil { diff --git a/internal/cmds/debug_input.go b/internal/cmds/debug_input.go index 8d18fcbdb9..60d61ec339 100644 --- a/internal/cmds/debug_input.go +++ b/internal/cmds/debug_input.go @@ -29,7 +29,7 @@ func debugInput(conf string) error { dkio.Start(dkio.WithFeederOutputer(dkio.NewDebugOutput()), // disable filter and consumer, the debug output not implemented the Reader() dkio.WithFilter(false), - dkio.WithConsumer(false)) + dkio.WithCompactor(false)) loadedInputs, err := config.LoadSingleConfFile(conf, inputs.Inputs, false) if err != nil { diff --git a/internal/cmds/debug_upload_log.go b/internal/cmds/debug_upload_log.go index cc59f64156..b104d2c9d1 100644 --- a/internal/cmds/debug_upload_log.go +++ b/internal/cmds/debug_upload_log.go @@ -23,7 +23,8 @@ type successRes struct { } func uploadLog(urls []string) error { - dw := dataway.Dataway{URLs: urls} + dw := dataway.NewDefaultDataway() + dw.URLs = urls if config.Cfg.Dataway != nil { if len(config.Cfg.Dataway.HTTPProxy) > 0 { diff --git a/internal/cmds/import.go b/internal/cmds/import.go index 7e78206668..3844247931 100644 --- a/internal/cmds/import.go +++ b/internal/cmds/import.go @@ -172,7 +172,11 @@ func setupUploader() (uploader, error) { } u := &uploaderImpl{ - dw: &dataway.Dataway{URLs: dwURLS}, + dw: func() *dataway.Dataway { + x := dataway.NewDefaultDataway() + x.URLs = dwURLS + return x + }(), } if err := u.dw.Init(); err != nil { diff --git a/internal/cmds/monitor.go b/internal/cmds/monitor.go index fe54ba53d7..f59caba073 100644 --- a/internal/cmds/monitor.go +++ b/internal/cmds/monitor.go @@ -15,15 +15,16 @@ import ( ) var moduleMap = map[string]string{ - "G": "goroutine", - "B": "basic", - "R": "runtime", - "F": "filter", - "H": "http", - "In": "inputs", - "P": "pipeline", - "IO": "io_stats", - 
"W": "dataway", + "G": "goroutine", + "B": "basic", + "R": "runtime", + "F": "filter", + "H": "http", + "In": "inputs", + "P": "pipeline", + "IO": "io_stats", + "W": "dataway", + "WAL": "wal", } // loadLocalDatakitConf try to find where local datakit listen. diff --git a/internal/cmds/parse_flags.go b/internal/cmds/parse_flags.go index 0f68a1d148..bc3cd9a181 100644 --- a/internal/cmds/parse_flags.go +++ b/internal/cmds/parse_flags.go @@ -197,6 +197,7 @@ var ( flagDebugBugreportOSS = fsDebug.String("oss", "", "upload bug report file to specified object storage(format host:bucket:ak:sk)") flagDebugBugreportDisableProfile = fsDebug.Bool("disable-profile", false, "disable profile collection when running bug-report") flagDebugBugreportNMetrics = fsDebug.Int("nmetrics", 3, "collect N batch of datakit metrics") + flagDebugBugreportTag = fsDebug.String("tag", "", "ping a tag to current bug report") flagDebugInputConf = fsDebug.String("input-conf", "", "input TOML conf path") flagDebugHTTPListen = fsDebug.String("http-listen", "", "setup HTTP server on debugging some inputs(such as some Trace/RUM/...)") diff --git a/internal/config/conf_test.go b/internal/config/conf_test.go index 8b5ca6f8b8..e27fd0a8c4 100644 --- a/internal/config/conf_test.go +++ b/internal/config/conf_test.go @@ -24,15 +24,15 @@ func TestDefaultMainConf(t *testing.T) { c := DefaultConfig() c.Ulimit = 0 // ulimit diff among OS platforms - def := DefaultConfig() - _, err := bstoml.Decode(datakit.DatakitConfSample, &def) + x := DefaultConfig() + _, err := bstoml.Decode(datakit.DatakitConfSample, &x) require.NoError(t, err) - def.DefaultEnabledInputs = def.DefaultEnabledInputs[:0] // clear - def.GlobalHostTags = map[string]string{} // clear: host tags setted on default conf sample - def.Ulimit = 0 + x.DefaultEnabledInputs = x.DefaultEnabledInputs[:0] // clear + x.GlobalHostTags = map[string]string{} // clear: host tags setted on default conf sample + x.Ulimit = 0 - assert.Equal(t, c.String(), def.String()) + assert.Equal(t, c.String(), x.String()) } func TestEnableDefaultsInputs(t *testing.T) { @@ -407,9 +407,11 @@ func Test_setupDataway(t *testing.T) { }{ { name: "check_dev_null", - dw: &dataway.Dataway{ - URLs: []string{datakit.DatawayDisableURL}, - }, + dw: func() *dataway.Dataway { + x := dataway.NewDefaultDataway() + x.URLs = []string{datakit.DatawayDisableURL} + return x + }(), }, } diff --git a/internal/config/env.go b/internal/config/env.go index 9bc844fa29..53a04773eb 100644 --- a/internal/config/env.go +++ b/internal/config/env.go @@ -243,15 +243,16 @@ func (c *Config) loadPipelineEnvs() { } func (c *Config) loadPointPoolEnvs() { - if v := datakit.GetEnv("ENV_ENABLE_POINT_POOL"); v != "" { - c.PointPool.Enable = true + if v := datakit.GetEnv("ENV_DISABLE_POINT_POOL"); v != "" { + l.Warn("point pool disabled, this may cost too many memory") + c.PointPool.Enable = false + } - if v := datakit.GetEnv("ENV_POINT_POOL_RESERVED_CAPACITY"); v != "" { - if i, err := strconv.ParseInt(v, 10, 64); err == nil { - c.PointPool.ReservedCapacity = i - } else { - l.Warnf("invalid ENV_POINT_POOL_RESERVED_CAPACITY: %s, use default %d", v, c.PointPool.ReservedCapacity) - } + if v := datakit.GetEnv("ENV_POINT_POOL_RESERVED_CAPACITY"); v != "" { + if i, err := strconv.ParseInt(v, 10, 64); err == nil { + c.PointPool.ReservedCapacity = i + } else { + l.Warnf("invalid ENV_POINT_POOL_RESERVED_CAPACITY: %s, use default %d", v, c.PointPool.ReservedCapacity) } } } @@ -403,6 +404,47 @@ func (c *Config) loadDatawayEnvs() { } } } + + // WAL + if 
c.Dataway.WAL != nil { + if v := datakit.GetEnv("ENV_DATAWAY_WAL_CAPACITY"); v != "" { + if x, err := strconv.ParseFloat(v, 64); err != nil { + l.Warnf("invalid ENV_DATAWAY_WAL_CAPACITY, expect int or float, got %s, ignored", v) + } else { + c.Dataway.WAL.MaxCapacityGB = x + } + } + + if v := datakit.GetEnv("ENV_DATAWAY_WAL_WORKERS"); v != "" { + if x, err := strconv.ParseInt(v, 10, 64); err != nil { + l.Warnf("invalid ENV_DATAWAY_WAL_WORKERS, expect int, got %s, ignored", v) + } else { + c.Dataway.WAL.Workers = int(x) + } + } + + if v := datakit.GetEnv("ENV_DATAWAY_WAL_MEM_CAPACITY"); v != "" { + if x, err := strconv.ParseInt(v, 10, 64); err != nil { + l.Warnf("invalid ENV_DATAWAY_WAL_MEM_CAPACITY, expect int, got %s, ignored", v) + } else { + c.Dataway.WAL.MemCap = int(x) + } + } + + if v := datakit.GetEnv("ENV_DATAWAY_WAL_PATH"); v != "" { + c.Dataway.WAL.Path = v + } + + if v := datakit.GetEnv("ENV_DATAWAY_WAL_FAIL_CACHE_CLEAN_INTERVAL"); v != "" { + if x, err := time.ParseDuration(v); err != nil { + l.Warnf("invalid ENV_DATAWAY_WAL_FAIL_CACHE_CLEAN_INTERVAL, expect duration, got %s, ignored", v) + } else { + c.Dataway.WAL.FailCacheCleanInterval = x + } + } + } else { + l.Errorf("WAL not set, should not been here") + } } func (c *Config) loadElectionEnvs() { @@ -489,23 +531,13 @@ func (c *Config) loadIOEnvs() { } } - if v := datakit.GetEnv("ENV_IO_ENABLE_CACHE"); v != "" { - l.Info("ENV_IO_ENABLE_CACHE enabled") - c.IO.EnableCache = true - } - - if v := datakit.GetEnv("ENV_IO_CACHE_ALL"); v != "" { - l.Info("ENV_IO_CACHE_ALL enabled") - c.IO.CacheAll = true - } - - if v := datakit.GetEnv("ENV_IO_CACHE_MAX_SIZE_GB"); v != "" { - val, err := strconv.ParseInt(v, 10, 64) + if v := datakit.GetEnv("ENV_IO_FLUSH_WORKERS"); v != "" { + n, err := strconv.ParseInt(v, 10, 64) if err != nil { - l.Warnf("invalid env key ENV_IO_CACHE_MAX_SIZE_GB, value %s, err: %s ignored", v, err) + l.Warnf("invalid env key ENV_IO_FLUSH_WORKERS, value %s, err: %s ignored", v, err) } else { - l.Infof("set ENV_IO_CACHE_MAX_SIZE_GB to %d", val) - c.IO.CacheSizeGB = int(val) + l.Infof("set ENV_IO_FLUSH_WORKERS to %d", n) + c.IO.CompactWorkers = int(n) } } @@ -515,17 +547,7 @@ func (c *Config) loadIOEnvs() { l.Warnf("invalid env key ENV_IO_FLUSH_INTERVAL, value %s, err: %s ignored", v, err) } else { l.Infof("set ENV_IO_FLUSH_INTERVAL to %s", du) - c.IO.FlushInterval = v - } - } - - if v := datakit.GetEnv("ENV_IO_FLUSH_WORKERS"); v != "" { - n, err := strconv.ParseInt(v, 10, 64) - if err != nil { - l.Warnf("invalid env key ENV_IO_FLUSH_WORKERS, value %s, err: %s ignored", v, err) - } else { - l.Infof("set ENV_IO_FLUSH_WORKERS to %d", n) - c.IO.FlushWorkers = int(n) + c.IO.CompactInterval = du } } @@ -539,16 +561,6 @@ func (c *Config) loadIOEnvs() { } } - if v := datakit.GetEnv("ENV_IO_CACHE_CLEAN_INTERVAL"); v != "" { - du, err := time.ParseDuration(v) - if err != nil { - l.Warnf("invalid env key ENV_IO_CACHE_CLEAN_INTERVAL, value %s, err: %s ignored", v, err) - } else { - l.Infof("set ENV_IO_CACHE_CLEAN_INTERVAL to %s", du) - c.IO.CacheCleanInterval = v - } - } - // filters if v := datakit.GetEnv("ENV_IO_FILTERS"); v != "" { var x map[string]filter.FilterConditions diff --git a/internal/config/env_test.go b/internal/config/env_test.go index 61838e3fa4..bd7c48e6a2 100644 --- a/internal/config/env_test.go +++ b/internal/config/env_test.go @@ -287,7 +287,7 @@ func TestLoadEnv(t *testing.T) { }, expect: func() *Config { cfg := DefaultConfig() - cfg.HTTPAPI.RequestRateLimit = 0 + cfg.HTTPAPI.RequestRateLimit = 20.0 return 
cfg }(), }, @@ -487,13 +487,9 @@ func TestLoadEnv(t *testing.T) { cfg.IO.FeedChanSize = 1 // force reset to 1 cfg.IO.MaxCacheCount = 8192 - cfg.IO.EnableCache = true cfg.IO.FeedChanSize = 123 - cfg.IO.CacheSizeGB = 8 - cfg.IO.FlushInterval = "2s" - cfg.IO.FlushWorkers = 1 - cfg.IO.CacheCleanInterval = "100s" - cfg.IO.CacheAll = true + cfg.IO.CompactInterval = 2 * time.Second + cfg.IO.CompactWorkers = 1 return cfg }(), @@ -542,6 +538,22 @@ func TestLoadEnv(t *testing.T) { return cfg }(), }, + + { + name: "test-point-pool", + envs: map[string]string{ + "ENV_POINT_POOL_RESERVED_CAPACITY": "12345", + "ENV_DISABLE_POINT_POOL": "yes", + }, + + expect: func() *Config { + cfg := DefaultConfig() + cfg.PointPool.Enable = false + cfg.PointPool.ReservedCapacity = 12345 + + return cfg + }(), + }, } for _, tc := range cases { diff --git a/internal/config/httpapi.go b/internal/config/httpapi.go index 73179ed3b7..1e05d46929 100644 --- a/internal/config/httpapi.go +++ b/internal/config/httpapi.go @@ -21,7 +21,7 @@ type APIConfig struct { Disable404Page bool `toml:"disable_404page"` RUMAppIDWhiteList []string `toml:"rum_app_id_white_list"` PublicAPIs []string `toml:"public_apis"` - RequestRateLimit float64 `toml:"request_rate_limit,omitzero"` + RequestRateLimit float64 `toml:"request_rate_limit"` Timeout string `toml:"timeout"` CloseIdleConnection bool `toml:"close_idle_connection"` TLSConf *TLSConfig `toml:"tls"` diff --git a/internal/config/mainconf.go b/internal/config/mainconf.go index bb1b00808c..a5e09c0ad0 100644 --- a/internal/config/mainconf.go +++ b/internal/config/mainconf.go @@ -44,9 +44,9 @@ type Config struct { HTTPBindDeprecated string `toml:"http_server_addr,omitempty"` HTTPListenDeprecated string `toml:"http_listen,omitempty"` - IntervalDeprecated string `toml:"interval,omitempty"` - OutputFileDeprecated string `toml:"output_file,omitempty"` - UUIDDeprecated string `toml:"uuid,omitempty"` // deprecated + IntervalDeprecated time.Duration `toml:"interval,omitempty"` + OutputFileDeprecated string `toml:"output_file,omitempty"` + UUIDDeprecated string `toml:"uuid,omitempty"` PointPool *pointPool `toml:"point_pool"` @@ -124,7 +124,7 @@ func DefaultConfig() *Config { c := &Config{ //nolint:dupl DefaultEnabledInputs: []string{}, PointPool: &pointPool{ - Enable: false, + Enable: true, ReservedCapacity: 4096, }, @@ -149,14 +149,9 @@ func DefaultConfig() *Config { }, // default nothing IO: &io.IOConf{ - FeedChanSize: 1, - MaxCacheCount: 1000, - FlushInterval: "10s", - - // Enable disk cache on datakit send fail. 
- EnableCache: false, - CacheSizeGB: 10, - CacheCleanInterval: "5s", + FeedChanSize: 1, + MaxCacheCount: 1000, + CompactInterval: time.Second * 10, Filters: nil, }, @@ -181,6 +176,7 @@ func DefaultConfig() *Config { Listen: "localhost:9529", RUMAppIDWhiteList: []string{}, PublicAPIs: []string{}, + RequestRateLimit: 20, Timeout: "30s", CloseIdleConnection: false, TLSConf: &TLSConfig{}, diff --git a/internal/datakit/datakit.go b/internal/datakit/datakit.go index fe4f7b9cdd..6dfaa83622 100644 --- a/internal/datakit/datakit.go +++ b/internal/datakit/datakit.go @@ -283,6 +283,8 @@ func CategoryDirName() map[string]string { func SetWorkDir(dir string) { InstallDir = dir + l.Infof("set workdir to %q", dir) + DataDir = filepath.Join(InstallDir, "data") DataRUMDir = filepath.Join(DataDir, "rum") ConfdDir = filepath.Join(InstallDir, StrConfD) diff --git a/internal/datakit/dkconf.go b/internal/datakit/dkconf.go index c372afd853..29734a0455 100644 --- a/internal/datakit/dkconf.go +++ b/internal/datakit/dkconf.go @@ -42,10 +42,10 @@ datakit_user = "root" ulimit = 64000 ################################################ -# point_pool: use point pool for better memory usage(Experimental) +# point_pool: use point pool for better memory usage ################################################ [point_pool] - enable = false + enable = true reserved_capacity = 4096 ################################################ @@ -124,6 +124,9 @@ ulimit = 64000 timeout = "30s" close_idle_connection = false + # API rate limit(QPS) + request_rate_limit = 20.0 + # # RUM related: we should port these configures to RUM inputs(TODO) # @@ -212,14 +215,16 @@ ulimit = 64000 [dataway] # urls: Dataway URL list # NOTE: do not configure multiple URLs here, it's a deprecated feature. - urls = ["https://openway.guance.com?token=tkn_xxxxxxxxxxx"] + urls = [ + # "https://openway.guance.com?token=" + ] # Dataway HTTP timeout timeout_v2 = "30s" # max_retry_count specifies at most how many times the data sending operation will be tried when it fails, # valid minimum value is 1 (NOT 0) and maximum value is 10. - max_retry_count = 4 + max_retry_count = 1 # The interval between two retry operation, valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h" retry_delay = "1s" @@ -242,7 +247,7 @@ ulimit = 64000 # do NOT disable gzip or your get large network payload. gzip = true - max_raw_body_size = 10485760 # max body size(before gizp) in bytes + max_raw_body_size = 1048576 # max body size(before gizp) in bytes # Customer tag or field keys that will extract from exist points # to build the X-Global-Tags HTTP header value. @@ -258,6 +263,14 @@ ulimit = 64000 # NOTE: diff MUST larger than "1s" diff = "30s" + # WAL queue for uploading points + [dataway.wal] + max_capacity_gb = 2.0 # 2GB reserved disk space for each category(M/L/O/T/...) 
+ #workers = 4 # flush workers on WAL(default to CPU limited cores) + #mem_cap = 4 # in-memory queue capacity(default to CPU limited cores) + #fail_cache_clean_interval = "30s" # duration for clean fail uploaded data + + ################################################ # Datakit logging configure ################################################ diff --git a/internal/export/doc/en/datakit-conf.md b/internal/export/doc/en/datakit-conf.md index 185bd8b0ff..aab31ab4c6 100644 --- a/internal/export/doc/en/datakit-conf.md +++ b/internal/export/doc/en/datakit-conf.md @@ -59,12 +59,14 @@ DataKit opens an HTTP service to receive external data or provide basic data ser After the configuration is complete, you can use the `curl` command to test whether the configuration is successful: `sudo curl --no-buffer -XGET --unix-socket /tmp/datakit.sock http:/localhost/v1/ping`. For more information on the test commands for `curl`, see [here](https://superuser.com/a/925610){:target="_blank"}. ### HTTP Request Frequency Control {#set-http-api-limit} + + > [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) default enabled this limit. As DataKit needs to receive a large number of external data writes, in order to avoid causing huge overhead to the host node, the following HTTP configuration can be modified (it is not turned on by default): ```toml [http_api] - request_rate_limit = 1000.0 # Limit ingeach HTTP API to receive only 1000 requests per second + request_rate_limit = 20.0 # Limit HTTP request(client IP + route) QPS ``` ### Other Settings {#http-other-settings} diff --git a/internal/export/doc/en/datakit-metrics.md b/internal/export/doc/en/datakit-metrics.md index 069cfdc904..381c813ffa 100644 --- a/internal/export/doc/en/datakit-metrics.md +++ b/internal/export/doc/en/datakit-metrics.md @@ -22,175 +22,187 @@ datakit_cpu_usage 4.9920266849857144 We can also playing other metrics too(change the `grep` string), all available metrics list below(current Datakit version is {{ .Version }}): -|TYPE|NAME|LABELS|HELP| -|---|---|---|---| -|GAUGE|`datakit_config_datakit_ulimit`|`status`|Datakit ulimit| -|COUNTER|`datakit_dns_domain_total`|`N/A`|DNS watched domain counter| -|COUNTER|`datakit_dns_ip_updated_total`|`domain`|Domain IP updated counter| -|COUNTER|`datakit_dns_watch_run_total`|`interval`|Watch run counter| -|SUMMARY|`datakit_dns_cost_seconds`|`domain,status`|DNS IP lookup cost| -|COUNTER|`datakit_election_pause_total`|`id,namespace`|Input paused count when election failed| -|COUNTER|`datakit_election_resume_total`|`id,namespace`|Input resume count when election OK| -|GAUGE|`datakit_election_status`|`elected_id,id,namespace,status`|Datakit election status, if metric = 0, meas not elected, or the elected time(unix timestamp second)| -|GAUGE|`datakit_election_inputs`|`namespace`|Datakit election input count| -|SUMMARY|`datakit_election_seconds`|`namespace,status`|Election latency| -|GAUGE|`datakit_goroutine_alive`|`name`|Alive Goroutine count| -|COUNTER|`datakit_goroutine_recover_total`|`name`|Recovered Goroutine count| -|COUNTER|`datakit_goroutine_stopped_total`|`name`|Stopped Goroutine count| -|COUNTER|`datakit_goroutine_crashed_total`|`name`|Crashed goroutines count| -|GAUGE|`datakit_goroutine_groups`|`N/A`|Goroutine group count| -|SUMMARY|`datakit_goroutine_cost_seconds`|`name`|Goroutine running duration| -|SUMMARY|`datakit_http_api_elapsed_seconds`|`api,method,status`|API request cost| -|SUMMARY|`datakit_http_api_req_size_bytes`|`api,method,status`|API request body size| 
-|COUNTER|`datakit_http_api_total`|`api,method,status`|API request counter| -|GAUGE|`datakit_http_api_global_tags_last_updated`|`api,method,status`|Global tag updated timestamp, in second| -|SUMMARY|`datakit_httpcli_got_first_resp_byte_cost_seconds`|`from`|Got first response byte cost| -|COUNTER|`datakit_httpcli_tcp_conn_total`|`from,remote,type`|HTTP TCP connection count| -|COUNTER|`datakit_httpcli_conn_reused_from_idle_total`|`from`|HTTP connection reused from idle count| -|SUMMARY|`datakit_httpcli_conn_idle_time_seconds`|`from`|HTTP connection idle time| -|SUMMARY|`datakit_httpcli_dns_cost_seconds`|`from`|HTTP DNS cost| -|SUMMARY|`datakit_httpcli_tls_handshake_seconds`|`from`|HTTP TLS handshake cost| -|SUMMARY|`datakit_httpcli_http_connect_cost_seconds`|`from`|HTTP connect cost| -|SUMMARY|`datakit_io_build_body_cost_seconds`|`category,encoding`|Build point HTTP body cost| -|SUMMARY|`datakit_io_build_body_batches`|`category,encoding`|Batch HTTP body batches| -|SUMMARY|`datakit_io_build_body_batch_bytes`|`category,encoding,gzip`|Batch HTTP body size| -|SUMMARY|`datakit_io_build_body_batch_points`|`category,encoding,gzip`|Batch HTTP body points| -|COUNTER|`datakit_io_dataway_point_total`|`category,status`|Dataway uploaded points, partitioned by category and send status(HTTP status)| -|COUNTER|`datakit_io_dataway_point_bytes_total`|`category,enc,status`|Dataway uploaded points bytes, partitioned by category and pint send status(HTTP status)| -|COUNTER|`datakit_io_dataway_http_drop_point_total`|`category,error`|Dataway write drop points| -|SUMMARY|`datakit_io_dataway_api_latency_seconds`|`api,status`|Dataway HTTP request latency partitioned by HTTP API(method@url) and HTTP status| -|COUNTER|`datakit_io_http_retry_total`|`api,status`|Dataway HTTP retried count| -|SUMMARY|`datakit_io_grouped_request`|`category`|Grouped requests under sinker| -|SUMMARY|`datakit_io_flush_failcache_bytes`|`category`|IO flush fail-cache bytes(in gzip) summary| -|COUNTER|`datakit_filter_update_total`|`N/A`|Filters(remote) updated count| -|GAUGE|`datakit_filter_last_update_timestamp_seconds`|`N/A`|Filter last update time| -|COUNTER|`datakit_filter_point_total`|`category,filters,source`|Filter points of filters| -|GAUGE|`datakit_filter_parse_error`|`error,filters`|Filter parse error| -|COUNTER|`datakit_filter_point_dropped_total`|`category,filters,source`|Dropped points of filters| -|SUMMARY|`datakit_filter_pull_latency_seconds`|`status`|Filter pull(remote) latency| -|SUMMARY|`datakit_filter_latency_seconds`|`category,filters,source`|Filter latency of these filters| -|GAUGE|`datakit_io_queue_points`|`category`|IO module queued(cached) points| -|COUNTER|`datakit_io_input_filter_point_total`|`name,category`|Input filtered point total| -|COUNTER|`datakit_io_feed_total`|`name,category`|Input feed total| -|GAUGE|`datakit_io_last_feed_timestamp_seconds`|`name,category`|Input last feed time(according to Datakit local time)| -|SUMMARY|`datakit_input_collect_latency_seconds`|`name,category`|Input collect latency| -|GAUGE|`datakit_io_chan_usage`|`category`|IO channel usage(length of the channel)| -|GAUGE|`datakit_io_chan_capacity`|`category`|IO channel capacity| -|SUMMARY|`datakit_io_feed_cost_seconds`|`category,from`|IO feed waiting(on block mode) seconds| -|SUMMARY|`datakit_io_feed_point`|`name,category`|Input feed point| -|GAUGE|`datakit_io_flush_workers`|`category`|IO flush workers| -|COUNTER|`datakit_io_flush_total`|`category`|IO flush total| -|COUNTER|`datakit_error_total`|`source,category`|Total errors, only count on 
error source, not include error message| -|GAUGE|`datakit_goroutines`|`N/A`|Goroutine count within Datakit| -|GAUGE|`datakit_heap_alloc_bytes`|`N/A`|Datakit memory heap bytes| -|GAUGE|`datakit_sys_alloc_bytes`|`N/A`|Datakit memory system bytes| -|GAUGE|`datakit_cpu_usage`|`N/A`|Datakit CPU usage(%)| -|GAUGE|`datakit_open_files`|`N/A`|Datakit open files(only available on Linux)| -|GAUGE|`datakit_cpu_cores`|`N/A`|Datakit CPU cores| -|GAUGE|`datakit_uptime_seconds`|`auto_update,docker,hostname,lite,elinker,resource_limit,version=?,build_at=?,branch=?,os_arch=?`|Datakit uptime| -|GAUGE|`datakit_data_overuse`|`N/A`|Does current workspace's data(metric/logging) usage(if 0 not beyond, or with a unix timestamp when overuse occurred)| -|COUNTER|`datakit_process_ctx_switch_total`|`type`|Datakit process context switch count(Linux only)| -|COUNTER|`datakit_process_io_count_total`|`type`|Datakit process IO count| -|COUNTER|`datakit_process_io_bytes_total`|`type`|Datakit process IO bytes count| -|COUNTER|`datakit_pipeline_offload_point_total`|`category,exporter,remote`|Pipeline offload processed total points| -|COUNTER|`datakit_pipeline_offload_error_point_total`|`category,exporter,remote`|Pipeline offload processed total error points| -|SUMMARY|`datakit_pipeline_offload_cost_seconds`|`category,exporter,remote`|Pipeline offload total cost| -|GAUGE|`datakit_input_container_kubernetes_fetch_error`|`namespace,resource,error`|Kubernetes resource fetch error| -|SUMMARY|`datakit_input_container_kubernetes_collect_cost_seconds`|`category`|Kubernetes collect cost| -|SUMMARY|`datakit_input_container_kubernetes_collect_resource_cost_seconds`|`category,kind,fieldselector`|Kubernetes collect resource cost| -|COUNTER|`datakit_input_container_kubernetes_collect_pts_total`|`category`|Kubernetes collect point total| -|COUNTER|`datakit_input_container_kubernetes_pod_metrics_query_total`|`target`|Kubernetes query pod metrics count| -|SUMMARY|`datakit_input_container_collect_cost_seconds`|`category`|Container collect cost| -|COUNTER|`datakit_input_container_collect_pts_total`|`category`|Container collect point total| -|SUMMARY|`datakit_input_container_total_collect_cost_seconds`|`category`|Total container collect cost| -|SUMMARY|`datakit_dialtesting_task_run_cost_seconds`|`region,protocol`|Task run time| -|SUMMARY|`datakit_dialtesting_task_exec_time_interval_seconds`|`region,protocol`|Task execution time interval| -|GAUGE|`datakit_dialtesting_worker_job_chan_number`|`type`|The number of the channel for the jobs| -|GAUGE|`datakit_dialtesting_worker_job_number`|`N/A`|The number of the jobs to send data in parallel| -|GAUGE|`datakit_dialtesting_worker_cached_points_number`|`region,protocol`|The number of cached points| -|GAUGE|`datakit_dialtesting_worker_send_points_number`|`region,protocol,status`|The number of the points which have been sent| -|SUMMARY|`datakit_dialtesting_worker_send_cost_seconds`|`region,protocol`|Time cost to send points| -|GAUGE|`datakit_dialtesting_task_number`|`region,protocol`|The number of tasks| -|GAUGE|`datakit_dialtesting_dataway_send_failed_number`|`region,protocol,dataway`|The number of failed sending for each Dataway| -|SUMMARY|`datakit_dialtesting_pull_cost_seconds`|`region,is_first`|Time cost to pull tasks| -|COUNTER|`datakit_dialtesting_task_synchronized_total`|`region,protocol`|Task synchronized number| -|COUNTER|`datakit_dialtesting_task_invalid_total`|`region,protocol,fail_reason`|Invalid task number| -|SUMMARY|`datakit_dialtesting_task_check_cost_seconds`|`region,protocol,status`|Task 
check time| -|COUNTER|`datakit_input_kafkamq_consumer_message_total`|`topic,partition,status`|Kafka consumer message numbers from Datakit start| -|COUNTER|`datakit_input_kafkamq_group_election_total`|`N/A`|Kafka group election count| -|SUMMARY|`datakit_input_kafkamq_process_message_nano`|`topic`|kafkamq process message nanoseconds duration| -|COUNTER|`datakit_input_kubernetesprometheus_resource_collect_pts_total`|`role,name`|The number of the points which have been sent| -|GAUGE|`datakit_input_kubernetesprometheus_forked_worker_number`|`role,name`|The number of the worker| -|GAUGE|`datakit_inputs_instance`|`input`|Input instance count| -|COUNTER|`datakit_inputs_crash_total`|`input`|Input crash count| -|GAUGE|`datakit_input_ploffload_chan_capacity`|`channel_name`|PlOffload channel capacity| -|GAUGE|`datakit_input_ploffload_chan_usage`|`channel_name`|PlOffload channel usage| -|COUNTER|`datakit_input_ploffload_point_total`|`category`|PlOffload processed total points| -|SUMMARY|`datakit_input_promremote_collect_points`|`source`|Total number of promremote collection points| -|SUMMARY|`datakit_input_promremote_time_diff_in_second`|`source`|Time diff with local time| -|COUNTER|`datakit_input_promremote_no_time_points_total`|`source`|Total number of promremote collection no time points| -|GAUGE|`api_elapsed_seconds`|`N/A`|Proxied API elapsed seconds| -|COUNTER|`api_post_bytes_total`|`api,status`|Proxied API post bytes total| -|SUMMARY|`api_latency_seconds`|`api,status`|Proxied API latency| -|COUNTER|`datakit_input_proxy_connect_total`|`client_ip`|Proxy connect(method CONNECT)| -|COUNTER|`datakit_input_proxy_api_total`|`api,method`|Proxy API total| -|SUMMARY|`datakit_input_proxy_api_latency_seconds`|`api,method,status`|Proxy API latency| -|COUNTER|`datakit_input_rum_session_replay_drop_total`|`app_id,env,version,service`|statistics the total count of session replay points which have been filtered by rules| -|COUNTER|`datakit_input_rum_session_replay_drop_bytes_total`|`app_id,env,version,service`|statistics the total bytes of session replay points which have been filtered by rules| -|COUNTER|`datakit_input_rum_locate_statistics_total`|`app_id,ip_status,locate_status`|locate by ip addr statistics| -|COUNTER|`datakit_input_rum_source_map_total`|`app_id,sdk_name,status,remark`|source map result statistics| -|GAUGE|`datakit_input_rum_loaded_zips`|`platform`|RUM source map currently loaded zip archive count| -|SUMMARY|`datakit_input_rum_source_map_duration_seconds`|`sdk_name,app_id,env,version`|statistics elapsed time in RUM source map(unit: second)| -|SUMMARY|`datakit_input_rum_session_replay_upload_latency_seconds`|`app_id,env,version,service,status_code`|statistics elapsed time in session replay uploading| -|COUNTER|`datakit_input_rum_session_replay_upload_failure_total`|`app_id,env,version,service,status_code`|statistics count of session replay points which which have unsuccessfully uploaded| -|COUNTER|`datakit_input_rum_session_replay_upload_failure_bytes_total`|`app_id,env,version,service,status_code`|statistics the total bytes of session replay points which have unsuccessfully uploaded| -|SUMMARY|`datakit_input_rum_session_replay_read_body_delay_seconds`|`app_id,env,version,service`|statistics the duration of reading session replay body| -|SUMMARY|`datakit_input_snmp_discovery_cost`|`profile_type`|Discovery cost(in second)| -|SUMMARY|`datakit_input_snmp_collect_cost`|`N/A`|Every loop collect cost(in second)| -|SUMMARY|`datakit_input_snmp_device_collect_cost`|`class`|Device collect cost(in second)| 
-|GAUGE|`datakit_input_snmp_alive_devices`|`class`|Alive devices| -|SUMMARY|`datakit_input_prom_collect_points`|`mode,source`|Total number of prom collection points| -|SUMMARY|`datakit_input_prom_http_get_bytes`|`mode,source`|HTTP get bytes| -|SUMMARY|`datakit_input_prom_http_latency_in_second`|`mode,source`|HTTP latency(in second)| -|GAUGE|`datakit_input_prom_stream_size`|`mode,source`|Stream size| -|SUMMARY|`datakit_input_statsd_collect_points`|`N/A`|Total number of statsd collection points| -|SUMMARY|`datakit_input_statsd_accept_bytes`|`N/A`|Accept bytes from network| -|COUNTER|`datakit_input_logging_socket_feed_message_count_total`|`network`|socket feed to IO message count| -|SUMMARY|`datakit_input_logging_socket_log_length`|`network`|record the length of each log line| -|COUNTER|`datakit_tailer_collect_multiline_state_total`|`source,filepath,multilinestate`|Tailer multiline state total| -|COUNTER|`datakit_tailer_file_rotate_total`|`source,filepath`|Tailer rotate total| -|COUNTER|`datakit_tailer_buffer_force_flush_total`|`source,filepath`|Tailer force flush total| -|COUNTER|`datakit_tailer_parse_fail_total`|`source,filepath,mode`|Tailer parse fail total| -|GAUGE|`datakit_tailer_open_file_num`|`mode`|Tailer open file total| -|COUNTER|`datakit_input_logging_socket_connect_status_total`|`network,status`|connect and close count for net.conn| -|COUNTER|`datakit_input_tracing_total`|`input,service`|The total links number of Trace processed by the trace module| -|COUNTER|`datakit_input_sampler_total`|`input,service`|The sampler number of Trace processed by the trace module| -|COUNTER|`diskcache_put_bytes_total`|`path`|Cache Put() bytes count| -|COUNTER|`diskcache_get_total`|`path`|Cache Get() count| -|COUNTER|`diskcache_wakeup_total`|`path`|Wakeup count on sleeping write file| -|COUNTER|`diskcache_seek_back_total`|`path`|Seek back when Get() got any error| -|COUNTER|`diskcache_get_bytes_total`|`path`|Cache Get() bytes count| -|GAUGE|`diskcache_capacity`|`path`|Current capacity(in bytes)| -|GAUGE|`diskcache_max_data`|`path`|Max data to Put(in bytes), default 0| -|GAUGE|`diskcache_batch_size`|`path`|Data file size(in bytes)| -|GAUGE|`diskcache_size`|`path`|Current cache size(in bytes)| -|GAUGE|`diskcache_open_time`|`no_fallback_on_error,no_lock,no_pos,no_sync,path`|Current cache Open time in unix timestamp(second)| -|GAUGE|`diskcache_last_close_time`|`path`|Current cache last Close time in unix timestamp(second)| -|GAUGE|`diskcache_datafiles`|`path`|Current un-read data files| -|SUMMARY|`diskcache_get_latency`|`path`|Get() time cost(micro-second)| -|SUMMARY|`diskcache_put_latency`|`path`|Put() time cost(micro-second)| -|COUNTER|`diskcache_dropped_bytes_total`|`path`|Dropped bytes during Put() when capacity reached.| -|COUNTER|`diskcache_dropped_total`|`path,reason`|Dropped files during Put() when capacity reached.| -|COUNTER|`diskcache_rotate_total`|`path`|Cache rotate count, mean file rotate from data to data.0000xxx| -|COUNTER|`diskcache_remove_total`|`path`|Removed file count, if some file read EOF, remove it from un-read list| -|COUNTER|`diskcache_put_total`|`path`|Cache Put() count| -|COUNTER|`pointpool_chan_get_total`|`N/A`|Get count from reserved channel| -|COUNTER|`pointpool_chan_put_total`|`N/A`|Put count to reserved channel| -|COUNTER|`pointpool_pool_get_total`|`N/A`|Get count from reserved channel| -|COUNTER|`pointpool_pool_put_total`|`N/A`|Put count to reserved channel| -|COUNTER|`pointpool_reserved_capacity`|`N/A`|Reserved capacity of the pool| 
-|COUNTER|`pointpool_malloc_total`|`N/A`|New object malloc from pool| -|COUNTER|`pointpool_escaped`|`N/A`|Points that not comes from pool| +|POSITION|TYPE|NAME|LABELS|HELP| +|---|---|---|---|---| +|*internal/config*|GAUGE|`datakit_config_datakit_ulimit`|`status`|Datakit ulimit| +|*internal/dnswatcher*|COUNTER|`datakit_dns_domain_total`|`N/A`|DNS watched domain counter| +|*internal/dnswatcher*|COUNTER|`datakit_dns_ip_updated_total`|`domain`|Domain IP updated counter| +|*internal/dnswatcher*|COUNTER|`datakit_dns_watch_run_total`|`interval`|Watch run counter| +|*internal/dnswatcher*|SUMMARY|`datakit_dns_cost_seconds`|`domain,status`|DNS IP lookup cost| +|*internal/election*|COUNTER|`datakit_election_pause_total`|`id,namespace`|Input paused count when election failed| +|*internal/election*|COUNTER|`datakit_election_resume_total`|`id,namespace`|Input resume count when election OK| +|*internal/election*|GAUGE|`datakit_election_status`|`elected_id,id,namespace,status`|Datakit election status, if metric = 0, meas not elected, or the elected time(unix timestamp second)| +|*internal/election*|GAUGE|`datakit_election_inputs`|`namespace`|Datakit election input count| +|*internal/election*|SUMMARY|`datakit_election_seconds`|`namespace,status`|Election latency| +|*internal/goroutine*|GAUGE|`datakit_goroutine_alive`|`name`|Alive Goroutine count| +|*internal/goroutine*|COUNTER|`datakit_goroutine_recover_total`|`name`|Recovered Goroutine count| +|*internal/goroutine*|COUNTER|`datakit_goroutine_stopped_total`|`name`|Stopped Goroutine count| +|*internal/goroutine*|COUNTER|`datakit_goroutine_crashed_total`|`name`|Crashed goroutines count| +|*internal/goroutine*|GAUGE|`datakit_goroutine_groups`|`N/A`|Goroutine group count| +|*internal/goroutine*|SUMMARY|`datakit_goroutine_cost_seconds`|`name`|Goroutine running duration| +|*internal/httpapi*|SUMMARY|`datakit_http_api_elapsed_seconds`|`api,method,status`|API request cost| +|*internal/httpapi*|SUMMARY|`datakit_http_api_req_size_bytes`|`api,method,status`|API request body size| +|*internal/httpapi*|COUNTER|`datakit_http_api_total`|`api,method,status`|API request counter| +|*internal/httpapi*|GAUGE|`datakit_http_api_global_tags_last_updated`|`api,method,status`|Global tag updated timestamp, in second| +|*internal/httpcli*|SUMMARY|`datakit_httpcli_got_first_resp_byte_cost_seconds`|`from`|Got first response byte cost| +|*internal/httpcli*|COUNTER|`datakit_httpcli_tcp_conn_total`|`from,remote,type`|HTTP TCP connection count| +|*internal/httpcli*|COUNTER|`datakit_httpcli_conn_reused_from_idle_total`|`from`|HTTP connection reused from idle count| +|*internal/httpcli*|SUMMARY|`datakit_httpcli_conn_idle_time_seconds`|`from`|HTTP connection idle time| +|*internal/httpcli*|SUMMARY|`datakit_httpcli_dns_cost_seconds`|`from`|HTTP DNS cost| +|*internal/httpcli*|SUMMARY|`datakit_httpcli_tls_handshake_seconds`|`from`|HTTP TLS handshake cost| +|*internal/httpcli*|SUMMARY|`datakit_httpcli_http_connect_cost_seconds`|`from`|HTTP connect cost| +|*internal/io/dataway*|SUMMARY|`datakit_io_flush_failcache_bytes`|`category`|IO flush fail-cache bytes(in gzip) summary| +|*internal/io/dataway*|SUMMARY|`datakit_io_build_body_cost_seconds`|`category,encoding,stage`|Build point HTTP body cost| +|*internal/io/dataway*|SUMMARY|`datakit_io_build_body_batches`|`category,encoding`|Batch HTTP body batches| +|*internal/io/dataway*|SUMMARY|`datakit_io_build_body_batch_bytes`|`category,encoding,type`|Batch HTTP body size| 
+|*internal/io/dataway*|SUMMARY|`datakit_io_build_body_batch_points`|`category,encoding`|Batch HTTP body points| +|*internal/io/dataway*|SUMMARY|`datakit_io_dataway_wal_flush`|`category,gzip,queue`|Dataway WAL worker flushed bytes| +|*internal/io/dataway*|COUNTER|`datakit_io_dataway_point_total`|`category,status`|Dataway uploaded points, partitioned by category and send status(HTTP status)| +|*internal/io/dataway*|COUNTER|`datakit_io_dataway_point_bytes_total`|`category,enc,status`|Dataway uploaded points bytes, partitioned by category and pint send status(HTTP status)| +|*internal/io/dataway*|COUNTER|`datakit_io_dataway_http_drop_point_total`|`category,error`|Dataway write drop points| +|*internal/io/dataway*|SUMMARY|`datakit_io_dataway_api_latency_seconds`|`api,status`|Dataway HTTP request latency partitioned by HTTP API(method@url) and HTTP status| +|*internal/io/dataway*|COUNTER|`datakit_io_http_retry_total`|`api,status`|Dataway HTTP retried count| +|*internal/io/dataway*|SUMMARY|`datakit_io_grouped_request`|`category`|Grouped requests under sinker| +|*internal/io/dataway*|GAUGE|`datakit_io_dataway_wal_mem_len`|`category`|Dataway WAL's memory queue length| +|*internal/io/filter*|COUNTER|`datakit_filter_update_total`|`N/A`|Filters(remote) updated count| +|*internal/io/filter*|GAUGE|`datakit_filter_last_update_timestamp_seconds`|`N/A`|Filter last update time| +|*internal/io/filter*|COUNTER|`datakit_filter_point_total`|`category,filters,source`|Filter points of filters| +|*internal/io/filter*|GAUGE|`datakit_filter_parse_error`|`error,filters`|Filter parse error| +|*internal/io/filter*|COUNTER|`datakit_filter_point_dropped_total`|`category,filters,source`|Dropped points of filters| +|*internal/io/filter*|SUMMARY|`datakit_filter_pull_latency_seconds`|`status`|Filter pull(remote) latency| +|*internal/io/filter*|SUMMARY|`datakit_filter_latency_seconds`|`category,filters,source`|Filter latency of these filters| +|*internal/io*|GAUGE|`datakit_io_queue_points`|`category`|IO module queued(cached) points| +|*internal/io*|COUNTER|`datakit_io_input_filter_point_total`|`name,category`|Input filtered point total| +|*internal/io*|COUNTER|`datakit_io_feed_total`|`name,category`|Input feed total| +|*internal/io*|GAUGE|`datakit_io_last_feed_timestamp_seconds`|`name,category`|Input last feed time(according to Datakit local time)| +|*internal/io*|SUMMARY|`datakit_input_collect_latency_seconds`|`name,category`|Input collect latency| +|*internal/io*|GAUGE|`datakit_io_chan_usage`|`category`|IO channel usage(length of the channel)| +|*internal/io*|GAUGE|`datakit_io_chan_capacity`|`category`|IO channel capacity| +|*internal/io*|SUMMARY|`datakit_io_feed_cost_seconds`|`category,from`|IO feed waiting(on block mode) seconds| +|*internal/io*|SUMMARY|`datakit_io_feed_point`|`name,category`|Input feed point| +|*internal/io*|GAUGE|`datakit_io_flush_workers`|`category`|IO flush workers| +|*internal/io*|COUNTER|`datakit_io_flush_total`|`category`|IO flush total| +|*internal/metrics*|COUNTER|`datakit_error_total`|`source,category`|Total errors, only count on error source, not include error message| +|*internal/metrics*|GAUGE|`datakit_goroutines`|`N/A`|Goroutine count within Datakit| +|*internal/metrics*|GAUGE|`datakit_mem_stat`|`type`|Datakit memory system bytes| +|*internal/metrics*|GAUGE|`datakit_heap_alloc_bytes`|`N/A`|Datakit memory heap bytes(Deprecated by `datakit_golang_mem_usage`)| +|*internal/metrics*|GAUGE|`datakit_sys_alloc_bytes`|`N/A`|Datakit memory system bytes(Deprecated by `datakit_golang_mem_usage`)| 
+|*internal/metrics*|GAUGE|`datakit_golang_mem_usage`|`type`|Datakit golang memory usage stats| +|*internal/metrics*|GAUGE|`datakit_cpu_usage`|`N/A`|Datakit CPU usage(%)| +|*internal/metrics*|GAUGE|`datakit_open_files`|`N/A`|Datakit open files(only available on Linux)| +|*internal/metrics*|GAUGE|`datakit_cpu_cores`|`N/A`|Datakit CPU cores| +|*internal/metrics*|GAUGE|`datakit_uptime_seconds`|`auto_update,docker,hostname,lite,elinker,resource_limit,version=?,build_at=?,branch=?,os_arch=?`|Datakit uptime| +|*internal/metrics*|GAUGE|`datakit_data_overuse`|`N/A`|Does current workspace's data(metric/logging) usage(if 0 not beyond, or with a unix timestamp when overuse occurred)| +|*internal/metrics*|COUNTER|`datakit_process_ctx_switch_total`|`type`|Datakit process context switch count(Linux only)| +|*internal/metrics*|COUNTER|`datakit_process_io_count_total`|`type`|Datakit process IO count| +|*internal/metrics*|COUNTER|`datakit_process_io_bytes_total`|`type`|Datakit process IO bytes count| +|*internal/ntp*|COUNTER|`datakit_ntp_sync_total`|`N/A`|Total count synced with remote NTP server| +|*internal/ntp*|SUMMARY|`datakit_ntp_time_diff`|`N/A`|Time difference(seconds) between remote NTP server| +|*internal/pipeline/offload*|COUNTER|`datakit_pipeline_offload_point_total`|`category,exporter,remote`|Pipeline offload processed total points| +|*internal/pipeline/offload*|COUNTER|`datakit_pipeline_offload_error_point_total`|`category,exporter,remote`|Pipeline offload processed total error points| +|*internal/pipeline/offload*|SUMMARY|`datakit_pipeline_offload_cost_seconds`|`category,exporter,remote`|Pipeline offload total cost| +|*internal/plugins/inputs/container/kubernetes*|GAUGE|`datakit_input_container_kubernetes_fetch_error`|`namespace,resource,error`|Kubernetes resource fetch error| +|*internal/plugins/inputs/container/kubernetes*|SUMMARY|`datakit_input_container_kubernetes_collect_cost_seconds`|`category`|Kubernetes collect cost| +|*internal/plugins/inputs/container/kubernetes*|SUMMARY|`datakit_input_container_kubernetes_collect_resource_cost_seconds`|`category,kind,fieldselector`|Kubernetes collect resource cost| +|*internal/plugins/inputs/container/kubernetes*|COUNTER|`datakit_input_container_kubernetes_collect_pts_total`|`category`|Kubernetes collect point total| +|*internal/plugins/inputs/container/kubernetes*|COUNTER|`datakit_input_container_kubernetes_pod_metrics_query_total`|`target`|Kubernetes query pod metrics count| +|*internal/plugins/inputs/container*|SUMMARY|`datakit_input_container_collect_cost_seconds`|`category`|Container collect cost| +|*internal/plugins/inputs/container*|COUNTER|`datakit_input_container_collect_pts_total`|`category`|Container collect point total| +|*internal/plugins/inputs/container*|SUMMARY|`datakit_input_container_total_collect_cost_seconds`|`category`|Total container collect cost| +|*internal/plugins/inputs/dialtesting*|SUMMARY|`datakit_dialtesting_task_run_cost_seconds`|`region,protocol`|Task run time| +|*internal/plugins/inputs/dialtesting*|SUMMARY|`datakit_dialtesting_task_exec_time_interval_seconds`|`region,protocol`|Task execution time interval| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_worker_job_chan_number`|`type`|The number of the channel for the jobs| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_worker_job_number`|`N/A`|The number of the jobs to send data in parallel| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_worker_cached_points_number`|`region,protocol`|The number of cached 
points| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_worker_send_points_number`|`region,protocol,status`|The number of the points which have been sent| +|*internal/plugins/inputs/dialtesting*|SUMMARY|`datakit_dialtesting_worker_send_cost_seconds`|`region,protocol`|Time cost to send points| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_task_number`|`region,protocol`|The number of tasks| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_dataway_send_failed_number`|`region,protocol`|The number of failed sending| +|*internal/plugins/inputs/dialtesting*|SUMMARY|`datakit_dialtesting_pull_cost_seconds`|`region,is_first`|Time cost to pull tasks| +|*internal/plugins/inputs/dialtesting*|COUNTER|`datakit_dialtesting_task_synchronized_total`|`region,protocol`|Task synchronized number| +|*internal/plugins/inputs/dialtesting*|COUNTER|`datakit_dialtesting_task_invalid_total`|`region,protocol,fail_reason`|Invalid task number| +|*internal/plugins/inputs/dialtesting*|SUMMARY|`datakit_dialtesting_task_check_cost_seconds`|`region,protocol,status`|Task check time| +|*internal/plugins/inputs/graphite/cache*|GAUGE|`datakit_input_graphite_metric_mapper_cache_length`|`N/A`|The count of unique metrics currently cached.| +|*internal/plugins/inputs/graphite/cache*|COUNTER|`datakit_input_graphite_metric_cache_gets_total`|`N/A`|The count of total metric cache gets.| +|*internal/plugins/inputs/graphite/cache*|COUNTER|`datakit_input_graphite_metric_mapper_cache_hits_total`|`N/A`|The count of total metric cache hits.| +|*internal/plugins/inputs/graphite*|COUNTER|`datakit_input_graphite_tag_parse_failures_total`|`N/A`|Total count of samples with invalid tags| +|*internal/plugins/inputs/graphite*|GAUGE|`datakit_input_graphite_last_processed_timestamp_seconds`|`N/A`|Unix timestamp of the last processed graphite metric.| +|*internal/plugins/inputs/graphite*|GAUGE|`datakit_input_graphite_sample_expiry_seconds`|`N/A`|How long in seconds a metric sample is valid for.| +|*internal/plugins/inputs/kafkamq*|COUNTER|`datakit_input_kafkamq_consumer_message_total`|`topic,partition,status`|Kafka consumer message numbers from Datakit start| +|*internal/plugins/inputs/kafkamq*|COUNTER|`datakit_input_kafkamq_group_election_total`|`N/A`|Kafka group election count| +|*internal/plugins/inputs/kafkamq*|SUMMARY|`datakit_input_kafkamq_process_message_nano`|`topic`|kafkamq process message nanoseconds duration| +|*internal/plugins/inputs/kubernetesprometheus*|COUNTER|`datakit_input_kubernetesprometheus_resource_collect_pts_total`|`role,name`|The number of the points which have been sent| +|*internal/plugins/inputs/kubernetesprometheus*|GAUGE|`datakit_input_kubernetesprometheus_resource_target_number`|`role,name`|The number of the target| +|*internal/plugins/inputs/kubernetesprometheus*|SUMMARY|`datakit_input_kubernetesprometheus_resource_scrape_cost_seconds`|`role,name,url`|The scrape cost in seconds| +|*internal/plugins/inputs/kubernetesprometheus*|GAUGE|`datakit_input_kubernetesprometheus_worker_number`|`role,worker`|The number of the worker| +|*internal/plugins/inputs*|GAUGE|`datakit_inputs_instance`|`input`|Input instance count| +|*internal/plugins/inputs*|COUNTER|`datakit_inputs_crash_total`|`input`|Input crash count| +|*internal/plugins/inputs/ploffload*|GAUGE|`datakit_input_ploffload_chan_capacity`|`channel_name`|PlOffload channel capacity| +|*internal/plugins/inputs/ploffload*|GAUGE|`datakit_input_ploffload_chan_usage`|`channel_name`|PlOffload channel usage| 
+|*internal/plugins/inputs/ploffload*|COUNTER|`datakit_input_ploffload_point_total`|`category`|PlOffload processed total points| +|*internal/plugins/inputs/promremote*|SUMMARY|`datakit_input_promremote_collect_points`|`source`|Total number of promremote collection points| +|*internal/plugins/inputs/promremote*|SUMMARY|`datakit_input_promremote_time_diff_in_second`|`source`|Time diff with local time| +|*internal/plugins/inputs/promremote*|COUNTER|`datakit_input_promremote_no_time_points_total`|`source`|Total number of promremote collection no time points| +|*internal/plugins/inputs/proxy/bench/client*|GAUGE|`api_elapsed_seconds`|`N/A`|Proxied API elapsed seconds| +|*internal/plugins/inputs/proxy/bench/client*|COUNTER|`api_post_bytes_total`|`api,status`|Proxied API post bytes total| +|*internal/plugins/inputs/proxy/bench/client*|SUMMARY|`api_latency_seconds`|`api,status`|Proxied API latency| +|*internal/plugins/inputs/proxy*|COUNTER|`datakit_input_proxy_connect_total`|`client_ip`|Proxy connect(method CONNECT)| +|*internal/plugins/inputs/proxy*|COUNTER|`datakit_input_proxy_api_total`|`api,method`|Proxy API total| +|*internal/plugins/inputs/proxy*|SUMMARY|`datakit_input_proxy_api_latency_seconds`|`api,method,status`|Proxy API latency| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_session_replay_drop_total`|`app_id,env,version,service`|statistics the total count of session replay points which have been filtered by rules| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_session_replay_drop_bytes_total`|`app_id,env,version,service`|statistics the total bytes of session replay points which have been filtered by rules| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_locate_statistics_total`|`app_id,ip_status,locate_status`|locate by ip addr statistics| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_source_map_total`|`app_id,sdk_name,status,remark`|source map result statistics| +|*internal/plugins/inputs/rum*|GAUGE|`datakit_input_rum_loaded_zips`|`platform`|RUM source map currently loaded zip archive count| +|*internal/plugins/inputs/rum*|SUMMARY|`datakit_input_rum_source_map_duration_seconds`|`sdk_name,app_id,env,version`|statistics elapsed time in RUM source map(unit: second)| +|*internal/plugins/inputs/rum*|SUMMARY|`datakit_input_rum_session_replay_upload_latency_seconds`|`app_id,env,version,service,status_code`|statistics elapsed time in session replay uploading| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_session_replay_upload_failure_total`|`app_id,env,version,service,status_code`|statistics count of session replay points which which have unsuccessfully uploaded| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_session_replay_upload_failure_bytes_total`|`app_id,env,version,service,status_code`|statistics the total bytes of session replay points which have unsuccessfully uploaded| +|*internal/plugins/inputs/rum*|SUMMARY|`datakit_input_rum_session_replay_read_body_delay_seconds`|`app_id,env,version,service`|statistics the duration of reading session replay body| +|*internal/plugins/inputs/snmp*|SUMMARY|`datakit_input_snmp_discovery_cost`|`profile_type`|Discovery cost(in second)| +|*internal/plugins/inputs/snmp*|SUMMARY|`datakit_input_snmp_collect_cost`|`N/A`|Every loop collect cost(in second)| +|*internal/plugins/inputs/snmp*|SUMMARY|`datakit_input_snmp_device_collect_cost`|`class`|Device collect cost(in second)| +|*internal/plugins/inputs/snmp*|GAUGE|`datakit_input_snmp_alive_devices`|`class`|Alive devices| 
+|*internal/prom*|SUMMARY|`datakit_input_prom_collect_points`|`mode,source`|Total number of prom collection points| +|*internal/prom*|SUMMARY|`datakit_input_prom_http_get_bytes`|`mode,source`|HTTP get bytes| +|*internal/prom*|SUMMARY|`datakit_input_prom_http_latency_in_second`|`mode,source`|HTTP latency(in second)| +|*internal/prom*|GAUGE|`datakit_input_prom_stream_size`|`mode,source`|Stream size| +|*internal/statsd*|SUMMARY|`datakit_input_statsd_collect_points`|`N/A`|Total number of statsd collection points| +|*internal/statsd*|SUMMARY|`datakit_input_statsd_accept_bytes`|`N/A`|Accept bytes from network| +|*internal/tailer*|COUNTER|`datakit_input_logging_socket_feed_message_count_total`|`network`|socket feed to IO message count| +|*internal/tailer*|SUMMARY|`datakit_input_logging_socket_log_length`|`network`|record the length of each log line| +|*internal/tailer*|COUNTER|`datakit_tailer_collect_multiline_state_total`|`source,filepath,multilinestate`|Tailer multiline state total| +|*internal/tailer*|COUNTER|`datakit_tailer_file_rotate_total`|`source,filepath`|Tailer rotate total| +|*internal/tailer*|COUNTER|`datakit_tailer_buffer_force_flush_total`|`source,filepath`|Tailer force flush total| +|*internal/tailer*|COUNTER|`datakit_tailer_parse_fail_total`|`source,filepath,mode`|Tailer parse fail total| +|*internal/tailer*|GAUGE|`datakit_tailer_open_file_num`|`mode`|Tailer open file total| +|*internal/tailer*|COUNTER|`datakit_input_logging_socket_connect_status_total`|`network,status`|connect and close count for net.conn| +|*internal/trace*|COUNTER|`datakit_input_tracing_total`|`input,service`|The total links number of Trace processed by the trace module| +|*internal/trace*|COUNTER|`datakit_input_sampler_total`|`input,service`|The sampler number of Trace processed by the trace module| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_dropped_data`|`path,reason`|Dropped data during Put() when capacity reached.| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|COUNTER|`diskcache_rotate_total`|`path`|Cache rotate count, mean file rotate from data to data.0000xxx| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|COUNTER|`diskcache_remove_total`|`path`|Removed file count, if some file read EOF, remove it from un-read list| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|COUNTER|`diskcache_wakeup_total`|`path`|Wakeup count on sleeping write file| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|COUNTER|`diskcache_seek_back_total`|`path`|Seek back when Get() got any error| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_capacity`|`path`|Current capacity(in bytes)| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_max_data`|`path`|Max data to Put(in bytes), default 0| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_batch_size`|`path`|Data file size(in bytes)| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_size`|`path`|Current cache size(in bytes)| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_open_time`|`no_fallback_on_error,no_lock,no_pos,no_sync,path`|Current cache Open time in unix timestamp(second)| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_last_close_time`|`path`|Current cache last Close time in unix timestamp(second)| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_datafiles`|`path`|Current un-read data files| 
+|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_stream_put`|`path`|Stream put times| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_get_latency`|`path`|Get() cost seconds| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_put_latency`|`path`|Put() cost seconds| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_put_bytes`|`path`|Cache Put() bytes| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_get_bytes`|`path`|Cache Get() bytes| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_chan_get_total`|`N/A`|Get count from reserved channel| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_chan_put_total`|`N/A`|Put count to reserved channel| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_pool_get_total`|`N/A`|Get count from reserved channel| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_pool_put_total`|`N/A`|Put count to reserved channel| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_reserved_capacity`|`N/A`|Reserved capacity of the pool| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_malloc_total`|`N/A`|New object malloc from pool| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_escaped`|`N/A`|Points that not comes from pool| ### Golang Runtime Metrics {#go-runtime-metrics} diff --git a/internal/export/doc/env.go b/internal/export/doc/env.go index d577c1870e..dc4e904f5b 100644 --- a/internal/export/doc/env.go +++ b/internal/export/doc/env.go @@ -23,7 +23,7 @@ const ( URL = "URL" JSON = "JSON" List = "List" - TimeDuration = "TimeDuration" + TimeDuration = "Duration" // Doc type. NonInput = "NonInput" diff --git a/internal/export/doc/zh/datakit-conf.md b/internal/export/doc/zh/datakit-conf.md index 8093d160ed..3935fa05b2 100644 --- a/internal/export/doc/zh/datakit-conf.md +++ b/internal/export/doc/zh/datakit-conf.md @@ -59,12 +59,14 @@ DataKit 会开启 HTTP 服务,用来接收外部数据,或者对外提供基 配置完成后可以使用 `curl` 命令测试是否配置成功:`sudo curl --no-buffer -XGET --unix-socket /tmp/datakit.sock http:/localhost/v1/ping`。更多关于 `curl` 的测试命令的信息可以参阅[这里](https://superuser.com/a/925610){:target="_blank"}。 ### HTTP 请求频率控制 {#set-http-api-limit} + + > [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) 已经默认开启该功能。 由于 DataKit 需要大量接收外部数据写入,为了避免给所在节点造成巨大开销,可修改如下 HTTP 配置(默认不开启): ```toml [http_api] - request_rate_limit = 1000.0 # 限制每个 HTTP API 每秒只接收 1000 次请求 + request_rate_limit = 20.0 # 限制每个客户端(IP + API 路由)发起请求的 QPS 限制 ``` ### 其它设置 {#http-other-settings} @@ -316,6 +318,57 @@ Dataway 部分有如下几个配置可以配置,其它部分不建议改动: Kubernetes 下部署相关配置参见[这里](datakit-daemonset-deploy.md#env-dataway)。 +#### WAL 队列配置 {#dataway-wal} + +[:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) + +在 `[dataway.wal]` 中,我们可以调整 WAL 队列的配置: + +```toml + [dataway.wal] + max_capacity_gb = 2.0 # 2GB reserved disk space for each category(M/L/O/T/...) 
+ workers = 0 # flush workers on WAL(default to CPU limited cores) + mem_cap = 0 # in-memory queue capacity(default to CPU limited cores) + fail_cache_clean_interval = "30s" # duration for clean fail uploaded data +``` + +磁盘文件位于 Datakit 安装目录的 *data/dw-wal* 目录下: + +```shell +/usr/local/datakit/data/dw-wal/ +├── custom_object +│   └── data +├── dialtesting +│   └── data +├── dynamic_dw +│   └── data +├── fc +│   └── data +├── keyevent +│   └── data +├── logging +│   ├── data +│   └── data.00000000000000000000000000000000 +├── metric +│   └── data +├── network +│   └── data +├── object +│   └── data +├── profiling +│   └── data +├── rum +│   └── data +├── security +│   └── data +└── tracing + └── data + +13 directories, 14 files +``` + +此处,除了 *fc* 是失败重传队列,其它目录分别对应一种数据类型。 + ### Sinker 配置 {#dataway-sink} 参见[这里](../deployment/dataway-sink.md) diff --git a/internal/export/doc/zh/datakit-metrics.md b/internal/export/doc/zh/datakit-metrics.md index a76ae733af..8552ab11c5 100644 --- a/internal/export/doc/zh/datakit-metrics.md +++ b/internal/export/doc/zh/datakit-metrics.md @@ -22,175 +22,187 @@ datakit_cpu_usage 4.9920266849857144 其它指标也能通过类似方式来观察,目前已有的指标如下(当前版本 {{ .Version }}): -|TYPE|NAME|LABELS|HELP| -|---|---|---|---| -|GAUGE|`datakit_config_datakit_ulimit`|`status`|Datakit ulimit| -|COUNTER|`datakit_dns_domain_total`|`N/A`|DNS watched domain counter| -|COUNTER|`datakit_dns_ip_updated_total`|`domain`|Domain IP updated counter| -|COUNTER|`datakit_dns_watch_run_total`|`interval`|Watch run counter| -|SUMMARY|`datakit_dns_cost_seconds`|`domain,status`|DNS IP lookup cost| -|COUNTER|`datakit_election_pause_total`|`id,namespace`|Input paused count when election failed| -|COUNTER|`datakit_election_resume_total`|`id,namespace`|Input resume count when election OK| -|GAUGE|`datakit_election_status`|`elected_id,id,namespace,status`|Datakit election status, if metric = 0, meas not elected, or the elected time(unix timestamp second)| -|GAUGE|`datakit_election_inputs`|`namespace`|Datakit election input count| -|SUMMARY|`datakit_election_seconds`|`namespace,status`|Election latency| -|GAUGE|`datakit_goroutine_alive`|`name`|Alive Goroutine count| -|COUNTER|`datakit_goroutine_recover_total`|`name`|Recovered Goroutine count| -|COUNTER|`datakit_goroutine_stopped_total`|`name`|Stopped Goroutine count| -|COUNTER|`datakit_goroutine_crashed_total`|`name`|Crashed goroutines count| -|GAUGE|`datakit_goroutine_groups`|`N/A`|Goroutine group count| -|SUMMARY|`datakit_goroutine_cost_seconds`|`name`|Goroutine running duration| -|SUMMARY|`datakit_http_api_elapsed_seconds`|`api,method,status`|API request cost| -|SUMMARY|`datakit_http_api_req_size_bytes`|`api,method,status`|API request body size| -|COUNTER|`datakit_http_api_total`|`api,method,status`|API request counter| -|GAUGE|`datakit_http_api_global_tags_last_updated`|`api,method,status`|Global tag updated timestamp, in second| -|SUMMARY|`datakit_httpcli_got_first_resp_byte_cost_seconds`|`from`|Got first response byte cost| -|COUNTER|`datakit_httpcli_tcp_conn_total`|`from,remote,type`|HTTP TCP connection count| -|COUNTER|`datakit_httpcli_conn_reused_from_idle_total`|`from`|HTTP connection reused from idle count| -|SUMMARY|`datakit_httpcli_conn_idle_time_seconds`|`from`|HTTP connection idle time| -|SUMMARY|`datakit_httpcli_dns_cost_seconds`|`from`|HTTP DNS cost| -|SUMMARY|`datakit_httpcli_tls_handshake_seconds`|`from`|HTTP TLS handshake cost| -|SUMMARY|`datakit_httpcli_http_connect_cost_seconds`|`from`|HTTP connect cost| 
-|SUMMARY|`datakit_io_build_body_cost_seconds`|`category,encoding`|Build point HTTP body cost| -|SUMMARY|`datakit_io_build_body_batches`|`category,encoding`|Batch HTTP body batches| -|SUMMARY|`datakit_io_build_body_batch_bytes`|`category,encoding,gzip`|Batch HTTP body size| -|SUMMARY|`datakit_io_build_body_batch_points`|`category,encoding,gzip`|Batch HTTP body points| -|COUNTER|`datakit_io_dataway_point_total`|`category,status`|Dataway uploaded points, partitioned by category and send status(HTTP status)| -|COUNTER|`datakit_io_dataway_point_bytes_total`|`category,enc,status`|Dataway uploaded points bytes, partitioned by category and pint send status(HTTP status)| -|COUNTER|`datakit_io_dataway_http_drop_point_total`|`category,error`|Dataway write drop points| -|SUMMARY|`datakit_io_dataway_api_latency_seconds`|`api,status`|Dataway HTTP request latency partitioned by HTTP API(method@url) and HTTP status| -|COUNTER|`datakit_io_http_retry_total`|`api,status`|Dataway HTTP retried count| -|SUMMARY|`datakit_io_grouped_request`|`category`|Grouped requests under sinker| -|SUMMARY|`datakit_io_flush_failcache_bytes`|`category`|IO flush fail-cache bytes(in gzip) summary| -|COUNTER|`datakit_filter_update_total`|`N/A`|Filters(remote) updated count| -|GAUGE|`datakit_filter_last_update_timestamp_seconds`|`N/A`|Filter last update time| -|COUNTER|`datakit_filter_point_total`|`category,filters,source`|Filter points of filters| -|GAUGE|`datakit_filter_parse_error`|`error,filters`|Filter parse error| -|COUNTER|`datakit_filter_point_dropped_total`|`category,filters,source`|Dropped points of filters| -|SUMMARY|`datakit_filter_pull_latency_seconds`|`status`|Filter pull(remote) latency| -|SUMMARY|`datakit_filter_latency_seconds`|`category,filters,source`|Filter latency of these filters| -|GAUGE|`datakit_io_queue_points`|`category`|IO module queued(cached) points| -|COUNTER|`datakit_io_input_filter_point_total`|`name,category`|Input filtered point total| -|COUNTER|`datakit_io_feed_total`|`name,category`|Input feed total| -|GAUGE|`datakit_io_last_feed_timestamp_seconds`|`name,category`|Input last feed time(according to Datakit local time)| -|SUMMARY|`datakit_input_collect_latency_seconds`|`name,category`|Input collect latency| -|GAUGE|`datakit_io_chan_usage`|`category`|IO channel usage(length of the channel)| -|GAUGE|`datakit_io_chan_capacity`|`category`|IO channel capacity| -|SUMMARY|`datakit_io_feed_cost_seconds`|`category,from`|IO feed waiting(on block mode) seconds| -|SUMMARY|`datakit_io_feed_point`|`name,category`|Input feed point| -|GAUGE|`datakit_io_flush_workers`|`category`|IO flush workers| -|COUNTER|`datakit_io_flush_total`|`category`|IO flush total| -|COUNTER|`datakit_error_total`|`source,category`|Total errors, only count on error source, not include error message| -|GAUGE|`datakit_goroutines`|`N/A`|Goroutine count within Datakit| -|GAUGE|`datakit_heap_alloc_bytes`|`N/A`|Datakit memory heap bytes| -|GAUGE|`datakit_sys_alloc_bytes`|`N/A`|Datakit memory system bytes| -|GAUGE|`datakit_cpu_usage`|`N/A`|Datakit CPU usage(%)| -|GAUGE|`datakit_open_files`|`N/A`|Datakit open files(only available on Linux)| -|GAUGE|`datakit_cpu_cores`|`N/A`|Datakit CPU cores| -|GAUGE|`datakit_uptime_seconds`|`auto_update,docker,hostname,lite,elinker,resource_limit,version=?,build_at=?,branch=?,os_arch=?`|Datakit uptime| -|GAUGE|`datakit_data_overuse`|`N/A`|Does current workspace's data(metric/logging) usage(if 0 not beyond, or with a unix timestamp when overuse occurred)| -|COUNTER|`datakit_process_ctx_switch_total`|`type`|Datakit 
process context switch count(Linux only)| -|COUNTER|`datakit_process_io_count_total`|`type`|Datakit process IO count| -|COUNTER|`datakit_process_io_bytes_total`|`type`|Datakit process IO bytes count| -|COUNTER|`datakit_pipeline_offload_point_total`|`category,exporter,remote`|Pipeline offload processed total points| -|COUNTER|`datakit_pipeline_offload_error_point_total`|`category,exporter,remote`|Pipeline offload processed total error points| -|SUMMARY|`datakit_pipeline_offload_cost_seconds`|`category,exporter,remote`|Pipeline offload total cost| -|GAUGE|`datakit_input_container_kubernetes_fetch_error`|`namespace,resource,error`|Kubernetes resource fetch error| -|SUMMARY|`datakit_input_container_kubernetes_collect_cost_seconds`|`category`|Kubernetes collect cost| -|SUMMARY|`datakit_input_container_kubernetes_collect_resource_cost_seconds`|`category,kind,fieldselector`|Kubernetes collect resource cost| -|COUNTER|`datakit_input_container_kubernetes_collect_pts_total`|`category`|Kubernetes collect point total| -|COUNTER|`datakit_input_container_kubernetes_pod_metrics_query_total`|`target`|Kubernetes query pod metrics count| -|SUMMARY|`datakit_input_container_collect_cost_seconds`|`category`|Container collect cost| -|COUNTER|`datakit_input_container_collect_pts_total`|`category`|Container collect point total| -|SUMMARY|`datakit_input_container_total_collect_cost_seconds`|`category`|Total container collect cost| -|SUMMARY|`datakit_dialtesting_task_run_cost_seconds`|`region,protocol`|Task run time| -|SUMMARY|`datakit_dialtesting_task_exec_time_interval_seconds`|`region,protocol`|Task execution time interval| -|GAUGE|`datakit_dialtesting_worker_job_chan_number`|`type`|The number of the channel for the jobs| -|GAUGE|`datakit_dialtesting_worker_job_number`|`N/A`|The number of the jobs to send data in parallel| -|GAUGE|`datakit_dialtesting_worker_cached_points_number`|`region,protocol`|The number of cached points| -|GAUGE|`datakit_dialtesting_worker_send_points_number`|`region,protocol,status`|The number of the points which have been sent| -|SUMMARY|`datakit_dialtesting_worker_send_cost_seconds`|`region,protocol`|Time cost to send points| -|GAUGE|`datakit_dialtesting_task_number`|`region,protocol`|The number of tasks| -|GAUGE|`datakit_dialtesting_dataway_send_failed_number`|`region,protocol,dataway`|The number of failed sending for each Dataway| -|SUMMARY|`datakit_dialtesting_pull_cost_seconds`|`region,is_first`|Time cost to pull tasks| -|COUNTER|`datakit_dialtesting_task_synchronized_total`|`region,protocol`|Task synchronized number| -|COUNTER|`datakit_dialtesting_task_invalid_total`|`region,protocol,fail_reason`|Invalid task number| -|SUMMARY|`datakit_dialtesting_task_check_cost_seconds`|`region,protocol,status`|Task check time| -|COUNTER|`datakit_input_kafkamq_consumer_message_total`|`topic,partition,status`|Kafka consumer message numbers from Datakit start| -|COUNTER|`datakit_input_kafkamq_group_election_total`|`N/A`|Kafka group election count| -|SUMMARY|`datakit_input_kafkamq_process_message_nano`|`topic`|kafkamq process message nanoseconds duration| -|COUNTER|`datakit_input_kubernetesprometheus_resource_collect_pts_total`|`role,name`|The number of the points which have been sent| -|GAUGE|`datakit_input_kubernetesprometheus_forked_worker_number`|`role,name`|The number of the worker| -|GAUGE|`datakit_inputs_instance`|`input`|Input instance count| -|COUNTER|`datakit_inputs_crash_total`|`input`|Input crash count| -|GAUGE|`datakit_input_ploffload_chan_capacity`|`channel_name`|PlOffload channel 
capacity| -|GAUGE|`datakit_input_ploffload_chan_usage`|`channel_name`|PlOffload channel usage| -|COUNTER|`datakit_input_ploffload_point_total`|`category`|PlOffload processed total points| -|SUMMARY|`datakit_input_promremote_collect_points`|`source`|Total number of promremote collection points| -|SUMMARY|`datakit_input_promremote_time_diff_in_second`|`source`|Time diff with local time| -|COUNTER|`datakit_input_promremote_no_time_points_total`|`source`|Total number of promremote collection no time points| -|GAUGE|`api_elapsed_seconds`|`N/A`|Proxied API elapsed seconds| -|COUNTER|`api_post_bytes_total`|`api,status`|Proxied API post bytes total| -|SUMMARY|`api_latency_seconds`|`api,status`|Proxied API latency| -|COUNTER|`datakit_input_proxy_connect_total`|`client_ip`|Proxy connect(method CONNECT)| -|COUNTER|`datakit_input_proxy_api_total`|`api,method`|Proxy API total| -|SUMMARY|`datakit_input_proxy_api_latency_seconds`|`api,method,status`|Proxy API latency| -|COUNTER|`datakit_input_rum_session_replay_drop_total`|`app_id,env,version,service`|statistics the total count of session replay points which have been filtered by rules| -|COUNTER|`datakit_input_rum_session_replay_drop_bytes_total`|`app_id,env,version,service`|statistics the total bytes of session replay points which have been filtered by rules| -|COUNTER|`datakit_input_rum_locate_statistics_total`|`app_id,ip_status,locate_status`|locate by ip addr statistics| -|COUNTER|`datakit_input_rum_source_map_total`|`app_id,sdk_name,status,remark`|source map result statistics| -|GAUGE|`datakit_input_rum_loaded_zips`|`platform`|RUM source map currently loaded zip archive count| -|SUMMARY|`datakit_input_rum_source_map_duration_seconds`|`sdk_name,app_id,env,version`|statistics elapsed time in RUM source map(unit: second)| -|SUMMARY|`datakit_input_rum_session_replay_upload_latency_seconds`|`app_id,env,version,service,status_code`|statistics elapsed time in session replay uploading| -|COUNTER|`datakit_input_rum_session_replay_upload_failure_total`|`app_id,env,version,service,status_code`|statistics count of session replay points which which have unsuccessfully uploaded| -|COUNTER|`datakit_input_rum_session_replay_upload_failure_bytes_total`|`app_id,env,version,service,status_code`|statistics the total bytes of session replay points which have unsuccessfully uploaded| -|SUMMARY|`datakit_input_rum_session_replay_read_body_delay_seconds`|`app_id,env,version,service`|statistics the duration of reading session replay body| -|SUMMARY|`datakit_input_snmp_discovery_cost`|`profile_type`|Discovery cost(in second)| -|SUMMARY|`datakit_input_snmp_collect_cost`|`N/A`|Every loop collect cost(in second)| -|SUMMARY|`datakit_input_snmp_device_collect_cost`|`class`|Device collect cost(in second)| -|GAUGE|`datakit_input_snmp_alive_devices`|`class`|Alive devices| -|SUMMARY|`datakit_input_prom_collect_points`|`mode,source`|Total number of prom collection points| -|SUMMARY|`datakit_input_prom_http_get_bytes`|`mode,source`|HTTP get bytes| -|SUMMARY|`datakit_input_prom_http_latency_in_second`|`mode,source`|HTTP latency(in second)| -|GAUGE|`datakit_input_prom_stream_size`|`mode,source`|Stream size| -|SUMMARY|`datakit_input_statsd_collect_points`|`N/A`|Total number of statsd collection points| -|SUMMARY|`datakit_input_statsd_accept_bytes`|`N/A`|Accept bytes from network| -|COUNTER|`datakit_input_logging_socket_feed_message_count_total`|`network`|socket feed to IO message count| -|SUMMARY|`datakit_input_logging_socket_log_length`|`network`|record the length of each log line| 
-|COUNTER|`datakit_tailer_collect_multiline_state_total`|`source,filepath,multilinestate`|Tailer multiline state total| -|COUNTER|`datakit_tailer_file_rotate_total`|`source,filepath`|Tailer rotate total| -|COUNTER|`datakit_tailer_buffer_force_flush_total`|`source,filepath`|Tailer force flush total| -|COUNTER|`datakit_tailer_parse_fail_total`|`source,filepath,mode`|Tailer parse fail total| -|GAUGE|`datakit_tailer_open_file_num`|`mode`|Tailer open file total| -|COUNTER|`datakit_input_logging_socket_connect_status_total`|`network,status`|connect and close count for net.conn| -|COUNTER|`datakit_input_tracing_total`|`input,service`|The total links number of Trace processed by the trace module| -|COUNTER|`datakit_input_sampler_total`|`input,service`|The sampler number of Trace processed by the trace module| -|COUNTER|`diskcache_put_bytes_total`|`path`|Cache Put() bytes count| -|COUNTER|`diskcache_get_total`|`path`|Cache Get() count| -|COUNTER|`diskcache_wakeup_total`|`path`|Wakeup count on sleeping write file| -|COUNTER|`diskcache_seek_back_total`|`path`|Seek back when Get() got any error| -|COUNTER|`diskcache_get_bytes_total`|`path`|Cache Get() bytes count| -|GAUGE|`diskcache_capacity`|`path`|Current capacity(in bytes)| -|GAUGE|`diskcache_max_data`|`path`|Max data to Put(in bytes), default 0| -|GAUGE|`diskcache_batch_size`|`path`|Data file size(in bytes)| -|GAUGE|`diskcache_size`|`path`|Current cache size(in bytes)| -|GAUGE|`diskcache_open_time`|`no_fallback_on_error,no_lock,no_pos,no_sync,path`|Current cache Open time in unix timestamp(second)| -|GAUGE|`diskcache_last_close_time`|`path`|Current cache last Close time in unix timestamp(second)| -|GAUGE|`diskcache_datafiles`|`path`|Current un-read data files| -|SUMMARY|`diskcache_get_latency`|`path`|Get() time cost(micro-second)| -|SUMMARY|`diskcache_put_latency`|`path`|Put() time cost(micro-second)| -|COUNTER|`diskcache_dropped_bytes_total`|`path`|Dropped bytes during Put() when capacity reached.| -|COUNTER|`diskcache_dropped_total`|`path,reason`|Dropped files during Put() when capacity reached.| -|COUNTER|`diskcache_rotate_total`|`path`|Cache rotate count, mean file rotate from data to data.0000xxx| -|COUNTER|`diskcache_remove_total`|`path`|Removed file count, if some file read EOF, remove it from un-read list| -|COUNTER|`diskcache_put_total`|`path`|Cache Put() count| -|COUNTER|`pointpool_chan_get_total`|`N/A`|Get count from reserved channel| -|COUNTER|`pointpool_chan_put_total`|`N/A`|Put count to reserved channel| -|COUNTER|`pointpool_pool_get_total`|`N/A`|Get count from reserved channel| -|COUNTER|`pointpool_pool_put_total`|`N/A`|Put count to reserved channel| -|COUNTER|`pointpool_reserved_capacity`|`N/A`|Reserved capacity of the pool| -|COUNTER|`pointpool_malloc_total`|`N/A`|New object malloc from pool| -|COUNTER|`pointpool_escaped`|`N/A`|Points that not comes from pool| +|POSITION|TYPE|NAME|LABELS|HELP| +|---|---|---|---|---| +|*internal/config*|GAUGE|`datakit_config_datakit_ulimit`|`status`|Datakit ulimit| +|*internal/dnswatcher*|COUNTER|`datakit_dns_domain_total`|`N/A`|DNS watched domain counter| +|*internal/dnswatcher*|COUNTER|`datakit_dns_ip_updated_total`|`domain`|Domain IP updated counter| +|*internal/dnswatcher*|COUNTER|`datakit_dns_watch_run_total`|`interval`|Watch run counter| +|*internal/dnswatcher*|SUMMARY|`datakit_dns_cost_seconds`|`domain,status`|DNS IP lookup cost| +|*internal/election*|COUNTER|`datakit_election_pause_total`|`id,namespace`|Input paused count when election failed| 
+|*internal/election*|COUNTER|`datakit_election_resume_total`|`id,namespace`|Input resume count when election OK|
+|*internal/election*|GAUGE|`datakit_election_status`|`elected_id,id,namespace,status`|Datakit election status, if metric = 0, means not elected, or the elected time(unix timestamp second)|
+|*internal/election*|GAUGE|`datakit_election_inputs`|`namespace`|Datakit election input count|
+|*internal/election*|SUMMARY|`datakit_election_seconds`|`namespace,status`|Election latency|
+|*internal/goroutine*|GAUGE|`datakit_goroutine_alive`|`name`|Alive Goroutine count|
+|*internal/goroutine*|COUNTER|`datakit_goroutine_recover_total`|`name`|Recovered Goroutine count|
+|*internal/goroutine*|COUNTER|`datakit_goroutine_stopped_total`|`name`|Stopped Goroutine count|
+|*internal/goroutine*|COUNTER|`datakit_goroutine_crashed_total`|`name`|Crashed goroutines count|
+|*internal/goroutine*|GAUGE|`datakit_goroutine_groups`|`N/A`|Goroutine group count|
+|*internal/goroutine*|SUMMARY|`datakit_goroutine_cost_seconds`|`name`|Goroutine running duration|
+|*internal/httpapi*|SUMMARY|`datakit_http_api_elapsed_seconds`|`api,method,status`|API request cost|
+|*internal/httpapi*|SUMMARY|`datakit_http_api_req_size_bytes`|`api,method,status`|API request body size|
+|*internal/httpapi*|COUNTER|`datakit_http_api_total`|`api,method,status`|API request counter|
+|*internal/httpapi*|GAUGE|`datakit_http_api_global_tags_last_updated`|`api,method,status`|Global tag updated timestamp, in second|
+|*internal/httpcli*|SUMMARY|`datakit_httpcli_got_first_resp_byte_cost_seconds`|`from`|Got first response byte cost|
+|*internal/httpcli*|COUNTER|`datakit_httpcli_tcp_conn_total`|`from,remote,type`|HTTP TCP connection count|
+|*internal/httpcli*|COUNTER|`datakit_httpcli_conn_reused_from_idle_total`|`from`|HTTP connection reused from idle count|
+|*internal/httpcli*|SUMMARY|`datakit_httpcli_conn_idle_time_seconds`|`from`|HTTP connection idle time|
+|*internal/httpcli*|SUMMARY|`datakit_httpcli_dns_cost_seconds`|`from`|HTTP DNS cost|
+|*internal/httpcli*|SUMMARY|`datakit_httpcli_tls_handshake_seconds`|`from`|HTTP TLS handshake cost|
+|*internal/httpcli*|SUMMARY|`datakit_httpcli_http_connect_cost_seconds`|`from`|HTTP connect cost|
+|*internal/io/dataway*|SUMMARY|`datakit_io_flush_failcache_bytes`|`category`|IO flush fail-cache bytes(in gzip) summary|
+|*internal/io/dataway*|SUMMARY|`datakit_io_build_body_cost_seconds`|`category,encoding,stage`|Build point HTTP body cost|
+|*internal/io/dataway*|SUMMARY|`datakit_io_build_body_batches`|`category,encoding`|Batch HTTP body batches|
+|*internal/io/dataway*|SUMMARY|`datakit_io_build_body_batch_bytes`|`category,encoding,type`|Batch HTTP body size|
+|*internal/io/dataway*|SUMMARY|`datakit_io_build_body_batch_points`|`category,encoding`|Batch HTTP body points|
+|*internal/io/dataway*|SUMMARY|`datakit_io_dataway_wal_flush`|`category,gzip,queue`|Dataway WAL worker flushed bytes|
+|*internal/io/dataway*|COUNTER|`datakit_io_dataway_point_total`|`category,status`|Dataway uploaded points, partitioned by category and send status(HTTP status)|
+|*internal/io/dataway*|COUNTER|`datakit_io_dataway_point_bytes_total`|`category,enc,status`|Dataway uploaded points bytes, partitioned by category and point send status(HTTP status)|
+|*internal/io/dataway*|COUNTER|`datakit_io_dataway_http_drop_point_total`|`category,error`|Dataway write drop points|
+|*internal/io/dataway*|SUMMARY|`datakit_io_dataway_api_latency_seconds`|`api,status`|Dataway HTTP request latency partitioned by HTTP API(method@url) and HTTP
status| +|*internal/io/dataway*|COUNTER|`datakit_io_http_retry_total`|`api,status`|Dataway HTTP retried count| +|*internal/io/dataway*|SUMMARY|`datakit_io_grouped_request`|`category`|Grouped requests under sinker| +|*internal/io/dataway*|GAUGE|`datakit_io_dataway_wal_mem_len`|`category`|Dataway WAL's memory queue length| +|*internal/io/filter*|COUNTER|`datakit_filter_update_total`|`N/A`|Filters(remote) updated count| +|*internal/io/filter*|GAUGE|`datakit_filter_last_update_timestamp_seconds`|`N/A`|Filter last update time| +|*internal/io/filter*|COUNTER|`datakit_filter_point_total`|`category,filters,source`|Filter points of filters| +|*internal/io/filter*|GAUGE|`datakit_filter_parse_error`|`error,filters`|Filter parse error| +|*internal/io/filter*|COUNTER|`datakit_filter_point_dropped_total`|`category,filters,source`|Dropped points of filters| +|*internal/io/filter*|SUMMARY|`datakit_filter_pull_latency_seconds`|`status`|Filter pull(remote) latency| +|*internal/io/filter*|SUMMARY|`datakit_filter_latency_seconds`|`category,filters,source`|Filter latency of these filters| +|*internal/io*|GAUGE|`datakit_io_queue_points`|`category`|IO module queued(cached) points| +|*internal/io*|COUNTER|`datakit_io_input_filter_point_total`|`name,category`|Input filtered point total| +|*internal/io*|COUNTER|`datakit_io_feed_total`|`name,category`|Input feed total| +|*internal/io*|GAUGE|`datakit_io_last_feed_timestamp_seconds`|`name,category`|Input last feed time(according to Datakit local time)| +|*internal/io*|SUMMARY|`datakit_input_collect_latency_seconds`|`name,category`|Input collect latency| +|*internal/io*|GAUGE|`datakit_io_chan_usage`|`category`|IO channel usage(length of the channel)| +|*internal/io*|GAUGE|`datakit_io_chan_capacity`|`category`|IO channel capacity| +|*internal/io*|SUMMARY|`datakit_io_feed_cost_seconds`|`category,from`|IO feed waiting(on block mode) seconds| +|*internal/io*|SUMMARY|`datakit_io_feed_point`|`name,category`|Input feed point| +|*internal/io*|GAUGE|`datakit_io_flush_workers`|`category`|IO flush workers| +|*internal/io*|COUNTER|`datakit_io_flush_total`|`category`|IO flush total| +|*internal/metrics*|COUNTER|`datakit_error_total`|`source,category`|Total errors, only count on error source, not include error message| +|*internal/metrics*|GAUGE|`datakit_goroutines`|`N/A`|Goroutine count within Datakit| +|*internal/metrics*|GAUGE|`datakit_mem_stat`|`type`|Datakit memory system bytes| +|*internal/metrics*|GAUGE|`datakit_heap_alloc_bytes`|`N/A`|Datakit memory heap bytes(Deprecated by datakit_golang_mem_usage)| +|*internal/metrics*|GAUGE|`datakit_sys_alloc_bytes`|`N/A`|Datakit memory system bytes(Deprecated by datakit_golang_mem_usage)| +|*internal/metrics*|GAUGE|`datakit_golang_mem_usage`|`type`|Datakit golang memory_usage| +|*internal/metrics*|GAUGE|`datakit_cpu_usage`|`N/A`|Datakit CPU usage(%)| +|*internal/metrics*|GAUGE|`datakit_open_files`|`N/A`|Datakit open files(only available on Linux)| +|*internal/metrics*|GAUGE|`datakit_cpu_cores`|`N/A`|Datakit CPU cores| +|*internal/metrics*|GAUGE|`datakit_uptime_seconds`|`auto_update,docker,hostname,lite,elinker,resource_limit,version=?,build_at=?,branch=?,os_arch=?`|Datakit uptime| +|*internal/metrics*|GAUGE|`datakit_data_overuse`|`N/A`|Does current workspace's data(metric/logging) usage(if 0 not beyond, or with a unix timestamp when overuse occurred)| +|*internal/metrics*|COUNTER|`datakit_process_ctx_switch_total`|`type`|Datakit process context switch count(Linux only)| 
+|*internal/metrics*|COUNTER|`datakit_process_io_count_total`|`type`|Datakit process IO count| +|*internal/metrics*|COUNTER|`datakit_process_io_bytes_total`|`type`|Datakit process IO bytes count| +|*internal/ntp*|COUNTER|`datakit_ntp_sync_total`|`N/A`|Total count synced with remote NTP server| +|*internal/ntp*|SUMMARY|`datakit_ntp_time_diff`|`N/A`|Time difference(seconds) between remote NTP server| +|*internal/pipeline/offload*|COUNTER|`datakit_pipeline_offload_point_total`|`category,exporter,remote`|Pipeline offload processed total points| +|*internal/pipeline/offload*|COUNTER|`datakit_pipeline_offload_error_point_total`|`category,exporter,remote`|Pipeline offload processed total error points| +|*internal/pipeline/offload*|SUMMARY|`datakit_pipeline_offload_cost_seconds`|`category,exporter,remote`|Pipeline offload total cost| +|*internal/plugins/inputs/container/kubernetes*|GAUGE|`datakit_input_container_kubernetes_fetch_error`|`namespace,resource,error`|Kubernetes resource fetch error| +|*internal/plugins/inputs/container/kubernetes*|SUMMARY|`datakit_input_container_kubernetes_collect_cost_seconds`|`category`|Kubernetes collect cost| +|*internal/plugins/inputs/container/kubernetes*|SUMMARY|`datakit_input_container_kubernetes_collect_resource_cost_seconds`|`category,kind,fieldselector`|Kubernetes collect resource cost| +|*internal/plugins/inputs/container/kubernetes*|COUNTER|`datakit_input_container_kubernetes_collect_pts_total`|`category`|Kubernetes collect point total| +|*internal/plugins/inputs/container/kubernetes*|COUNTER|`datakit_input_container_kubernetes_pod_metrics_query_total`|`target`|Kubernetes query pod metrics count| +|*internal/plugins/inputs/container*|SUMMARY|`datakit_input_container_collect_cost_seconds`|`category`|Container collect cost| +|*internal/plugins/inputs/container*|COUNTER|`datakit_input_container_collect_pts_total`|`category`|Container collect point total| +|*internal/plugins/inputs/container*|SUMMARY|`datakit_input_container_total_collect_cost_seconds`|`category`|Total container collect cost| +|*internal/plugins/inputs/dialtesting*|SUMMARY|`datakit_dialtesting_task_run_cost_seconds`|`region,protocol`|Task run time| +|*internal/plugins/inputs/dialtesting*|SUMMARY|`datakit_dialtesting_task_exec_time_interval_seconds`|`region,protocol`|Task execution time interval| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_worker_job_chan_number`|`type`|The number of the channel for the jobs| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_worker_job_number`|`N/A`|The number of the jobs to send data in parallel| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_worker_cached_points_number`|`region,protocol`|The number of cached points| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_worker_send_points_number`|`region,protocol,status`|The number of the points which have been sent| +|*internal/plugins/inputs/dialtesting*|SUMMARY|`datakit_dialtesting_worker_send_cost_seconds`|`region,protocol`|Time cost to send points| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_task_number`|`region,protocol`|The number of tasks| +|*internal/plugins/inputs/dialtesting*|GAUGE|`datakit_dialtesting_dataway_send_failed_number`|`region,protocol`|The number of failed sending| +|*internal/plugins/inputs/dialtesting*|SUMMARY|`datakit_dialtesting_pull_cost_seconds`|`region,is_first`|Time cost to pull tasks| 
+|*internal/plugins/inputs/dialtesting*|COUNTER|`datakit_dialtesting_task_synchronized_total`|`region,protocol`|Task synchronized number| +|*internal/plugins/inputs/dialtesting*|COUNTER|`datakit_dialtesting_task_invalid_total`|`region,protocol,fail_reason`|Invalid task number| +|*internal/plugins/inputs/dialtesting*|SUMMARY|`datakit_dialtesting_task_check_cost_seconds`|`region,protocol,status`|Task check time| +|*internal/plugins/inputs/graphite/cache*|GAUGE|`datakit_input_graphite_metric_mapper_cache_length`|`N/A`|The count of unique metrics currently cached.| +|*internal/plugins/inputs/graphite/cache*|COUNTER|`datakit_input_graphite_metric_cache_gets_total`|`N/A`|The count of total metric cache gets.| +|*internal/plugins/inputs/graphite/cache*|COUNTER|`datakit_input_graphite_metric_mapper_cache_hits_total`|`N/A`|The count of total metric cache hits.| +|*internal/plugins/inputs/graphite*|COUNTER|`datakit_input_graphite_tag_parse_failures_total`|`N/A`|Total count of samples with invalid tags| +|*internal/plugins/inputs/graphite*|GAUGE|`datakit_input_graphite_last_processed_timestamp_seconds`|`N/A`|Unix timestamp of the last processed graphite metric.| +|*internal/plugins/inputs/graphite*|GAUGE|`datakit_input_graphite_sample_expiry_seconds`|`N/A`|How long in seconds a metric sample is valid for.| +|*internal/plugins/inputs/kafkamq*|COUNTER|`datakit_input_kafkamq_consumer_message_total`|`topic,partition,status`|Kafka consumer message numbers from Datakit start| +|*internal/plugins/inputs/kafkamq*|COUNTER|`datakit_input_kafkamq_group_election_total`|`N/A`|Kafka group election count| +|*internal/plugins/inputs/kafkamq*|SUMMARY|`datakit_input_kafkamq_process_message_nano`|`topic`|kafkamq process message nanoseconds duration| +|*internal/plugins/inputs/kubernetesprometheus*|COUNTER|`datakit_input_kubernetesprometheus_resource_collect_pts_total`|`role,name`|The number of the points which have been sent| +|*internal/plugins/inputs/kubernetesprometheus*|GAUGE|`datakit_input_kubernetesprometheus_resource_target_number`|`role,name`|The number of the target| +|*internal/plugins/inputs/kubernetesprometheus*|SUMMARY|`datakit_input_kubernetesprometheus_resource_scrape_cost_seconds`|`role,name,url`|The scrape cost in seconds| +|*internal/plugins/inputs/kubernetesprometheus*|GAUGE|`datakit_input_kubernetesprometheus_worker_number`|`role,worker`|The number of the worker| +|*internal/plugins/inputs*|GAUGE|`datakit_inputs_instance`|`input`|Input instance count| +|*internal/plugins/inputs*|COUNTER|`datakit_inputs_crash_total`|`input`|Input crash count| +|*internal/plugins/inputs/ploffload*|GAUGE|`datakit_input_ploffload_chan_capacity`|`channel_name`|PlOffload channel capacity| +|*internal/plugins/inputs/ploffload*|GAUGE|`datakit_input_ploffload_chan_usage`|`channel_name`|PlOffload channel usage| +|*internal/plugins/inputs/ploffload*|COUNTER|`datakit_input_ploffload_point_total`|`category`|PlOffload processed total points| +|*internal/plugins/inputs/promremote*|SUMMARY|`datakit_input_promremote_collect_points`|`source`|Total number of promremote collection points| +|*internal/plugins/inputs/promremote*|SUMMARY|`datakit_input_promremote_time_diff_in_second`|`source`|Time diff with local time| +|*internal/plugins/inputs/promremote*|COUNTER|`datakit_input_promremote_no_time_points_total`|`source`|Total number of promremote collection no time points| +|*internal/plugins/inputs/proxy/bench/client*|GAUGE|`api_elapsed_seconds`|`N/A`|Proxied API elapsed seconds| 
+|*internal/plugins/inputs/proxy/bench/client*|COUNTER|`api_post_bytes_total`|`api,status`|Proxied API post bytes total| +|*internal/plugins/inputs/proxy/bench/client*|SUMMARY|`api_latency_seconds`|`api,status`|Proxied API latency| +|*internal/plugins/inputs/proxy*|COUNTER|`datakit_input_proxy_connect_total`|`client_ip`|Proxy connect(method CONNECT)| +|*internal/plugins/inputs/proxy*|COUNTER|`datakit_input_proxy_api_total`|`api,method`|Proxy API total| +|*internal/plugins/inputs/proxy*|SUMMARY|`datakit_input_proxy_api_latency_seconds`|`api,method,status`|Proxy API latency| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_session_replay_drop_total`|`app_id,env,version,service`|statistics the total count of session replay points which have been filtered by rules| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_session_replay_drop_bytes_total`|`app_id,env,version,service`|statistics the total bytes of session replay points which have been filtered by rules| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_locate_statistics_total`|`app_id,ip_status,locate_status`|locate by ip addr statistics| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_source_map_total`|`app_id,sdk_name,status,remark`|source map result statistics| +|*internal/plugins/inputs/rum*|GAUGE|`datakit_input_rum_loaded_zips`|`platform`|RUM source map currently loaded zip archive count| +|*internal/plugins/inputs/rum*|SUMMARY|`datakit_input_rum_source_map_duration_seconds`|`sdk_name,app_id,env,version`|statistics elapsed time in RUM source map(unit: second)| +|*internal/plugins/inputs/rum*|SUMMARY|`datakit_input_rum_session_replay_upload_latency_seconds`|`app_id,env,version,service,status_code`|statistics elapsed time in session replay uploading| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_session_replay_upload_failure_total`|`app_id,env,version,service,status_code`|statistics count of session replay points which which have unsuccessfully uploaded| +|*internal/plugins/inputs/rum*|COUNTER|`datakit_input_rum_session_replay_upload_failure_bytes_total`|`app_id,env,version,service,status_code`|statistics the total bytes of session replay points which have unsuccessfully uploaded| +|*internal/plugins/inputs/rum*|SUMMARY|`datakit_input_rum_session_replay_read_body_delay_seconds`|`app_id,env,version,service`|statistics the duration of reading session replay body| +|*internal/plugins/inputs/snmp*|SUMMARY|`datakit_input_snmp_discovery_cost`|`profile_type`|Discovery cost(in second)| +|*internal/plugins/inputs/snmp*|SUMMARY|`datakit_input_snmp_collect_cost`|`N/A`|Every loop collect cost(in second)| +|*internal/plugins/inputs/snmp*|SUMMARY|`datakit_input_snmp_device_collect_cost`|`class`|Device collect cost(in second)| +|*internal/plugins/inputs/snmp*|GAUGE|`datakit_input_snmp_alive_devices`|`class`|Alive devices| +|*internal/prom*|SUMMARY|`datakit_input_prom_collect_points`|`mode,source`|Total number of prom collection points| +|*internal/prom*|SUMMARY|`datakit_input_prom_http_get_bytes`|`mode,source`|HTTP get bytes| +|*internal/prom*|SUMMARY|`datakit_input_prom_http_latency_in_second`|`mode,source`|HTTP latency(in second)| +|*internal/prom*|GAUGE|`datakit_input_prom_stream_size`|`mode,source`|Stream size| +|*internal/statsd*|SUMMARY|`datakit_input_statsd_collect_points`|`N/A`|Total number of statsd collection points| +|*internal/statsd*|SUMMARY|`datakit_input_statsd_accept_bytes`|`N/A`|Accept bytes from network| 
+|*internal/tailer*|COUNTER|`datakit_input_logging_socket_feed_message_count_total`|`network`|socket feed to IO message count| +|*internal/tailer*|SUMMARY|`datakit_input_logging_socket_log_length`|`network`|record the length of each log line| +|*internal/tailer*|COUNTER|`datakit_tailer_collect_multiline_state_total`|`source,filepath,multilinestate`|Tailer multiline state total| +|*internal/tailer*|COUNTER|`datakit_tailer_file_rotate_total`|`source,filepath`|Tailer rotate total| +|*internal/tailer*|COUNTER|`datakit_tailer_buffer_force_flush_total`|`source,filepath`|Tailer force flush total| +|*internal/tailer*|COUNTER|`datakit_tailer_parse_fail_total`|`source,filepath,mode`|Tailer parse fail total| +|*internal/tailer*|GAUGE|`datakit_tailer_open_file_num`|`mode`|Tailer open file total| +|*internal/tailer*|COUNTER|`datakit_input_logging_socket_connect_status_total`|`network,status`|connect and close count for net.conn| +|*internal/trace*|COUNTER|`datakit_input_tracing_total`|`input,service`|The total links number of Trace processed by the trace module| +|*internal/trace*|COUNTER|`datakit_input_sampler_total`|`input,service`|The sampler number of Trace processed by the trace module| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_dropped_data`|`path,reason`|Dropped data during Put() when capacity reached.| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|COUNTER|`diskcache_rotate_total`|`path`|Cache rotate count, mean file rotate from data to data.0000xxx| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|COUNTER|`diskcache_remove_total`|`path`|Removed file count, if some file read EOF, remove it from un-read list| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|COUNTER|`diskcache_wakeup_total`|`path`|Wakeup count on sleeping write file| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|COUNTER|`diskcache_seek_back_total`|`path`|Seek back when Get() got any error| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_capacity`|`path`|Current capacity(in bytes)| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_max_data`|`path`|Max data to Put(in bytes), default 0| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_batch_size`|`path`|Data file size(in bytes)| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_size`|`path`|Current cache size(in bytes)| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_open_time`|`no_fallback_on_error,no_lock,no_pos,no_sync,path`|Current cache Open time in unix timestamp(second)| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_last_close_time`|`path`|Current cache last Close time in unix timestamp(second)| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|GAUGE|`diskcache_datafiles`|`path`|Current un-read data files| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_stream_put`|`path`|Stream put times| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_get_latency`|`path`|Get() cost seconds| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_put_latency`|`path`|Put() cost seconds| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_put_bytes`|`path`|Cache Put() bytes| +|*vendor/github.com/GuanceCloud/cliutils/diskcache*|SUMMARY|`diskcache_get_bytes`|`path`|Cache Get() bytes| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_chan_get_total`|`N/A`|Get count from reserved channel| 
+|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_chan_put_total`|`N/A`|Put count to reserved channel| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_pool_get_total`|`N/A`|Get count from reserved channel| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_pool_put_total`|`N/A`|Put count to reserved channel| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_reserved_capacity`|`N/A`|Reserved capacity of the pool| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_malloc_total`|`N/A`|New object malloc from pool| +|*vendor/github.com/GuanceCloud/cliutils/point*|COUNTER|`pointpool_escaped`|`N/A`|Points that not comes from pool| ### Golang 运行时指标 {#go-runtime-metrics} diff --git a/internal/export/non_input_docs.go b/internal/export/non_input_docs.go index e5dd2a1583..27e0c0d127 100644 --- a/internal/export/non_input_docs.go +++ b/internal/export/non_input_docs.go @@ -39,12 +39,46 @@ var nonInputDocs = map[string]content{ func envCommon() []*inputs.ENVInfo { // nolint:lll infos := []*inputs.ENVInfo{ - {ENVName: "ENV_DISABLE_PROTECT_MODE", Type: doc.Boolean, Desc: "Disable protect mode", DescZh: "禁用「配置保护」模式"}, - {ENVName: "ENV_DATAWAY", Type: doc.URL, Example: "`https://openway.guance.com?token=xxx`", Required: doc.Yes, Desc: "Configure the DataWay address", DescZh: "配置 DataWay 地址"}, - {ENVName: "ENV_DEFAULT_ENABLED_INPUTS", Type: doc.List, Example: `cpu,mem,disk`, Desc: "[The list of collectors](datakit-input-conf.md#default-enabled-inputs) is opened by default, divided by English commas, the old `ENV_ENABLE_INPUTS` will be discarded", DescZh: "默认开启[采集器列表](datakit-input-conf.md#default-enabled-inputs),以英文逗号分割,如 `cpu,mem,disk`"}, - {ENVName: "ENV_ENABLE_INPUTS :fontawesome-solid-x:", Type: doc.List, Desc: "Same as ENV_DEFAULT_ENABLED_INPUTS, to be scrapped", DescZh: "同 ENV_DEFAULT_ENABLED_INPUTS,将废弃"}, - {ENVName: "ENV_GLOBAL_HOST_TAGS", Type: doc.List, Example: `tag1=val,tag2=val2`, Desc: "Global tag, multiple tags are divided by English commas. The old `ENV_GLOBAL_TAGS` will be discarded", DescZh: "全局 tag,多个 tag 之间以英文逗号分割"}, - {ENVName: "ENV_GLOBAL_TAGS :fontawesome-solid-x:", Type: doc.List, Desc: "Same as ENV_GLOBAL_HOST-TAGS, to be scrapped", DescZh: "同 ENV_GLOBAL_HOST_TAGS,将废弃"}, + { + ENVName: "ENV_DISABLE_PROTECT_MODE", + Type: doc.Boolean, + Desc: "Disable protect mode", + DescZh: "禁用「配置保护」模式", + }, + { + ENVName: "ENV_DATAWAY", + Type: doc.URL, + Example: "`https://openway.guance.com?token=xxx`", + Required: doc.Yes, + Desc: "Configure the DataWay address", + DescZh: "配置 DataWay 地址", + }, + { + ENVName: "ENV_DEFAULT_ENABLED_INPUTS", + Type: doc.List, + Example: `cpu,mem,disk`, + Desc: "[The list of collectors](datakit-input-conf.md#default-enabled-inputs) is opened by default, divided by commas", + DescZh: "默认开启[采集器列表](datakit-input-conf.md#default-enabled-inputs),以英文逗号分割,如 `cpu,mem,disk`", + }, + { + ENVName: "~~ENV_ENABLE_INPUTS~~", + Type: doc.List, + Desc: "Same as ENV_DEFAULT_ENABLED_INPUTS(Deprecated)", + DescZh: "同 ENV_DEFAULT_ENABLED_INPUTS,将废弃", + }, + { + ENVName: "ENV_GLOBAL_HOST_TAGS", + Type: doc.List, + Example: `tag1=val,tag2=val2`, + Desc: "Global tag, multiple tags are divided by English commas. 
The old `ENV_GLOBAL_TAGS` will be discarded",
+ DescZh: "全局 tag,多个 tag 之间以英文逗号分割",
+ },
+ {
+ ENVName: "~~ENV_GLOBAL_TAGS~~",
+ Type: doc.List,
+ Desc: "Same as ENV_GLOBAL_HOST_TAGS (Deprecated)",
+ DescZh: "同 ENV_GLOBAL_HOST_TAGS,将废弃",
+ },
 {
 ENVName: "ENV_K8S_CLUSTER_NODE_NAME",
@@ -149,15 +183,51 @@ func envDataway() []*inputs.ENVInfo {
 {
 ENVName: "ENV_DATAWAY_NTP_INTERVAL",
 Type: doc.String,
- Desc: "Set NTP sync interval [:octicons-tag-24: Version-1.39.0](changelog.md#cl-1.38.2)",
- DescZh: "设置 NTP 时间同步间隔 [:octicons-tag-24: Version-1.39.0](changelog.md#cl-1.38.2)",
+ Desc: "Set NTP sync interval [:octicons-tag-24: Version-1.38.2](changelog.md#cl-1.38.2)",
+ DescZh: "设置 NTP 时间同步间隔 [:octicons-tag-24: Version-1.38.2](changelog.md#cl-1.38.2)",
 },
 {
 ENVName: "ENV_DATAWAY_NTP_DIFF",
 Type: doc.String,
- Desc: "Set NTP sync difference [:octicons-tag-24: Version-1.39.0](changelog.md#cl-1.38.2)",
- DescZh: "设置 NTP 时间同步的误差[:octicons-tag-24: Version-1.39.0](changelog.md#cl-1.38.2)",
+ Desc: "Set NTP sync difference [:octicons-tag-24: Version-1.38.2](changelog.md#cl-1.38.2)",
+ DescZh: "设置 NTP 时间同步的误差[:octicons-tag-24: Version-1.38.2](changelog.md#cl-1.38.2)",
+ },
+
+ // WAL
+ {
+ ENVName: "ENV_DATAWAY_WAL_CAPACITY",
+ Type: doc.Float,
+ Desc: "Set WAL disk cache capacity [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+ DescZh: "设置 WAL 占用的磁盘大小 [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+ },
+
+ {
+ ENVName: "ENV_DATAWAY_WAL_WORKERS",
+ Type: doc.Int,
+ Desc: "Set WAL workers, default to limited CPU cores x 2 [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+ DescZh: "设置 WAL worker 个数,默认为 CPU 配额核心数 X 2 [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+ },
+
+ {
+ ENVName: "ENV_DATAWAY_WAL_MEM_CAPACITY",
+ Type: doc.Int,
+ Desc: "Set WAL memory queue length, default to limited CPU cores [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+ DescZh: "设置 WAL 内存队列长度,默认为 CPU 配额核心数 [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+ },
+
+ {
+ ENVName: "ENV_DATAWAY_WAL_PATH",
+ Type: doc.String,
+ Desc: "Set WAL disk path, default path is *data/dw-wal* under Datakit install path [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+ DescZh: "设置 WAL 磁盘目录,默认为 Datakit 安装目录下的 *data/dw-wal* [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+ },
+
+ {
+ ENVName: "ENV_DATAWAY_WAL_FAIL_CACHE_CLEAN_INTERVAL",
+ Type: doc.TimeDuration,
+ Desc: "Set WAL fail-cache clean interval, default `30s` [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
+ DescZh: "设置 WAL 失败队列的重试间隔,默认 `30s` [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)",
 },
 }
@@ -189,7 +259,7 @@ func envLog() []*inputs.ENVInfo {
 func envPprof() []*inputs.ENVInfo {
 // nolint:lll
 infos := []*inputs.ENVInfo{
- {ENVName: "ENV_ENABLE_PPROF :fontawesome-solid-x:", Type: doc.Boolean, Desc: "Whether to start `pprof`", DescZh: "是否开启 `pprof`"},
+ {ENVName: "~~ENV_ENABLE_PPROF~~", Type: doc.Boolean, Desc: "Whether to enable the profiling port(Deprecated: enabled by default)", DescZh: "是否开启 profiling 端口(已默认启用)"},
 {ENVName: "ENV_PPROF_LISTEN", Type: doc.String, Desc: "`pprof` service listening address", DescZh: "`pprof` 服务监听地址"},
 }
@@ -324,6 +394,7 @@ func envHTTPAPI() []*inputs.ENVInfo {
 {
 ENVName: "ENV_REQUEST_RATE_LIMIT",
 Type: doc.Float,
+ Default: "20.0",
 Desc: "Limit 9529 [API requests per second](datakit-conf.md#set-http-api-limit).",
 DescZh: "限制 9529 [API 每秒请求数](datakit-conf.md#set-http-api-limit)。",
 },
@@ -421,15 +492,69 @@ func envSinker()
[]*inputs.ENVInfo { func envIO() []*inputs.ENVInfo { // nolint:lll infos := []*inputs.ENVInfo{ - {ENVName: "ENV_IO_FILTERS", Type: doc.JSON, Desc: "Add [line protocol filter](datakit-filter.md)", DescZh: "添加[行协议过滤器](datakit-filter.md)"}, - {ENVName: "ENV_IO_FLUSH_INTERVAL", Type: doc.TimeDuration, Default: "10s", Desc: "IO channel capacity [:octicons-tag-24: Version-1.22.0](changelog.md#cl-1.22.0)", DescZh: "IO 发送时间频率 [:octicons-tag-24: Version-1.22.0](changelog.md#cl-1.22.0)"}, - {ENVName: "ENV_IO_FEED_CHAN_SIZE", Type: doc.Int, Default: "1", Desc: "IO transmission time frequency [:octicons-tag-24: Version-1.22.0](changelog.md#cl-1.22.0)", DescZh: "IO 发送队列长度 [:octicons-tag-24: Version-1.22.0](changelog.md#cl-1.22.0)"}, - {ENVName: "ENV_IO_FLUSH_WORKERS", Type: doc.Int, Default: "`cpu_core * 2 + 1`", Desc: "IO flush workers [:octicons-tag-24: Version-1.5.9](changelog.md#cl-1.5.9)", DescZh: "IO 发送 worker 数 [:octicons-tag-24: Version-1.5.9](changelog.md#cl-1.5.9)"}, - {ENVName: "ENV_IO_MAX_CACHE_COUNT", Type: doc.Int, Default: "1000", Desc: "Send buffer size", DescZh: "发送 buffer(点数)大小"}, - {ENVName: "ENV_IO_ENABLE_CACHE", Type: doc.Boolean, Default: "false", Desc: "Whether to open the disk cache that failed to send", DescZh: "是否开启发送失败的磁盘缓存"}, - {ENVName: "ENV_IO_CACHE_ALL", Type: doc.Boolean, Default: "false", Desc: "Cache failed data points of all categories", DescZh: "是否 cache 所有发送失败的数据"}, - {ENVName: "ENV_IO_CACHE_MAX_SIZE_GB", Type: doc.Int, Default: "10", Desc: "Disk size of send failure cache (in GB)", DescZh: "发送失败缓存的磁盘大小(单位 GB)"}, - {ENVName: "ENV_IO_CACHE_CLEAN_INTERVAL", Type: doc.TimeDuration, Default: "5s", Desc: "Periodically send failed tasks cached on disk", DescZh: "定期发送缓存在磁盘内的失败任务"}, + { + ENVName: "ENV_IO_FILTERS", + Type: doc.JSON, + Desc: "Add [line protocol filter](datakit-filter.md)", + DescZh: "添加[行协议过滤器](datakit-filter.md)", + }, + { + ENVName: "ENV_IO_FLUSH_INTERVAL", + Type: doc.TimeDuration, + Default: "10s", + Desc: "Set compact interval [:octicons-tag-24: Version-1.22.0](changelog.md#cl-1.22.0)", + DescZh: "设置 compact 执行间隔 [:octicons-tag-24: Version-1.22.0](changelog.md#cl-1.22.0)", + }, + { + ENVName: "ENV_IO_FEED_CHAN_SIZE", + Type: doc.Int, + Default: "1", + Desc: "Set compact queue size [:octicons-tag-24: Version-1.22.0](changelog.md#cl-1.22.0)", + DescZh: "设置 compact 队列长度 [:octicons-tag-24: Version-1.22.0](changelog.md#cl-1.22.0)", + }, + { + ENVName: "ENV_IO_FLUSH_WORKERS", + Type: doc.Int, + Desc: "Set compact workers, default to limited CPU cores x 2 [:octicons-tag-24: Version-1.5.9](changelog.md#cl-1.5.9)", + DescZh: "设置 compactor worker 数,默认为 CPU 配额核心数 x 2 [:octicons-tag-24: Version-1.5.9](changelog.md#cl-1.5.9)", + }, + + { + ENVName: "ENV_IO_MAX_CACHE_COUNT", + Type: doc.Int, + Default: "1024", + Desc: "Compact buffer size", + DescZh: "Compact 缓存的点数", + }, + + { + ENVName: "~~ENV_IO_ENABLE_CACHE~~", + Type: doc.Boolean, + Default: "false", + Desc: "Whether to open the disk cache that failed to send. Removed in [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)", + DescZh: "是否开启发送失败的磁盘缓存。[:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) 版本已移除", + }, + { + ENVName: "~~ENV_IO_CACHE_ALL~~", + Type: doc.Boolean, + Default: "false", + Desc: "Cache failed data points of all categories. 
Removed in [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)", + DescZh: "是否 cache 所有发送失败的数据。[:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) 版本已移除", + }, + { + ENVName: "~~ENV_IO_CACHE_MAX_SIZE_GB~~", + Type: doc.Int, + Default: "10", + Desc: "Disk size of send failure cache (in GB). Removed in [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)", + DescZh: "发送失败缓存的磁盘大小(单位 GB)。[:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) 版本已移除", + }, + { + ENVName: "~~ENV_IO_CACHE_CLEAN_INTERVAL~~", + Type: doc.TimeDuration, + Default: "5s", + Desc: "Periodically send failed tasks cached on disk. Removed in [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)", + DescZh: "定期发送缓存在磁盘内的失败任务。[:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) 版本已移除", + }, } for idx := range infos { @@ -511,8 +636,26 @@ func envOthers() []*inputs.ENVInfo { func envPointPool() []*inputs.ENVInfo { // nolint:lll infos := []*inputs.ENVInfo{ - {ENVName: "ENV_ENABLE_POINT_POOL", Type: doc.Boolean, Example: "`on`", Desc: "Enable point pool", DescZh: "开启 point pool"}, - {ENVName: "ENV_POINT_POOL_RESERVED_CAPACITY", Type: doc.Int, Desc: "Specify pool capacity(default 4096)", DescZh: "指定 point pool 大小(默认 4096)"}, + { + ENVName: "~~ENV_ENABLE_POINT_POOL~~", + Type: doc.Boolean, + Example: "`on`", + Desc: "Enable point pool [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) default enabled", + DescZh: "开启 point pool [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0) 版本已默认开启", + }, + { + ENVName: "ENV_DISABLE_POINT_POOL", + Type: doc.Boolean, + Example: "`on`", + Desc: "Disable point pool [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)", + DescZh: "禁用 point pool [:octicons-tag-24: Version-1.60.0](changelog.md#cl-1.60.0)", + }, + { + ENVName: "ENV_POINT_POOL_RESERVED_CAPACITY", + Type: doc.Int, + Desc: "Specify pool capacity(default 4096)", + DescZh: "指定 point pool 大小(默认 4096)", + }, } for idx := range infos { diff --git a/internal/httpapi/dca_test.go b/internal/httpapi/dca_test.go index 1f6a889172..00849205e9 100644 --- a/internal/httpapi/dca_test.go +++ b/internal/httpapi/dca_test.go @@ -77,7 +77,10 @@ func TestCors(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} + assert.NoError(t, hs.dw.Init()) w := getResponse(t, req, hs) @@ -107,7 +110,10 @@ func runTestCases(t *testing.T, cases []TestCase) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} + assert.NoError(t, hs.dw.Init()) for _, tc := range cases { @@ -277,7 +283,9 @@ func TestDcaReload(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} + assert.NoError(t, hs.dw.Init()) w := getResponse(t, req, hs) @@ -317,7 +325,9 @@ func TestDcaSaveConfig(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = 
[]string{"http://localhost:9529?token=" + TOKEN} + assert.NoError(t, hs.dw.Init()) w := getResponse(t, req, hs) @@ -342,7 +352,10 @@ func TestDcaSaveConfig(t *testing.T) { func TestGetConfig(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} + assert.NoError(t, hs.dw.Init()) // no path @@ -403,7 +416,10 @@ func TestDcaGetPipelines(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} + assert.NoError(t, hs.dw.Init()) w := getResponse(t, req, hs) @@ -465,7 +481,10 @@ func TestDcaGetPipelinesDetail(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} + assert.NoError(t, hs.dw.Init()) for _, tc := range testCases { @@ -518,7 +537,9 @@ func TestDcaTestPipelines(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} assert.NoError(t, hs.dw.Init()) for _, tc := range testCases { @@ -602,7 +623,9 @@ func TestDcaCreatePipeline(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} assert.NoError(t, hs.dw.Init()) for _, tc := range testCases { @@ -688,7 +711,9 @@ func TestDcaGetFilter(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} assert.NoError(t, hs.dw.Init()) for index, tc := range cases { @@ -732,7 +757,8 @@ func TestDcaGetLogTail(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} assert.NoError(t, hs.dw.Init()) router := setupDcaRouter(hs) @@ -763,7 +789,9 @@ func TestDcaGetLogTail(t *testing.T) { func TestDcaDownloadLog(t *testing.T) { hs := defaultHTTPServerConf() hs.dcaConfig = &config.DCAConfig{} - hs.dw = &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + TOKEN}} + + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = []string{"http://localhost:9529?token=" + TOKEN} assert.NoError(t, hs.dw.Init()) tmpDir, err := ioutil.TempDir("./", "__tmp") diff --git a/internal/httpapi/http.go b/internal/httpapi/http.go index 7ac4d802c9..6c81a926c6 100644 --- a/internal/httpapi/http.go +++ b/internal/httpapi/http.go @@ -86,7 +86,7 @@ func Start(opts ...option) { if hs.apiConfig.RequestRateLimit > 0.0 { l.Infof("set request limit to %f", hs.apiConfig.RequestRateLimit) - reqLimiter = 
setupLimiter(hs.apiConfig.RequestRateLimit) + reqLimiter = setupLimiter(hs.apiConfig.RequestRateLimit, time.Minute) } else { l.Infof("set request limit not set: %f", hs.apiConfig.RequestRateLimit) } diff --git a/internal/httpapi/http_test.go b/internal/httpapi/http_test.go index e5330479bd..30e5d31693 100644 --- a/internal/httpapi/http_test.go +++ b/internal/httpapi/http_test.go @@ -88,7 +88,9 @@ func TestRestartAPI(t *T.T) { } hs := defaultHTTPServerConf() - hs.dw = &dataway.Dataway{URLs: urls} + + hs.dw = dataway.NewDefaultDataway() + hs.dw.URLs = urls assert.NoError(t, hs.dw.Init()) cases := []struct { diff --git a/internal/httpapi/wrap.go b/internal/httpapi/wrap.go index bb63ff42c3..1714764710 100644 --- a/internal/httpapi/wrap.go +++ b/internal/httpapi/wrap.go @@ -135,12 +135,14 @@ func RawHTTPWrapper(lmt *limiter.Limiter, next APIHandler, other ...interface{}) func limitReach(w http.ResponseWriter, r *http.Request) { // TODO: export metrics here group by r.Method + r.URL // or we can cache the request + l.Warnf("request %s(%s) reached rate limit, dropped", r.URL.String(), r.Method) } -func setupLimiter(limit float64) *limiter.Limiter { - return tollbooth.NewLimiter(limit, &limiter.ExpirableOptions{ - DefaultExpirationTTL: time.Second, - }).SetOnLimitReached(limitReach).SetBurst(1) +func setupLimiter(limit float64, ttl time.Duration) *limiter.Limiter { + return tollbooth.NewLimiter(limit, + &limiter.ExpirableOptions{ + DefaultExpirationTTL: ttl, + }).SetOnLimitReached(limitReach) // .SetBurst(2) } // From https://github.com/DanielHeckrath/gin-prometheus/blob/master/gin_prometheus.go diff --git a/internal/httpapi/wrap_test.go b/internal/httpapi/wrap_test.go index bcec6d6ffc..5c373b7888 100644 --- a/internal/httpapi/wrap_test.go +++ b/internal/httpapi/wrap_test.go @@ -16,56 +16,69 @@ import ( ) func TestLimitWrap(t *testing.T) { - var limit float64 = 1000.0 - reqLimiter = setupLimiter(limit) + var ( + limit = 100.0 + ttl = time.Minute // ttl 需超过本测试的运行时长,这样得到的 limit 比较接近预期值 @expectLimited + limiter = setupLimiter(limit, ttl) + ) r := gin.New() apiHandler := func(c *gin.Context) { c.Data(200, "", nil) } - r.GET("/", ginLimiter(reqLimiter), apiHandler) + r.GET("/", ginLimiter(limiter), apiHandler) ts := httptest.NewServer(r) defer ts.Close() time.Sleep(time.Second) - total := 0 - limited := 0 - passed := 0 - round := 0 + var ( + total = 0 + limited = 0 + passed = 0 + + max = 10000 + sleep = time.Millisecond // 总运行时长在 10s+ 左右 + ) tick := time.NewTicker(time.Second) defer tick.Stop() + start := time.Now() for { + if total >= max { + break + } + resp, err := http.Get(ts.URL) if err != nil { t.Error(err) } - resp.Body.Close() - time.Sleep(time.Microsecond) + if resp != nil { + resp.Body.Close() + time.Sleep(sleep) - switch resp.StatusCode { - case 200: - passed++ - case 429: - limited++ + switch resp.StatusCode { + case 200: + passed++ + case 429: + limited++ + } } + total++ - if total > 10000 { - break - } + } - select { - case <-tick.C: - round++ - rate := float64(passed) / float64(round) - assert.Truef(t, rate < limit, "expect %f < %f", rate, limit) + expectLimited := float64(time.Since(start)) / float64(time.Second) * limit + ratio := float64(passed) / expectLimited - t.Logf("rate: %f, passed: %d, limited: %d, total: %d", rate, passed, limited, total) - default: - } - } + // 此处 passed 总会高出 expectLimited 一截,不清楚是不是 TTL 边界的原因,但不会超过 expectLimited 10% + assert.Truef(t, ratio <= 1.1, "expected %d, passed %d", int(expectLimited), passed) + // TTL 一旦小于 for 循环运行时长,此处的偏差就开始变大。当 TTL 大于 运行时长时,不管时 1min 
还是 1hour, + // 比例都在 10% 以内。 + + t.Logf("cost %s, expected: %d, passed: %d(ratio: %f), limited: %d, total: %d", + time.Since(start), int(expectLimited), passed, ratio, limited, total) } diff --git a/internal/io/queue.go b/internal/io/compactor.go similarity index 51% rename from internal/io/queue.go rename to internal/io/compactor.go index 9cf1db5bc6..b090b1ce3c 100644 --- a/internal/io/queue.go +++ b/internal/io/compactor.go @@ -10,12 +10,63 @@ import ( "time" "github.com/GuanceCloud/cliutils/point" + "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/datakit" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/dataway" - "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/failcache" ) -func (x *dkIO) cacheData(c *consumer, d *feedOption, tryClean bool) { +type compactor struct { + category point.Category + compactTicker *time.Ticker + points []*point.Point + lastCompact time.Time +} + +func (x *dkIO) runCompactor(cat point.Category) { + r := x.fo.Reader(cat) + if r == nil && cat != point.DynamicDWCategory { + log.Panicf("invalid category %q, should not been here", cat.String()) + } + + c := &compactor{ + compactTicker: time.NewTicker(x.flushInterval), + category: cat, + } + + defer c.compactTicker.Stop() + + if cat == point.DynamicDWCategory { + // NOTE: 目前只有拨测的数据会将数据打到 dynamicDatawayPts 中,而拨测数据 + // 是写日志,故将 category 设置为 logging + c.category = point.Logging + } + + log.Infof("run compactor on %s", c.category) + for { + select { + case d := <-r: + x.cacheData(c, d, true) + PutFeedOption(d) // release feed options here + + case <-c.compactTicker.C: + if len(c.points) > 0 { + log.Debugf("on tick(%s) to compact %s(%d points), last compact %s ago...", + x.flushInterval, c.category, len(c.points), time.Since(c.lastCompact)) + x.compact(c) + } + + case <-datakit.Exit.Wait(): + if len(c.points) > 0 { + log.Debugf("on tick(%s) to compact %s(%d points), last compact %s ago...", + x.flushInterval, c.category, len(c.points), time.Since(c.lastCompact)) + x.compact(c) + } + return + } + } +} + +func (x *dkIO) cacheData(c *compactor, d *feedOption, tryClean bool) { if d == nil { log.Warn("get empty data, ignored") return @@ -35,11 +86,11 @@ func (x *dkIO) cacheData(c *consumer, d *feedOption, tryClean bool) { x.recordPoints(d) c.points = append(c.points, d.pts...) 
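A minimal sketch of the size-or-time batching pattern that runCompactor/cacheData implement above: points accumulate until either the compact ticker fires or the buffer crosses the compactAt threshold, and a threshold-triggered compact resets the ticker so the next tick does not immediately send a tiny batch. Names here (batcher, flush) are illustrative only and are not part of this patch.

package main

import (
	"fmt"
	"time"
)

type batcher struct {
	buf      []int         // stand-in for c.points
	maxCount int           // compact when buffered items exceed this (cf. x.compactAt)
	interval time.Duration // periodic compact interval (cf. x.flushInterval)
	ticker   *time.Ticker
}

func (b *batcher) add(v int) {
	b.buf = append(b.buf, v)
	if b.maxCount > 0 && len(b.buf) > b.maxCount {
		b.flush()
		b.ticker.Reset(b.interval) // avoid sending a small batch on the very next tick
	}
}

func (b *batcher) flush() {
	if len(b.buf) == 0 {
		return
	}
	fmt.Printf("flush %d items\n", len(b.buf))
	b.buf = b.buf[:0] // keep capacity, drop contents (same trick as c.points[:0])
}

func main() {
	b := &batcher{maxCount: 8, interval: time.Second, ticker: time.NewTicker(time.Second)}
	defer b.ticker.Stop()

	in := make(chan int)
	go func() {
		for i := 0; i < 20; i++ {
			in <- i
		}
		close(in)
	}()

	for {
		select {
		case v, ok := <-in:
			if !ok {
				b.flush() // final flush on shutdown, like the datakit.Exit branch
				return
			}
			b.add(v)
		case <-b.ticker.C:
			b.flush()
		}
	}
}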
- if tryClean && x.maxCacheCount > 0 && len(c.points) > x.maxCacheCount { - x.flush(c) + if tryClean && x.compactAt > 0 && len(c.points) > x.compactAt { + x.compact(c) - // reset consumer flush ticker to prevent send small packages - c.flushTiker.Reset(x.flushInterval) + // reset compact ticker to prevent small packages + c.compactTicker.Reset(x.flushInterval) } } @@ -55,14 +106,14 @@ func (x *dkIO) recordPoints(d *feedOption) { } } -func (x *dkIO) flush(c *consumer) { - c.lastFlush = time.Now() +func (x *dkIO) compact(c *compactor) { + c.lastCompact = time.Now() defer func() { flushVec.WithLabelValues(c.category.String()).Inc() }() - if err := x.doFlush(c.points, c.category, c.fc); err != nil { + if err := x.doCompact(c.points, c.category); err != nil { log.Warnf("post %d points to %s failed: %s, ignored", len(c.points), c.category, err) } @@ -72,24 +123,7 @@ func (x *dkIO) flush(c *consumer) { c.points = c.points[:0] // clear } -func (x *dkIO) flushFailCache(c *consumer) { - if c.fc == nil { - return - } - - if err := x.dw.Write(dataway.WithCacheClean(true), - dataway.WithCategory(c.category), - dataway.WithFailCache(c.fc), - ); err != nil { - log.Warnf("flush cache failed: %s, ignored", err) - } -} - -func (x *dkIO) doFlush(points []*point.Point, - cat point.Category, - fc failcache.Cache, - dynamicURL ...string, -) error { +func (x *dkIO) doCompact(points []*point.Point, cat point.Category, dynamicURL ...string) error { if x.dw == nil { return fmt.Errorf("dataway not set") } @@ -100,13 +134,9 @@ func (x *dkIO) doFlush(points []*point.Point, opts := []dataway.WriteOption{ dataway.WithPoints(points), - // max cache size(in memory) upload as a batch - dataway.WithBatchSize(x.maxCacheCount), - + dataway.WithBatchSize(x.compactAt), dataway.WithCategory(cat), - dataway.WithFailCache(fc), - dataway.WithCacheAll(x.cacheAll), } if len(dynamicURL) > 0 { diff --git a/internal/io/consumer.go b/internal/io/consumer.go deleted file mode 100644 index a37f4d3629..0000000000 --- a/internal/io/consumer.go +++ /dev/null @@ -1,92 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the MIT License. -// This product includes software developed at Guance Cloud (https://www.guance.com/). -// Copyright 2021-present Guance, Inc. - -package io - -import ( - "time" - - "github.com/GuanceCloud/cliutils/point" - "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/datakit" - "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/failcache" -) - -type consumer struct { - fc failcache.Cache - - category point.Category - - flushTiker *time.Ticker - - points []*point.Point - lastFlush time.Time -} - -func (x *dkIO) runConsumer(cat point.Category) { - r := x.fo.Reader(cat) - if r == nil && x.enableCache && cat != point.DynamicDWCategory { - log.Panicf("invalid category %q, should not been here", cat.String()) - } - - fc, ok := x.fcs[cat.String()] - if !ok { - log.Infof("IO local cache not set for %q", cat.String()) - } - - c := &consumer{ - flushTiker: time.NewTicker(x.flushInterval), - fc: fc, - category: cat, - } - - defer c.flushTiker.Stop() - - if cat == point.DynamicDWCategory { - // NOTE: 目前只有拨测的数据会将数据打到 dynamicDatawayPts 中,而拨测数据 - // 是写日志,故将 category 设置为 logging - c.category = point.Logging - } - - fcTick := time.NewTicker(x.cacheCleanInterval) - defer fcTick.Stop() - - // close diskcache on exit. 
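The doCompact body above hands everything to x.dw.Write as functional options (WithPoints, WithBatchSize, WithCategory). A self-contained sketch of that option style; the writeConfig fields below are hypothetical stand-ins, only the option names come from the hunk above.

package main

import "fmt"

type writeConfig struct {
	points    []string // stand-in for []*point.Point
	batchSize int
	category  string
}

type WriteOption func(*writeConfig)

func WithPoints(pts []string) WriteOption   { return func(c *writeConfig) { c.points = pts } }
func WithBatchSize(n int) WriteOption       { return func(c *writeConfig) { c.batchSize = n } }
func WithCategory(cat string) WriteOption   { return func(c *writeConfig) { c.category = cat } }

// Write applies defaults first, then lets each option override them.
func Write(opts ...WriteOption) {
	c := &writeConfig{batchSize: 1024}
	for _, opt := range opts {
		opt(c)
	}
	fmt.Printf("write %d points to %s in batches of %d\n", len(c.points), c.category, c.batchSize)
}

func main() {
	Write(
		WithPoints([]string{"p1", "p2", "p3"}),
		WithBatchSize(2),
		WithCategory("logging"),
	)
}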
- defer func() { - if c.fc != nil { - if err := c.fc.Close(); err != nil { - log.Warnf("cache.Close: %s, ignored", err) - } - } - }() - - log.Infof("run consumer on %s", c.category) - for { - select { - case d := <-r: - x.cacheData(c, d, true) - PutFeedOption(d) // release feed options here - - case <-c.flushTiker.C: - if len(c.points) > 0 { - log.Debugf("on tick(%s) to flush %s(%d points), last flush %s ago...", - x.flushInterval, c.category, len(c.points), time.Since(c.lastFlush)) - x.flush(c) - } - - case <-fcTick.C: - x.flushFailCache(c) - - case <-datakit.Exit.Wait(): - if len(c.points) > 0 { - log.Debugf("on tick(%s) to flush %s(%d points), last flush %s ago...", - x.flushInterval, c.category, len(c.points), time.Since(c.lastFlush)) - x.flush(c) - } - x.flushFailCache(c) - log.Infof("io consumer on %s exit on exit", c.category) - return - } - } -} diff --git a/internal/io/dataway/body.go b/internal/io/dataway/body.go index 3231d3824d..8f9b588d08 100644 --- a/internal/io/dataway/body.go +++ b/internal/io/dataway/body.go @@ -14,48 +14,169 @@ import ( "github.com/GuanceCloud/cliutils/point" ) +type walFrom int8 + +const ( + walFromMem walFrom = iota + walFromDisk +) + +func (f walFrom) String() string { + // nolint: exhaustive + switch f { + case walFromMem: + return "M" + default: + return "D" + } +} + type body struct { - buf []byte - rawLen int - gzon bool - npts int - payloadEnc point.Encoding + CacheData + + // NOTE: these 2 buffer may comes from: + // - reusable buffer that not allocated by body instance, or + // - new allocated by apply withCap() when getBody(). + // So during putBody(), do not touch these 2 buffer. + marshalBuf []byte // buffer used for dump pb binary + sendBuf []byte // buffer used for encoding points to pb/line-proto + + chksum string + + selfBuffer, // buffer that belongs to itself, and we should not drop it when putback + gzon int8 + from walFrom + checkSize bool } func (b *body) reset() { - b.buf = nil - b.rawLen = 0 - b.gzon = false - b.npts = 0 - b.payloadEnc = point.LineProtocol + b.CacheData.Payload = nil + b.CacheData.PayloadType = int32(point.Protobuf) + b.CacheData.Category = int32(point.Protobuf) + + b.CacheData.Headers = b.CacheData.Headers[:0] + b.CacheData.DynURL = "" + b.CacheData.Pts = 0 + b.CacheData.RawLen = 0 + + if b.selfBuffer != 1 { // buffer not managed by itself + b.sendBuf = nil + b.marshalBuf = nil + } + + // NOTE: do not touch b.sendBuf and b.marshalBuf, we use the buffer for encoding + // and WAL protobuf marshal, their len(x) is always it's capacity. If len(x) changed, + // this will **panic** body encoding and protobuf marshal. + + b.gzon = -1 + b.from = walFromMem +} + +func (b *body) buf() []byte { + return b.CacheData.Payload +} + +func (b *body) headers() []*HTTPHeader { + return b.CacheData.Headers +} + +func (b *body) url() string { + return b.CacheData.DynURL +} + +func (b *body) cat() point.Category { + return point.Category(b.CacheData.Category) +} + +func (b *body) enc() point.Encoding { + return point.Encoding(b.CacheData.PayloadType) +} + +func (b *body) npts() int32 { + return b.CacheData.Pts +} + +func (b *body) rawLen() int32 { + return b.CacheData.RawLen +} + +func (b *body) loadCache(data []byte) error { + if err := b.CacheData.Unmarshal(data); err != nil { + return fmt.Errorf("Unmarshal: %w", err) + } + + return nil +} + +func (b *body) dump() ([]byte, error) { + if b.checkSize { // checkSize will not set on production, just for testing cases. 
+ // NOTE: check required size before marshal, extra Size() call may cause a bit CPU time. + if s := b.CacheData.Size(); s > len(b.marshalBuf) { + return nil, fmt.Errorf("too small(%d) marshal buffer, need %d", len(b.marshalBuf), s) + } + } + + // MarshalTo() all call Size() within itself. + if n, err := b.CacheData.MarshalTo(b.marshalBuf); err != nil { + return nil, fmt.Errorf("MarshalTo: %w", err) + } else { + return b.marshalBuf[:n], nil + } } func (b *body) String() string { - return fmt.Sprintf("gzon: %v, pts: %d, buf bytes: %d, rawLen: %d", b.gzon, b.npts, len(b.buf), b.rawLen) + return fmt.Sprintf("from: %s, enc: %s, cat: %s, gzon: %v, headers: %d, pts: %d, buf bytes: %d, chksum: %s, rawLen: %d, cap: %d", + b.from, b.enc(), b.cat(), b.gzon, len(b.headers()), b.npts(), len(b.buf()), b.chksum, b.rawLen(), cap(b.sendBuf)) } -func (w *writer) zip(data []byte) ([]byte, error) { - // reset zipper on multiple parts. - // zipper may called multiple times during build HTTP bodies, - // so zipper need to reset before next round. - if w.parts > 0 { - w.zipper.buf.Reset() - w.zipper.w.Reset(w.zipper.buf) +func (b *body) pretty() string { + var arr []string + arr = append(arr, fmt.Sprintf("\n%p from: %s", b, b.from)) + arr = append(arr, fmt.Sprintf("enc: %s", b.enc())) + arr = append(arr, fmt.Sprintf("cat: %s", b.cat())) + arr = append(arr, fmt.Sprintf("gzon: %d", b.gzon)) + arr = append(arr, fmt.Sprintf("#buf: %d", len(b.buf()))) + arr = append(arr, fmt.Sprintf("#send-buf: %d", len(b.sendBuf))) + arr = append(arr, fmt.Sprintf("#mars-buf: %d", len(b.sendBuf))) + arr = append(arr, fmt.Sprintf("url: %s", b.url())) + arr = append(arr, fmt.Sprintf("raw-len: %d", b.rawLen())) + + arr = append(arr, fmt.Sprintf("headers(%d):\n", len(b.headers()))) + + for _, h := range b.headers() { + arr = append(arr, fmt.Sprintf(" %s: %s", h.Key, h.Value)) } - if _, err := w.zipper.w.Write(data); err != nil { + return strings.Join(arr, "\n") +} + +func (z *gzipWriter) zip(data []byte) ([]byte, error) { + if _, err := z.w.Write(data); err != nil { return nil, err } - if err := w.zipper.w.Flush(); err != nil { + if err := z.w.Flush(); err != nil { return nil, err } - if err := w.zipper.w.Close(); err != nil { + if err := z.w.Close(); err != nil { return nil, err } - return w.zipper.buf.Bytes(), nil + return z.buf.Bytes(), nil +} + +func isGzip(data []byte) int8 { + if len(data) < 2 { + return -1 + } + + // See: https://stackoverflow.com/a/6059342/342348 + if data[0] == 0x1f && data[1] == 0x8b { + return 1 + } else { + return 0 + } } type bodyCallback func(w *writer, b *body) error @@ -69,10 +190,13 @@ func dumpPoints(pts []*point.Point) string { return strings.Join(arr, "\n") } -func (w *writer) buildPointsBody(cb bodyCallback) error { +// buildPointsBody build points within w into line-protocol(v1) or protobuf(v2). +// +// If there too many points, it will automatically split them on multipart on dataway's MaxRawBodySize. +func (w *writer) buildPointsBody() error { var ( - start = time.Now() nptsArr []int + parts int ) // encode callback: to trace payload info. @@ -94,12 +218,6 @@ func (w *writer) buildPointsBody(cb bodyCallback) error { enc.EncodeV2(w.points) - if cap(w.sendBuffer) == 0 { - // The buffer not set before, here we make a default - // buffer to prevent too-small-buffer error from enc.Next(). 
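The isGzip helper above only inspects the two-byte gzip magic header (0x1f 0x8b, per RFC 1952). A small standalone check of the same logic against a real gzip stream; looksGzipped is a local stand-in here, not the patched function.

package main

import (
	"bytes"
	"compress/gzip"
	"fmt"
)

// looksGzipped mirrors isGzip(): -1 means "too short to tell",
// 1 means the gzip magic bytes are present, 0 means plain payload.
func looksGzipped(data []byte) int8 {
	if len(data) < 2 {
		return -1
	}
	if data[0] == 0x1f && data[1] == 0x8b {
		return 1
	}
	return 0
}

func main() {
	var buf bytes.Buffer
	zw := gzip.NewWriter(&buf)
	_, _ = zw.Write([]byte("some,line protocol=payload 123"))
	_ = zw.Close()

	fmt.Println(looksGzipped(buf.Bytes()))          // 1: gzip header present
	fmt.Println(looksGzipped([]byte("plain body"))) // 0: not gzipped
	fmt.Println(looksGzipped(nil))                  // -1: too short to tell
}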
- w.sendBuffer = make([]byte, w.batchBytesSize) - } - // for panic logging, when panics, we know: // - what these points are // - how points encoded and sent @@ -109,10 +227,10 @@ func (w *writer) buildPointsBody(cb bodyCallback) error { buf := make([]byte, 1<<12) runtime.Stack(buf, false) - log.Errorf("panic: %s\n%s", err.Error(), string(buf)) + l.Errorf("panic: %s\n%s", err.Error(), string(buf)) - log.Errorf("encode: %s, total points: %d, current part: %d, body cap: %d", - err.Error(), len(w.points), w.parts, cap(w.sendBuffer)) + l.Errorf("encode: %s, total points: %d, current part: %d, body cap: %d", + err.Error(), len(w.points), parts, w.batchBytesSize) panic(fmt.Errorf("dump points: %s", dumpPoints(w.points))) } @@ -120,68 +238,64 @@ func (w *writer) buildPointsBody(cb bodyCallback) error { }() for { - encodeBytes, ok := enc.Next(w.sendBuffer) + var ( + compactStart = time.Now() + b = getNewBufferBody(withNewBuffer(w.batchBytesSize)) + ) + + encodeBytes, ok := enc.Next(b.sendBuf) if !ok { if err := enc.LastErr(); err != nil { - log.Errorf("encode: %s, total points: %d, current part: %d, body cap: %d", - err.Error(), len(w.points), w.parts, cap(w.sendBuffer)) + l.Errorf("encode: %s, cat: %s, total points: %d, current part: %d, body cap: %d", + err.Error(), b.cat().Alias(), len(w.points), parts, cap(b.sendBuf)) return err } break } - var err error - w.body.reset() - - w.body.buf = encodeBytes - w.body.rawLen = len(encodeBytes) - w.body.gzon = w.gzip - w.body.payloadEnc = w.httpEncoding - w.body.npts = -1 - - if w.gzip { - w.body.buf, err = w.zip(encodeBytes) - if err != nil { - log.Errorf("datakit.GZip: %s", err.Error()) - continue - } + // setup body info. + b.from = walFromMem + b.CacheData.Payload = encodeBytes + b.CacheData.Category = int32(w.category) + b.CacheData.Pts = int32(nptsArr[parts]) + b.CacheData.RawLen = int32(len(encodeBytes)) + b.CacheData.PayloadType = int32(w.httpEncoding) + b.CacheData.DynURL = w.dynamicURL + for k, v := range w.httpHeaders { + b.CacheData.Headers = append(b.CacheData.Headers, &HTTPHeader{Key: k, Value: v}) } - w.body.npts = nptsArr[w.parts] - buildBodyCostVec.WithLabelValues( - w.category.String(), + b.cat().String(), w.httpEncoding.String(), - ).Observe(float64(time.Since(start)) / float64(time.Second)) + "enc", + ).Observe(float64(time.Since(compactStart)) / float64(time.Second)) buildBodyBatchBytesVec.WithLabelValues( - w.category.String(), + b.cat().String(), w.httpEncoding.String(), - fmt.Sprintf("%v", w.gzip), - ).Observe(float64(len(w.body.buf))) + "raw", + ).Observe(float64(b.rawLen())) buildBodyBatchPointsVec.WithLabelValues( - w.category.String(), + b.cat().String(), w.httpEncoding.String(), - fmt.Sprintf("%v", w.gzip), - ).Observe(float64(w.body.npts)) - w.parts++ - - if cb != nil { - if err := cb(w, w.body); err != nil { - log.Warnf("send %d points to %q(gzip: %v) bytes failed: %q, ignored", - w.body.npts, w.category, w.gzip, err.Error()) - } else { - log.Debugf("send part %d with %d points to %q ok, bytes: %d/%d(zipped)", - w.parts, w.body.npts, w.category, len(encodeBytes), len(w.body.buf)) + ).Observe(float64(b.npts())) + + if w.bcb != nil { + if err := w.bcb(w, b); err != nil { + l.Warnf("%d points to %q bytes failed: %q, ignored", + nptsArr[parts], w.category, err.Error()) } } + + parts++ } buildBodyBatchCountVec.WithLabelValues( w.category.String(), w.httpEncoding.String(), - ).Observe(float64(w.parts)) + ).Observe(float64(parts)) return nil } diff --git a/internal/io/dataway/body_test.go b/internal/io/dataway/body_test.go index 
20f4239c37..86e92faa22 100644 --- a/internal/io/dataway/body_test.go +++ b/internal/io/dataway/body_test.go @@ -113,10 +113,10 @@ func TestCrashBuildBody(t *T.T) { w := getWriter() WithPoints(pts)(w) - WithBatchBytesSize(10 * 1024 * 1024)(w) + WithMaxBodyCap(10 * 1024 * 1024)(w) WithHTTPEncoding(point.LineProtocol)(w) - assert.NoError(t, w.buildPointsBody(nil)) + assert.NoError(t, w.buildPointsBody()) putWriter(w) } }(i) @@ -174,9 +174,7 @@ func TestBuildBody(t *T.T) { WithBatchSize(batchSize) WithHTTPEncoding(tc.enc)(w) - assert.NoError(t, w.buildPointsBody(nil)) - - t.Logf("get %d bodies", w.parts) + assert.NoError(t, w.buildPointsBody()) }) } @@ -191,12 +189,12 @@ func TestBuildBody(t *T.T) { WithHTTPEncoding(tc.enc)(w) var arr []*body - cb := func(_ *writer, b *body) error { + WithBodyCallback(func(_ *writer, b *body) error { arr = append(arr, b) return nil - } + })(w) - w.buildPointsBody(cb) + w.buildPointsBody() var ( extractPts []*point.Point @@ -209,17 +207,17 @@ func TestBuildBody(t *T.T) { t.Logf("decoded into %d parts", len(arr)) for _, x := range arr { - assert.True(t, x.npts > 0) - assert.True(t, x.rawLen > 0) - assert.Equal(t, tc.enc, x.payloadEnc) + assert.True(t, x.npts() > 0) + assert.True(t, x.rawLen() > 0) + assert.Equal(t, tc.enc, x.enc()) var ( - raw = x.buf + raw = x.buf() err error ) - if x.gzon { - raw, err = uhttp.Unzip(x.buf) + if x.gzon == 1 { + raw, err = uhttp.Unzip(x.buf()) require.NoError(t, err) } @@ -244,16 +242,16 @@ func TestBuildBody(t *T.T) { t.Logf("enc: %s", tc.enc) WithPoints(tc.pts)(w) - WithBatchBytesSize(bodyByteBatch)(w) + WithMaxBodyCap(bodyByteBatch)(w) WithHTTPEncoding(tc.enc)(w) var arr []*body - cb := func(_ *writer, b *body) error { + WithBodyCallback(func(_ *writer, b *body) error { arr = append(arr, b) return nil - } + })(w) - w.buildPointsBody(cb) + w.buildPointsBody() assert.True(t, len(arr) > 0) @@ -268,16 +266,16 @@ func TestBuildBody(t *T.T) { t.Logf("decoded into %d parts(byte size: %d)", len(arr), bodyByteBatch) for _, x := range arr { - assert.True(t, x.npts > 0) - assert.True(t, x.rawLen > 0) - assert.Equal(t, tc.enc, x.payloadEnc) + assert.True(t, x.npts() > 0) + assert.True(t, x.rawLen() > 0) + assert.Equal(t, tc.enc, x.enc()) var ( - raw = x.buf + raw = x.buf() err error ) - if x.gzon { - raw, err = uhttp.Unzip(x.buf) + if x.gzon == 1 { + raw, err = uhttp.Unzip(x.buf()) if err != nil { assert.NoError(t, err) } @@ -306,6 +304,7 @@ func BenchmarkBuildBody(b *T.B) { name string pts []*point.Point batch int + gz int enc point.Encoding }{ { @@ -350,6 +349,14 @@ func BenchmarkBuildBody(b *T.B) { enc: point.Protobuf, }, + { + name: "gz-1k-pts-on-protobuf-batch1024", + pts: r.Rand(1024), + batch: 1024, + enc: point.Protobuf, + gz: 1, + }, + { name: "10k-pts-on-protobuf-batch4k", pts: r.Rand(10240), @@ -366,17 +373,253 @@ func BenchmarkBuildBody(b *T.B) { } for _, bc := range cases { + b.ResetTimer() b.Run(bc.name, func(b *T.B) { - w := getWriter() + w := getWriter(WithBodyCallback(func(w *writer, body *body) error { + putBody(body) // release body + return nil + })) defer putWriter(w) WithBatchSize(bc.batch)(w) WithPoints(bc.pts)(w) WithHTTPEncoding(bc.enc)(w) + WithGzip(bc.gz)(w) for i := 0; i < b.N; i++ { - w.buildPointsBody(nil) + w.buildPointsBody() } }) } } + +func TestBodyCacheData(t *T.T) { + t.Run(`basic`, func(t *T.T) { + cb := func(w *writer, b *body) error { + t.Logf("send-buf: %p/%d", b.sendBuf, len(b.sendBuf)) + t.Logf("buf(): %p/%d", b.buf(), len(b.buf())) + t.Logf("payload: %p/%d", b.CacheData.Payload, 
len(b.CacheData.Payload)) + + assert.Equal(t, len(b.sendBuf), cap(b.sendBuf)) // b.sendBuf should always equal to it's cap + assert.Equal(t, len(b.marshalBuf), cap(b.marshalBuf)) // b.sendBuf should always equal to it's cap + + // b.buf() should point to b.sendBuf + assert.Equal(t, b.sendBuf[:len(b.buf())], b.buf()) + + return nil + } + + r := point.NewRander() + pts := r.Rand(1000) + + w := getWriter(WithBodyCallback(cb)) + defer putWriter(w) + WithPoints(pts)(w) + WithMaxBodyCap(10 * 1024 * 1024)(w) + WithHTTPEncoding(point.LineProtocol)(w) + + w.buildPointsBody() + }) + + t.Run(`body-dump-and-load`, func(t *T.T) { + cb := func(w *writer, b *body) error { + t.Logf("send-buf: %p/%d", b.sendBuf, len(b.sendBuf)) + t.Logf("buf(): %p/%d", b.buf(), len(b.buf())) + t.Logf("payload: %p/%d", b.CacheData.Payload, len(b.CacheData.Payload)) + + pb, err := b.dump() + assert.NoError(t, err) + t.Logf("marshal-buf: %p?/%d", b.marshalBuf, len(b.marshalBuf)) + t.Logf("pb: %p?/%d", pb, len(pb)) + + assert.Equal(t, len(b.sendBuf), cap(b.sendBuf)) // b.sendBuf should always equal to it's cap + assert.Equal(t, len(b.marshalBuf), cap(b.marshalBuf)) // b.sendBuf should always equal to it's cap + + // b.buf() should point to b.sendBuf + assert.Equal(t, b.marshalBuf[:len(pb)], pb) + + newBody := getNewBufferBody(withNewBuffer(defaultBatchSize)) + defer putBody(newBody) + + assert.NoError(t, newBody.loadCache(pb)) // newBody load pb data + assert.Equal(t, newBody.CacheData, b.CacheData) + assert.Equal(t, b.buf(), newBody.buf()) + + t.Logf("body: %s", newBody.pretty()) + + assert.Len(t, newBody.headers(), 2) + for _, h := range newBody.headers() { + switch h.Key { + case "header-1": + assert.Equal(t, `value-1`, h.Value) + case "header-2": + assert.Equal(t, `value-2`, h.Value) + default: + assert.Truef(t, false, "should not been here") + } + } + + assert.Equal(t, "http://some.dynamic.url?token=tkn_xyz", b.url()) + + t.Logf("buf: %q", newBody.buf()[:10]) + + putBody(b) + + return nil + } + + pts := point.RandPoints(100) + w := getWriter(WithBodyCallback(cb), + WithHTTPHeader("header-1", "value-1"), + WithHTTPHeader("header-2", "value-2"), + WithDynamicURL("http://some.dynamic.url?token=tkn_xyz"), + WithPoints(pts), + WithHTTPEncoding(point.LineProtocol), + ) + defer putWriter(w) + + w.buildPointsBody() + }) +} + +func TestPBMarshalSize(t *T.T) { + t.Run(`basic-1mb-body`, func(t *T.T) { + sendBuf := make([]byte, 1<<20) + marshalBuf := make([]byte, 1<<20+100*(1<<10)) + b := getReuseBufferBody(withReusableBuffer(sendBuf, marshalBuf)) + + b.CacheData.Category = int32(point.Metric) + b.CacheData.PayloadType = int32(point.Protobuf) + b.CacheData.Payload = sendBuf // we really get 1MB body to send + b.CacheData.Pts = 10000 + b.CacheData.RawLen = (1 << 20) + b.Headers = append(b.Headers, &HTTPHeader{Key: HeaderXGlobalTags, Value: "looooooooooooooooooooooong value"}) + b.DynURL = "https://openway.guance.com/v1/write/logging?token=tkn_11111111111111111111111" + + pbbuf, err := b.dump() + assert.NoError(t, err) + + t.Logf("#pbbuf: %d, raised: %.8f", len(pbbuf), float64(len(pbbuf)-len(b.buf()))/float64(len(b.buf()))) + }) + + t.Run(`basic-4mb-body`, func(t *T.T) { + size := 4 * (1 << 20) + sendBuf := make([]byte, size) + marshalBuf := make([]byte, size+int(float64(size)*.1)) + b := getReuseBufferBody(withReusableBuffer(sendBuf, marshalBuf)) + + b.CacheData.Category = int32(point.Metric) + b.CacheData.PayloadType = int32(point.Protobuf) + b.CacheData.Payload = sendBuf // we really get 1MB body to send + b.CacheData.Pts = 10000 + 
b.CacheData.RawLen = int32(size) + b.Headers = append(b.Headers, &HTTPHeader{Key: HeaderXGlobalTags, Value: "looooooooooooooooooooooong value"}) + b.DynURL = "https://openway.guance.com/v1/write/logging?token=tkn_11111111111111111111111" + + pbbuf, err := b.dump() + assert.NoError(t, err) + + t.Logf("#pbbuf: %d, raised: %.8f", len(pbbuf), float64(len(pbbuf)-len(b.buf()))/float64(len(b.buf()))) + }) + + t.Run(`basic-10mb-body`, func(t *T.T) { + size := 10 * (1 << 20) + sendBuf := make([]byte, size) + marshalBuf := make([]byte, size+int(float64(size)*.1)) + b := getReuseBufferBody(withReusableBuffer(sendBuf, marshalBuf)) + + b.CacheData.Category = int32(point.Metric) + b.CacheData.PayloadType = int32(point.Protobuf) + b.CacheData.Payload = sendBuf // we really get 1MB body to send + b.CacheData.Pts = 10000 + b.CacheData.RawLen = int32(size) + b.Headers = append(b.Headers, &HTTPHeader{Key: HeaderXGlobalTags, Value: "looooooooooooooooooooooong value"}) + b.DynURL = "https://openway.guance.com/v1/write/logging?token=tkn_11111111111111111111111" + + pbbuf, err := b.dump() + assert.NoError(t, err) + + t.Logf("#pbbuf: %d, raised: %.8f", len(pbbuf), float64(len(pbbuf)-len(b.buf()))/float64(len(b.buf()))) + }) + + t.Run(`error`, func(t *T.T) { + size := 1 << 20 + sendBuf := make([]byte, size) + marshalBuf := make([]byte, size) + b := getReuseBufferBody(withReusableBuffer(sendBuf, marshalBuf)) + + b.CacheData.Category = int32(point.Metric) + b.CacheData.PayloadType = int32(point.Protobuf) + b.CacheData.Payload = sendBuf // we really get 1MB body to send + b.CacheData.Pts = 10000 + b.CacheData.RawLen = int32(size) + b.Headers = append(b.Headers, &HTTPHeader{Key: HeaderXGlobalTags, Value: "looooooooooooooooooooooong value"}) + b.DynURL = "https://openway.guance.com/v1/write/logging?token=tkn_11111111111111111111111" + b.checkSize = true // enable size checking, or panic + + _, err := b.dump() + assert.Error(t, err) + t.Logf("[expected] dump error: %s", err.Error()) + }) +} + +func BenchmarkBodyDumpAndLoad(b *T.B) { + b.Run(`get-CacheData-size`, func(b *T.B) { + size := 1 << 20 + sendBuf := make([]byte, size) + marshalBuf := make([]byte, size) + _b := getReuseBufferBody(withReusableBuffer(sendBuf, marshalBuf)) + + _b.CacheData.Category = int32(point.Metric) + _b.CacheData.PayloadType = int32(point.Protobuf) + _b.CacheData.Payload = sendBuf // we really get 1MB body to send + _b.CacheData.Pts = 10000 + _b.CacheData.RawLen = int32(size) + _b.Headers = append(_b.Headers, &HTTPHeader{Key: HeaderXGlobalTags, Value: "looooooooooooooooooooooong value"}) + _b.DynURL = "https://openway.guance.com/v1/write/logging?token=tkn_11111111111111111111111" + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _b.Size() + } + }) + + b.Run(`dump-and-load`, func(b *T.B) { + cb := func(w *writer, x *body) error { + defer putBody(x) + + func() { // check if any error + pb, err := x.dump() + assert.NoError(b, err) + + newBody := getNewBufferBody(withNewBuffer(defaultBatchSize)) + assert.NoError(b, newBody.loadCache(pb)) + putBody(newBody) + }() + + b.ResetTimer() + // reuse @x during benchmark + for i := 0; i < b.N; i++ { + pb, _ := x.dump() + + newBody := getNewBufferBody(withNewBuffer(defaultBatchSize)) + newBody.loadCache(pb) + putBody(newBody) + } + return nil + } + + pts := point.RandPoints(100) + w := getWriter(WithBodyCallback(cb), + WithHTTPHeader("header-1", "value-1"), + WithHTTPHeader("header-2", "value-2"), + WithDynamicURL("http://some.dynamic.url?token=tkn_xyz"), + WithPoints(pts), + WithBodyCallback(cb), + ) + + 
defer putWriter(w) + + w.buildPointsBody() + }) +} diff --git a/internal/io/dataway/cachedata.pb.go b/internal/io/dataway/cachedata.pb.go index 2ed2d012e5..551c07efae 100644 --- a/internal/io/dataway/cachedata.pb.go +++ b/internal/io/dataway/cachedata.pb.go @@ -1,163 +1,953 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.28.1 -// protoc v3.17.3 +// Code generated by protoc-gen-gogo. DO NOT EDIT. // source: cachedata.proto package dataway import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" + bytes "bytes" + fmt "fmt" + proto "github.com/gogo/protobuf/proto" + io "io" + math "math" + math_bits "math/bits" reflect "reflect" - sync "sync" + strings "strings" ) -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf -// fail-cache proto message. -type CacheData struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. +const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package - Category int32 `protobuf:"varint,1,opt,name=category,proto3" json:"category,omitempty"` - PayloadType int32 `protobuf:"varint,2,opt,name=payloadType,proto3" json:"payloadType,omitempty"` - Payload []byte `protobuf:"bytes,3,opt,name=payload,proto3" json:"payload,omitempty"` +type HTTPHeader struct { + Key string `protobuf:"bytes,1,opt,name=key,proto3" json:"key,omitempty"` + Value string `protobuf:"bytes,2,opt,name=value,proto3" json:"value,omitempty"` } -func (x *CacheData) Reset() { - *x = CacheData{} - if protoimpl.UnsafeEnabled { - mi := &file_cachedata_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) +func (m *HTTPHeader) Reset() { *m = HTTPHeader{} } +func (*HTTPHeader) ProtoMessage() {} +func (*HTTPHeader) Descriptor() ([]byte, []int) { + return fileDescriptor_b0dd4f457c7a1df1, []int{0} +} +func (m *HTTPHeader) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *HTTPHeader) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_HTTPHeader.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil } } +func (m *HTTPHeader) XXX_Merge(src proto.Message) { + xxx_messageInfo_HTTPHeader.Merge(m, src) +} +func (m *HTTPHeader) XXX_Size() int { + return m.Size() +} +func (m *HTTPHeader) XXX_DiscardUnknown() { + xxx_messageInfo_HTTPHeader.DiscardUnknown(m) +} + +var xxx_messageInfo_HTTPHeader proto.InternalMessageInfo -func (x *CacheData) String() string { - return protoimpl.X.MessageStringOf(x) +func (m *HTTPHeader) GetKey() string { + if m != nil { + return m.Key + } + return "" } -func (*CacheData) ProtoMessage() {} +func (m *HTTPHeader) GetValue() string { + if m != nil { + return m.Value + } + return "" +} -func (x *CacheData) ProtoReflect() 
protoreflect.Message { - mi := &file_cachedata_proto_msgTypes[0] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) +// fail-cache proto message. +// NOTE: do not change field's number if you need to add more. +type CacheData struct { + Category int32 `protobuf:"varint,1,opt,name=category,proto3" json:"category,omitempty"` + PayloadType int32 `protobuf:"varint,2,opt,name=payloadType,proto3" json:"payloadType,omitempty"` + Payload []byte `protobuf:"bytes,3,opt,name=payload,proto3" json:"payload,omitempty"` + Pts int32 `protobuf:"varint,4,opt,name=pts,proto3" json:"pts,omitempty"` + RawLen int32 `protobuf:"varint,5,opt,name=rawLen,proto3" json:"rawLen,omitempty"` + Headers []*HTTPHeader `protobuf:"bytes,6,rep,name=headers,proto3" json:"headers,omitempty"` + DynURL string `protobuf:"bytes,7,opt,name=dynURL,proto3" json:"dynURL,omitempty"` +} + +func (m *CacheData) Reset() { *m = CacheData{} } +func (*CacheData) ProtoMessage() {} +func (*CacheData) Descriptor() ([]byte, []int) { + return fileDescriptor_b0dd4f457c7a1df1, []int{1} +} +func (m *CacheData) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *CacheData) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_CacheData.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err } - return ms + return b[:n], nil } - return mi.MessageOf(x) +} +func (m *CacheData) XXX_Merge(src proto.Message) { + xxx_messageInfo_CacheData.Merge(m, src) +} +func (m *CacheData) XXX_Size() int { + return m.Size() +} +func (m *CacheData) XXX_DiscardUnknown() { + xxx_messageInfo_CacheData.DiscardUnknown(m) } -// Deprecated: Use CacheData.ProtoReflect.Descriptor instead. 
-func (*CacheData) Descriptor() ([]byte, []int) { - return file_cachedata_proto_rawDescGZIP(), []int{0} +var xxx_messageInfo_CacheData proto.InternalMessageInfo + +func (m *CacheData) GetCategory() int32 { + if m != nil { + return m.Category + } + return 0 } -func (x *CacheData) GetCategory() int32 { - if x != nil { - return x.Category +func (m *CacheData) GetPayloadType() int32 { + if m != nil { + return m.PayloadType } return 0 } -func (x *CacheData) GetPayloadType() int32 { - if x != nil { - return x.PayloadType +func (m *CacheData) GetPayload() []byte { + if m != nil { + return m.Payload + } + return nil +} + +func (m *CacheData) GetPts() int32 { + if m != nil { + return m.Pts } return 0 } -func (x *CacheData) GetPayload() []byte { - if x != nil { - return x.Payload +func (m *CacheData) GetRawLen() int32 { + if m != nil { + return m.RawLen + } + return 0 +} + +func (m *CacheData) GetHeaders() []*HTTPHeader { + if m != nil { + return m.Headers } return nil } -var File_cachedata_proto protoreflect.FileDescriptor +func (m *CacheData) GetDynURL() string { + if m != nil { + return m.DynURL + } + return "" +} + +func init() { + proto.RegisterType((*HTTPHeader)(nil), "dataway.HTTPHeader") + proto.RegisterType((*CacheData)(nil), "dataway.CacheData") +} + +func init() { proto.RegisterFile("cachedata.proto", fileDescriptor_b0dd4f457c7a1df1) } -var file_cachedata_proto_rawDesc = []byte{ - 0x0a, 0x0f, 0x63, 0x61, 0x63, 0x68, 0x65, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x12, 0x07, 0x64, 0x61, 0x74, 0x61, 0x77, 0x61, 0x79, 0x22, 0x63, 0x0a, 0x09, 0x43, 0x61, - 0x63, 0x68, 0x65, 0x44, 0x61, 0x74, 0x61, 0x12, 0x1a, 0x0a, 0x08, 0x63, 0x61, 0x74, 0x65, 0x67, - 0x6f, 0x72, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x08, 0x63, 0x61, 0x74, 0x65, 0x67, - 0x6f, 0x72, 0x79, 0x12, 0x20, 0x0a, 0x0b, 0x70, 0x61, 0x79, 0x6c, 0x6f, 0x61, 0x64, 0x54, 0x79, - 0x70, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0b, 0x70, 0x61, 0x79, 0x6c, 0x6f, 0x61, - 0x64, 0x54, 0x79, 0x70, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x70, 0x61, 0x79, 0x6c, 0x6f, 0x61, 0x64, - 0x18, 0x03, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x07, 0x70, 0x61, 0x79, 0x6c, 0x6f, 0x61, 0x64, 0x42, - 0x0c, 0x5a, 0x0a, 0x2e, 0x2f, 0x3b, 0x64, 0x61, 0x74, 0x61, 0x77, 0x61, 0x79, 0x62, 0x06, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x33, +var fileDescriptor_b0dd4f457c7a1df1 = []byte{ + // 292 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0x4f, 0x4e, 0x4c, 0xce, + 0x48, 0x4d, 0x49, 0x2c, 0x49, 0xd4, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x62, 0x07, 0xb1, 0xcb, + 0x13, 0x2b, 0x95, 0x4c, 0xb8, 0xb8, 0x3c, 0x42, 0x42, 0x02, 0x3c, 0x52, 0x13, 0x53, 0x52, 0x8b, + 0x84, 0x04, 0xb8, 0x98, 0xb3, 0x53, 0x2b, 0x25, 0x18, 0x15, 0x18, 0x35, 0x38, 0x83, 0x40, 0x4c, + 0x21, 0x11, 0x2e, 0xd6, 0xb2, 0xc4, 0x9c, 0xd2, 0x54, 0x09, 0x26, 0xb0, 0x18, 0x84, 0xa3, 0x74, + 0x85, 0x91, 0x8b, 0xd3, 0x19, 0x64, 0xa4, 0x4b, 0x62, 0x49, 0xa2, 0x90, 0x14, 0x17, 0x47, 0x72, + 0x62, 0x49, 0x6a, 0x7a, 0x7e, 0x11, 0x44, 0x2b, 0x6b, 0x10, 0x9c, 0x2f, 0xa4, 0xc0, 0xc5, 0x5d, + 0x90, 0x58, 0x99, 0x93, 0x9f, 0x98, 0x12, 0x52, 0x59, 0x00, 0x31, 0x85, 0x35, 0x08, 0x59, 0x48, + 0x48, 0x82, 0x8b, 0x1d, 0xca, 0x95, 0x60, 0x56, 0x60, 0xd4, 0xe0, 0x09, 0x82, 0x71, 0x41, 0xae, + 0x29, 0x28, 0x29, 0x96, 0x60, 0x01, 0xeb, 0x01, 0x31, 0x85, 0xc4, 0xb8, 0xd8, 0x8a, 0x12, 0xcb, + 0x7d, 0x52, 0xf3, 0x24, 0x58, 0xc1, 0x82, 0x50, 0x9e, 0x90, 0x2e, 0x17, 0x7b, 0x06, 0xd8, 0x07, + 0xc5, 0x12, 0x6c, 0x0a, 0xcc, 0x1a, 0xdc, 0x46, 0xc2, 0x7a, 
0x50, 0x0f, 0xea, 0x21, 0x7c, 0x17, + 0x04, 0x53, 0x03, 0x32, 0x26, 0xa5, 0x32, 0x2f, 0x34, 0xc8, 0x47, 0x82, 0x1d, 0xec, 0x2b, 0x28, + 0xcf, 0xc9, 0xe1, 0xc2, 0x43, 0x39, 0x86, 0x1b, 0x0f, 0xe5, 0x18, 0x3e, 0x3c, 0x94, 0x63, 0x6c, + 0x78, 0x24, 0xc7, 0xb8, 0xe2, 0x91, 0x1c, 0xe3, 0x89, 0x47, 0x72, 0x8c, 0x17, 0x1e, 0xc9, 0x31, + 0x3e, 0x78, 0x24, 0xc7, 0xf8, 0xe2, 0x91, 0x1c, 0xc3, 0x87, 0x47, 0x72, 0x8c, 0x13, 0x1e, 0xcb, + 0x31, 0x5c, 0x78, 0x2c, 0xc7, 0x70, 0xe3, 0xb1, 0x1c, 0x43, 0x14, 0x97, 0x9e, 0xbe, 0x35, 0xd4, + 0xb6, 0x24, 0x36, 0x70, 0xf0, 0x1a, 0x03, 0x02, 0x00, 0x00, 0xff, 0xff, 0x70, 0x39, 0x48, 0xba, + 0x71, 0x01, 0x00, 0x00, } -var ( - file_cachedata_proto_rawDescOnce sync.Once - file_cachedata_proto_rawDescData = file_cachedata_proto_rawDesc -) +func (this *HTTPHeader) Equal(that interface{}) bool { + if that == nil { + return this == nil + } -func file_cachedata_proto_rawDescGZIP() []byte { - file_cachedata_proto_rawDescOnce.Do(func() { - file_cachedata_proto_rawDescData = protoimpl.X.CompressGZIP(file_cachedata_proto_rawDescData) - }) - return file_cachedata_proto_rawDescData -} - -var file_cachedata_proto_msgTypes = make([]protoimpl.MessageInfo, 1) -var file_cachedata_proto_goTypes = []interface{}{ - (*CacheData)(nil), // 0: dataway.CacheData -} -var file_cachedata_proto_depIdxs = []int32{ - 0, // [0:0] is the sub-list for method output_type - 0, // [0:0] is the sub-list for method input_type - 0, // [0:0] is the sub-list for extension type_name - 0, // [0:0] is the sub-list for extension extendee - 0, // [0:0] is the sub-list for field type_name -} - -func init() { file_cachedata_proto_init() } -func file_cachedata_proto_init() { - if File_cachedata_proto != nil { - return - } - if !protoimpl.UnsafeEnabled { - file_cachedata_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*CacheData); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: file_cachedata_proto_rawDesc, - NumEnums: 0, - NumMessages: 1, - NumExtensions: 0, - NumServices: 0, - }, - GoTypes: file_cachedata_proto_goTypes, - DependencyIndexes: file_cachedata_proto_depIdxs, - MessageInfos: file_cachedata_proto_msgTypes, - }.Build() - File_cachedata_proto = out.File - file_cachedata_proto_rawDesc = nil - file_cachedata_proto_goTypes = nil - file_cachedata_proto_depIdxs = nil + that1, ok := that.(*HTTPHeader) + if !ok { + that2, ok := that.(HTTPHeader) + if ok { + that1 = &that2 + } else { + return false + } + } + if that1 == nil { + return this == nil + } else if this == nil { + return false + } + if this.Key != that1.Key { + return false + } + if this.Value != that1.Value { + return false + } + return true +} +func (this *CacheData) Equal(that interface{}) bool { + if that == nil { + return this == nil + } + + that1, ok := that.(*CacheData) + if !ok { + that2, ok := that.(CacheData) + if ok { + that1 = &that2 + } else { + return false + } + } + if that1 == nil { + return this == nil + } else if this == nil { + return false + } + if this.Category != that1.Category { + return false + } + if this.PayloadType != that1.PayloadType { + return false + } + if !bytes.Equal(this.Payload, that1.Payload) { + return false + } + if this.Pts != that1.Pts { + return false + } + if this.RawLen != that1.RawLen { + return false + } + if len(this.Headers) != 
len(that1.Headers) { + return false + } + for i := range this.Headers { + if !this.Headers[i].Equal(that1.Headers[i]) { + return false + } + } + if this.DynURL != that1.DynURL { + return false + } + return true +} +func (this *HTTPHeader) GoString() string { + if this == nil { + return "nil" + } + s := make([]string, 0, 6) + s = append(s, "&dataway.HTTPHeader{") + s = append(s, "Key: "+fmt.Sprintf("%#v", this.Key)+",\n") + s = append(s, "Value: "+fmt.Sprintf("%#v", this.Value)+",\n") + s = append(s, "}") + return strings.Join(s, "") +} +func (this *CacheData) GoString() string { + if this == nil { + return "nil" + } + s := make([]string, 0, 11) + s = append(s, "&dataway.CacheData{") + s = append(s, "Category: "+fmt.Sprintf("%#v", this.Category)+",\n") + s = append(s, "PayloadType: "+fmt.Sprintf("%#v", this.PayloadType)+",\n") + s = append(s, "Payload: "+fmt.Sprintf("%#v", this.Payload)+",\n") + s = append(s, "Pts: "+fmt.Sprintf("%#v", this.Pts)+",\n") + s = append(s, "RawLen: "+fmt.Sprintf("%#v", this.RawLen)+",\n") + if this.Headers != nil { + s = append(s, "Headers: "+fmt.Sprintf("%#v", this.Headers)+",\n") + } + s = append(s, "DynURL: "+fmt.Sprintf("%#v", this.DynURL)+",\n") + s = append(s, "}") + return strings.Join(s, "") +} +func valueToGoStringCachedata(v interface{}, typ string) string { + rv := reflect.ValueOf(v) + if rv.IsNil() { + return "nil" + } + pv := reflect.Indirect(rv).Interface() + return fmt.Sprintf("func(v %v) *%v { return &v } ( %#v )", typ, typ, pv) +} +func (m *HTTPHeader) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *HTTPHeader) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *HTTPHeader) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if len(m.Value) > 0 { + i -= len(m.Value) + copy(dAtA[i:], m.Value) + i = encodeVarintCachedata(dAtA, i, uint64(len(m.Value))) + i-- + dAtA[i] = 0x12 + } + if len(m.Key) > 0 { + i -= len(m.Key) + copy(dAtA[i:], m.Key) + i = encodeVarintCachedata(dAtA, i, uint64(len(m.Key))) + i-- + dAtA[i] = 0xa + } + return len(dAtA) - i, nil +} + +func (m *CacheData) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *CacheData) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *CacheData) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if len(m.DynURL) > 0 { + i -= len(m.DynURL) + copy(dAtA[i:], m.DynURL) + i = encodeVarintCachedata(dAtA, i, uint64(len(m.DynURL))) + i-- + dAtA[i] = 0x3a + } + if len(m.Headers) > 0 { + for iNdEx := len(m.Headers) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Headers[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintCachedata(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x32 + } + } + if m.RawLen != 0 { + i = encodeVarintCachedata(dAtA, i, uint64(m.RawLen)) + i-- + dAtA[i] = 0x28 + } + if m.Pts != 0 { + i = encodeVarintCachedata(dAtA, i, uint64(m.Pts)) + i-- + dAtA[i] = 0x20 + } + if len(m.Payload) > 0 { + i -= len(m.Payload) + copy(dAtA[i:], m.Payload) + i = encodeVarintCachedata(dAtA, i, uint64(len(m.Payload))) + 
i-- + dAtA[i] = 0x1a + } + if m.PayloadType != 0 { + i = encodeVarintCachedata(dAtA, i, uint64(m.PayloadType)) + i-- + dAtA[i] = 0x10 + } + if m.Category != 0 { + i = encodeVarintCachedata(dAtA, i, uint64(m.Category)) + i-- + dAtA[i] = 0x8 + } + return len(dAtA) - i, nil +} + +func encodeVarintCachedata(dAtA []byte, offset int, v uint64) int { + offset -= sovCachedata(v) + base := offset + for v >= 1<<7 { + dAtA[offset] = uint8(v&0x7f | 0x80) + v >>= 7 + offset++ + } + dAtA[offset] = uint8(v) + return base +} +func (m *HTTPHeader) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + l = len(m.Key) + if l > 0 { + n += 1 + l + sovCachedata(uint64(l)) + } + l = len(m.Value) + if l > 0 { + n += 1 + l + sovCachedata(uint64(l)) + } + return n } + +func (m *CacheData) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if m.Category != 0 { + n += 1 + sovCachedata(uint64(m.Category)) + } + if m.PayloadType != 0 { + n += 1 + sovCachedata(uint64(m.PayloadType)) + } + l = len(m.Payload) + if l > 0 { + n += 1 + l + sovCachedata(uint64(l)) + } + if m.Pts != 0 { + n += 1 + sovCachedata(uint64(m.Pts)) + } + if m.RawLen != 0 { + n += 1 + sovCachedata(uint64(m.RawLen)) + } + if len(m.Headers) > 0 { + for _, e := range m.Headers { + l = e.Size() + n += 1 + l + sovCachedata(uint64(l)) + } + } + l = len(m.DynURL) + if l > 0 { + n += 1 + l + sovCachedata(uint64(l)) + } + return n +} + +func sovCachedata(x uint64) (n int) { + return (math_bits.Len64(x|1) + 6) / 7 +} +func sozCachedata(x uint64) (n int) { + return sovCachedata(uint64((x << 1) ^ uint64((int64(x) >> 63)))) +} +func (this *HTTPHeader) String() string { + if this == nil { + return "nil" + } + s := strings.Join([]string{`&HTTPHeader{`, + `Key:` + fmt.Sprintf("%v", this.Key) + `,`, + `Value:` + fmt.Sprintf("%v", this.Value) + `,`, + `}`, + }, "") + return s +} +func (this *CacheData) String() string { + if this == nil { + return "nil" + } + repeatedStringForHeaders := "[]*HTTPHeader{" + for _, f := range this.Headers { + repeatedStringForHeaders += strings.Replace(f.String(), "HTTPHeader", "HTTPHeader", 1) + "," + } + repeatedStringForHeaders += "}" + s := strings.Join([]string{`&CacheData{`, + `Category:` + fmt.Sprintf("%v", this.Category) + `,`, + `PayloadType:` + fmt.Sprintf("%v", this.PayloadType) + `,`, + `Payload:` + fmt.Sprintf("%v", this.Payload) + `,`, + `Pts:` + fmt.Sprintf("%v", this.Pts) + `,`, + `RawLen:` + fmt.Sprintf("%v", this.RawLen) + `,`, + `Headers:` + repeatedStringForHeaders + `,`, + `DynURL:` + fmt.Sprintf("%v", this.DynURL) + `,`, + `}`, + }, "") + return s +} +func valueToStringCachedata(v interface{}) string { + rv := reflect.ValueOf(v) + if rv.IsNil() { + return "nil" + } + pv := reflect.Indirect(rv).Interface() + return fmt.Sprintf("*%v", pv) +} +func (m *HTTPHeader) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: HTTPHeader: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: HTTPHeader: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for 
field Key", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthCachedata + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthCachedata + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Key = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthCachedata + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthCachedata + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Value = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipCachedata(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthCachedata + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *CacheData) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: CacheData: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: CacheData: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Category", wireType) + } + m.Category = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Category |= int32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 2: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field PayloadType", wireType) + } + m.PayloadType = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.PayloadType |= int32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Payload", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthCachedata + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + 
return ErrInvalidLengthCachedata + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Payload = append(m.Payload[:0], dAtA[iNdEx:postIndex]...) + if m.Payload == nil { + m.Payload = []byte{} + } + iNdEx = postIndex + case 4: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Pts", wireType) + } + m.Pts = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Pts |= int32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field RawLen", wireType) + } + m.RawLen = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.RawLen |= int32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 6: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Headers", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthCachedata + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthCachedata + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Headers = append(m.Headers, &HTTPHeader{}) + if err := m.Headers[len(m.Headers)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + case 7: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field DynURL", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCachedata + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthCachedata + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthCachedata + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.DynURL = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipCachedata(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthCachedata + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func skipCachedata(dAtA []byte) (n int, err error) { + l := len(dAtA) + iNdEx := 0 + depth := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowCachedata + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + wireType := int(wire & 0x7) + switch wireType { + case 0: + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowCachedata + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + iNdEx++ + if dAtA[iNdEx-1] < 0x80 { + break + } + } + case 1: + iNdEx += 8 + case 2: + var length int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowCachedata + } + if iNdEx >= l { + 
return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + length |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if length < 0 { + return 0, ErrInvalidLengthCachedata + } + iNdEx += length + case 3: + depth++ + case 4: + if depth == 0 { + return 0, ErrUnexpectedEndOfGroupCachedata + } + depth-- + case 5: + iNdEx += 4 + default: + return 0, fmt.Errorf("proto: illegal wireType %d", wireType) + } + if iNdEx < 0 { + return 0, ErrInvalidLengthCachedata + } + if depth == 0 { + return iNdEx, nil + } + } + return 0, io.ErrUnexpectedEOF +} + +var ( + ErrInvalidLengthCachedata = fmt.Errorf("proto: negative length found during unmarshaling") + ErrIntOverflowCachedata = fmt.Errorf("proto: integer overflow") + ErrUnexpectedEndOfGroupCachedata = fmt.Errorf("proto: unexpected end of group") +) diff --git a/internal/io/dataway/cachedata.proto b/internal/io/dataway/cachedata.proto index 4d5a663281..c5d422fd86 100644 --- a/internal/io/dataway/cachedata.proto +++ b/internal/io/dataway/cachedata.proto @@ -1,14 +1,32 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the MIT License. +// This product includes software developed at Guance Cloud (https://www.guance.com/). +// Copyright 2021-present Guance, Inc. + +// Definition of WAL cache for body + syntax = "proto3"; package dataway; option go_package = "./;dataway"; +message HTTPHeader { + string key = 1; + string value = 2; +} + // fail-cache proto message. +// NOTE: do not change field's number if you need to add more. message CacheData { int32 category = 1; int32 payloadType = 2; bytes payload = 3; + + int32 pts = 4; + int32 rawLen = 5; + repeated HTTPHeader headers = 6; // extra HTTP headers for the data, especially for sink header + string dynURL = 7; // for dial-testing body, it's url are dynamic } // Generate command: protoc --go_out=. 
*.proto diff --git a/internal/io/dataway/dialtesting.go b/internal/io/dataway/dialtesting.go index 7466342bc3..bef8289063 100644 --- a/internal/io/dataway/dialtesting.go +++ b/internal/io/dataway/dialtesting.go @@ -36,9 +36,20 @@ func (d *DialtestingSender) Init(opt *DialtestingSenderOpt) error { func (d *DialtestingSender) WriteData(url string, pts []*point.Point) error { // TODO: can not set content encoding here, default use line-protocol + + // return write error or build error + var writeError error w := getWriter(WithPoints(pts), WithDynamicURL(url), WithCategory(point.DynamicDWCategory), + WithBodyCallback(func(w *writer, b *body) error { + err := d.ep.writePointData(w, b) + if err != nil { + writeError = err + } + + return err + }), WithHTTPHeader("X-Sub-Category", "dialtesting")) defer putWriter(w) @@ -46,16 +57,7 @@ func (d *DialtestingSender) WriteData(url string, pts []*point.Point) error { return fmt.Errorf("endpoint is not set correctly") } - // return write error or build error - var writeError error - buildErr := w.buildPointsBody(func(w *writer, b *body) error { - err := d.ep.writeBody(w, b) - if err != nil { - writeError = err - } - - return err - }) + buildErr := w.buildPointsBody() if buildErr != nil { return buildErr @@ -84,7 +86,7 @@ func (d *DialtestingSender) CheckToken(token, scheme, host string) (bool, error) body, err := io.ReadAll(resp.Body) if err != nil { - log.Error(err) + l.Error(err) return false, err } diff --git a/internal/io/dataway/dw.go b/internal/io/dataway/dw.go index fb432da8f3..b3a6f5fb19 100644 --- a/internal/io/dataway/dw.go +++ b/internal/io/dataway/dw.go @@ -8,6 +8,7 @@ package dataway import ( "fmt" + "path/filepath" "runtime" "sort" "strings" @@ -22,8 +23,14 @@ import ( const ( HeaderXGlobalTags = "X-Global-Tags" - DefaultRetryCount = 4 + DefaultRetryCount = 1 DefaultRetryDelay = time.Second + + // DeprecatedDefaultMaxRawBodySize will cause too many memory, we set it to + // 1MB. Set 1MB because the max-log length(message) is 1MB at storage side. + DeprecatedDefaultMaxRawBodySize = 10 * (1 << 20) // 10MB + DefaultMaxRawBodySize = (1 << 20) // 1MB + MinimalRawBodySize = 100 * (1 << 10) // 100KB ) type IDataway interface { @@ -64,13 +71,13 @@ var ( } AvailableDataways = []string{} - log = logger.DefaultSLogger("dataway") + l = logger.DefaultSLogger("dataway") datawayListIntervalDefault = 60 ) -func NewDefaultDataway() *Dataway { - return &Dataway{ - URLs: []string{"https://openway.guance.com?token=tkn_xxxxxxxxxxx"}, +func NewDefaultDataway(opts ...DWOption) *Dataway { + dw := &Dataway{ + URLs: []string{}, HTTPTimeout: 30 * time.Second, IdleTimeout: 90 * time.Second, MaxRawBodySize: DefaultMaxRawBodySize, @@ -83,7 +90,19 @@ func NewDefaultDataway() *Dataway { Interval: time.Minute * 5, SyncOnDiff: time.Second * 30, }, + + walq: map[point.Category]*WALQueue{}, + WAL: &WALConf{ + MaxCapacityGB: 2.0, + FailCacheCleanInterval: time.Second * 30, + }, } + + for _, opt := range opts { + opt(dw) + } + + return dw } type ntp struct { @@ -128,8 +147,13 @@ type Dataway struct { InsecureSkipVerify bool `toml:"tls_insecure"` GlobalCustomerKeys []string `toml:"global_customer_keys"` + WAL *WALConf `toml:"wal"` + + eps []*endPoint + + walq map[point.Category]*WALQueue + walFail *WALQueue - eps []*endPoint locker sync.RWMutex dnsCachers []*dnsCacher @@ -139,8 +163,7 @@ type Dataway struct { NTP *ntp `toml:"ntp"` } -type dwopt func(*Dataway) - +// ParseGlobalCustomerKeys parse custom tag keys used for sinker. 
func ParseGlobalCustomerKeys(v string) (arr []string) { for _, elem := range strings.Split(v, ",") { // remove white space if x := strings.TrimSpace(elem); len(x) > 0 { @@ -150,34 +173,22 @@ func ParseGlobalCustomerKeys(v string) (arr []string) { return } -func WithGlobalTags(maps ...map[string]string) dwopt { - return func(dw *Dataway) { - if dw.globalTags == nil { - dw.globalTags = map[string]string{} - } - - for _, tags := range maps { - for k, v := range tags { - dw.globalTags[k] = v - } - } - - log.Infof("dataway set globals: %+#v", dw.globalTags) - } -} - +// UpdateGlobalTags hot-update dataway's global tags. func (dw *Dataway) UpdateGlobalTags(tags map[string]string) { dw.locker.Lock() defer dw.locker.Unlock() dw.globalTags = tags - log.Infof("set %d global tags to dataway", len(dw.globalTags)) + l.Infof("set %d global tags to dataway", len(dw.globalTags)) if len(dw.globalTags) > 0 && dw.EnableSinker { dw.globalTagsHTTPHeaderValue = TagHeaderValue(dw.globalTags) } } -func (dw *Dataway) Init(opts ...dwopt) error { - log = logger.SLogger("dataway") +// Init setup current dataway. +// +// During Init(), we also accept options to update dataway's field after NewDefaultDataway(). +func (dw *Dataway) Init(opts ...DWOption) error { + l = logger.SLogger("dataway") for _, opt := range opts { if opt != nil { @@ -202,6 +213,8 @@ func (dw *Dataway) String() string { } } + arr = append(arr, fmt.Sprintf("wal: %s, cap: %fGB", dw.WAL.Path, dw.WAL.MaxCapacityGB)) + return strings.Join(arr, "\n") } @@ -209,6 +222,7 @@ func (dw *Dataway) ClientsCount() int { return len(dw.eps) } +// GetTokens list all dataway's tokens. func (dw *Dataway) GetTokens() []string { var arr []string for _, ep := range dw.eps { @@ -220,11 +234,6 @@ func (dw *Dataway) GetTokens() []string { return arr } -const ( - DefaultMaxRawBodySize = 10 * 1024 * 1024 - MinimalRawBodySize = 1 * 1024 * 1024 -) - // TagHeaderValue create X-Global-Tags header value in the // form of key=val,key=val with ASC sorted. func TagHeaderValue(tags map[string]string) string { @@ -236,6 +245,8 @@ func TagHeaderValue(tags map[string]string) string { return strings.Join(arr, ",") } +var defaultInvalidDatawayURL = "https://guance.openway.com?token=YOUR-WORKSPACE-TOKEN" + func (dw *Dataway) doInit() error { // 如果 env 已传入了 dataway 配置, 则不再追加老的 dataway 配置, // 避免俩边配置了同样的 dataway, 造成数据混乱 @@ -251,7 +262,8 @@ func (dw *Dataway) doInit() error { } if len(dw.URLs) == 0 { - return fmt.Errorf("dataway not set: urls is empty") + l.Warnf("dataway not set: urls is empty, set to %q", defaultInvalidDatawayURL) + dw.URLs = append(dw.URLs, defaultInvalidDatawayURL) } if dw.HTTPTimeout <= time.Duration(0) { @@ -262,7 +274,7 @@ func (dw *Dataway) doInit() error { dw.MaxIdleConnsPerHost = 64 } - log.Infof("set %d global tags to dataway", len(dw.globalTags)) + l.Infof("set %d global tags to dataway", len(dw.globalTags)) if len(dw.globalTags) > 0 && dw.EnableSinker { dw.globalTagsHTTPHeaderValue = TagHeaderValue(dw.globalTags) } @@ -273,7 +285,7 @@ func (dw *Dataway) doInit() error { withInsecureSkipVerify(dw.InsecureSkipVerify), withAPIs(dwAPIs), withHTTPHeaders(map[string]string{ - HeaderXGlobalTags: dw.globalTagsHTTPHeaderValue, + // HeaderXGlobalTags: dw.globalTagsHTTPHeaderValue, // DatakitUserAgent define HTTP User-Agent header. // user-agent format. 
See @@ -290,22 +302,32 @@ func (dw *Dataway) doInit() error { withRetryDelay(dw.RetryDelay), ) if err != nil { - log.Errorf("init dataway url %s failed: %s", u, err.Error()) + l.Errorf("init dataway url %s failed: %s", u, err.Error()) return err } + if dw.EnableSinker { + ep.httpHeaders[HeaderXGlobalTags] = dw.globalTagsHTTPHeaderValue + } + dw.eps = append(dw.eps, ep) dw.addDNSCache(ep.host) } + if dw.WAL.Path == "" { + dw.WAL.Path = filepath.Join(datakit.DataDir, "dw-wal") + } + return nil } +// GlobalTags list all global tags of the dataway. func (dw *Dataway) GlobalTags() map[string]string { return dw.globalTags } +// CustomTagKeys list all custome keys of the dataway. func (dw *Dataway) CustomTagKeys() []string { return dw.GlobalCustomerKeys } diff --git a/internal/io/dataway/dw_opts.go b/internal/io/dataway/dw_opts.go new file mode 100644 index 0000000000..ca88c52739 --- /dev/null +++ b/internal/io/dataway/dw_opts.go @@ -0,0 +1,43 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the MIT License. +// This product includes software developed at Guance Cloud (https://www.guance.com/). +// Copyright 2021-present Guance, Inc. + +package dataway + +type DWOption func(*Dataway) + +// WithGlobalTags add new global tags for the dataway. +func WithGlobalTags(maps ...map[string]string) DWOption { + return func(dw *Dataway) { + if dw.globalTags == nil { + dw.globalTags = map[string]string{} + } + + for _, tags := range maps { + for k, v := range tags { + dw.globalTags[k] = v + } + } + + l.Infof("dataway set globals: %+#v", dw.globalTags) + } +} + +// WithURLs add new dataway URLs for the dataway. +func WithURLs(urls ...string) DWOption { + return func(dw *Dataway) { + dw.URLs = append(dw.URLs, urls...) + } +} + +// WithWALWorkers set WAL flush workers. +func WithWALWorkers(n int) DWOption { + return func(dw *Dataway) { + if n > 0 { + dw.WAL.Workers = n + } + } +} + +// TODO: add more options for dataway instance. 
diff --git a/internal/io/dataway/dw_test.go b/internal/io/dataway/dw_test.go index e9c44f69ff..0d54411a80 100644 --- a/internal/io/dataway/dw_test.go +++ b/internal/io/dataway/dw_test.go @@ -15,14 +15,13 @@ import ( func TestDWInit(t *T.T) { t.Run("basic", func(t *T.T) { - dw := &Dataway{ - URLs: []string{ - "https://host.com?token=tkn_11111111111111111111", - "https://host.com?token=tkn_22222222222222222222", - }, + dw := NewDefaultDataway() + urls := []string{ + "https://host.com?token=tkn_11111111111111111111", + "https://host.com?token=tkn_22222222222222222222", } - require.NoError(t, dw.doInit()) + require.NoError(t, dw.Init(WithURLs(urls...))) assert.Len(t, dw.eps, 2) assert.Equal(t, dw.HTTPTimeout, time.Second*30) @@ -30,15 +29,14 @@ func TestDWInit(t *T.T) { }) t.Run("invalid-timeout", func(t *T.T) { - dw := &Dataway{ - URLs: []string{ - "https://host.com?token=tkn_11111111111111111111", - "https://host.com?token=tkn_22222222222222222222", - }, - HTTPTimeout: -30 * time.Second, + dw := NewDefaultDataway() + urls := []string{ + "https://host.com?token=tkn_11111111111111111111", + "https://host.com?token=tkn_22222222222222222222", } + dw.HTTPTimeout = -30 * time.Second - require.NoError(t, dw.doInit()) + require.NoError(t, dw.Init(WithURLs(urls...))) assert.Equal(t, 30*time.Second, dw.HTTPTimeout) }) } diff --git a/internal/io/dataway/dwapis.go b/internal/io/dataway/dwapis.go index 24cb180000..92ccec46cd 100644 --- a/internal/io/dataway/dwapis.go +++ b/internal/io/dataway/dwapis.go @@ -34,7 +34,7 @@ func (dw *Dataway) UsageTrace(body []byte) error { return fmt.Errorf("no workspace query URL available") } - log.Debugf("NewRequest: %s", requrl) + l.Debugf("NewRequest: %s", requrl) req, err := http.NewRequest("POST", requrl, bytes.NewBuffer(body)) if err != nil { return err @@ -58,7 +58,7 @@ func (dw *Dataway) UsageTrace(body []byte) error { defer resp.Body.Close() //nolint:errcheck switch resp.StatusCode / 100 { case 2: - log.Debugf("usage trace refresh ok") + l.Debugf("usage trace refresh ok") return nil default: return fmt.Errorf("usage trace refresh failed(status: %d): %s", resp.StatusCode, string(respBody)) @@ -76,7 +76,7 @@ func (dw *Dataway) WorkspaceQuery(body []byte) (*http.Response, error) { return nil, fmt.Errorf("no workspace query URL available") } - log.Debugf("NewRequest: %s", requrl) + l.Debugf("NewRequest: %s", requrl) req, err := http.NewRequest("POST", requrl, bytes.NewBuffer(body)) if err != nil { return nil, err @@ -132,11 +132,11 @@ func (dw *Dataway) Election(namespace, id string, reqBody io.Reader) ([]byte, er return nil, fmt.Errorf("token missing") } - log.Debugf("election sending %s", requrl) + l.Debugf("election sending %s", requrl) req, err := http.NewRequest("POST", requrl, reqBody) if err != nil { - log.Error(err) + l.Error(err) return nil, err } @@ -147,7 +147,7 @@ func (dw *Dataway) Election(namespace, id string, reqBody io.Reader) ([]byte, er resp, err := ep.sendReq(req) if err != nil { - log.Error(err) + l.Error(err) return nil, err } @@ -157,17 +157,17 @@ func (dw *Dataway) Election(namespace, id string, reqBody io.Reader) ([]byte, er body, err := io.ReadAll(resp.Body) if err != nil { - log.Error(err) + l.Error(err) return nil, err } defer resp.Body.Close() //nolint:errcheck switch resp.StatusCode / 100 { case 2: - log.Debugf("election %s ok", requrl) + l.Debugf("election %s ok", requrl) return body, nil default: - log.Debugf("election failed: %d", resp.StatusCode) + l.Debugf("election failed: %d", resp.StatusCode) return nil, fmt.Errorf("election 
failed: %s", string(body)) } } @@ -190,11 +190,11 @@ func (dw *Dataway) ElectionHeartbeat(namespace, id string, reqBody io.Reader) ([ return nil, fmt.Errorf("token missing") } - log.Debugf("election sending heartbeat %s", requrl) + l.Debugf("election sending heartbeat %s", requrl) req, err := http.NewRequest("POST", requrl, reqBody) if err != nil { - log.Error(err) + l.Error(err) return nil, err } @@ -205,7 +205,7 @@ func (dw *Dataway) ElectionHeartbeat(namespace, id string, reqBody io.Reader) ([ resp, err := ep.sendReq(req) if err != nil { - log.Error(err) + l.Error(err) return nil, err } @@ -215,7 +215,7 @@ func (dw *Dataway) ElectionHeartbeat(namespace, id string, reqBody io.Reader) ([ body, err := io.ReadAll(resp.Body) if err != nil { - log.Error(err) + l.Error(err) return nil, err } @@ -274,11 +274,11 @@ func (dw *Dataway) DatawayList() ([]string, int, error) { var dws dataways if err := json.Unmarshal(body, &dws); err != nil { - log.Errorf(`%s, body: %s`, err, string(body)) + l.Errorf(`%s, body: %s`, err, string(body)) return nil, datawayListIntervalDefault, err } - log.Debugf(`available dataways; %+#v,body: %s`, dws.Content, string(body)) + l.Debugf(`available dataways; %+#v,body: %s`, dws.Content, string(body)) return dws.Content.DatawayList, dws.Content.Interval, nil } @@ -377,7 +377,7 @@ type ntpResp struct { // TimeDiff implement ntp time sync interface. func (dw *Dataway) TimeDiff() int64 { if d, err := dw.doTimeDiff(); err != nil { - log.Errorf("doTimeDiff: %s", err.Error()) + l.Errorf("doTimeDiff: %s", err.Error()) return 0 } else { return d @@ -395,7 +395,7 @@ func (dw *Dataway) doTimeDiff() (int64, error) { return 0, fmt.Errorf("url %s not available", datakit.NTPSync) } - log.Debugf("NewRequest: %s", requrl) + l.Debugf("NewRequest: %s", requrl) req, err := http.NewRequest(http.MethodGet, requrl, nil) if err != nil { return 0, fmt.Errorf("http.NewRequest: %w", err) @@ -419,12 +419,12 @@ func (dw *Dataway) doTimeDiff() (int64, error) { defer resp.Body.Close() //nolint:errcheck switch resp.StatusCode / 100 { case 2: - log.Debugf("ntp ok") + l.Debugf("ntp ok") var nr ntpResp if err := json.Unmarshal(respBody, &nr); err != nil { - log.Errorf("Unmarshal: %s", string(respBody)) + l.Errorf("Unmarshal: %s", string(respBody)) return 0, fmt.Errorf(`json.Unmarshal: %w`, err) } diff --git a/internal/io/dataway/dwapis_test.go b/internal/io/dataway/dwapis_test.go index c707323824..4d9280b821 100644 --- a/internal/io/dataway/dwapis_test.go +++ b/internal/io/dataway/dwapis_test.go @@ -34,13 +34,12 @@ func TestNTP(t *T.T) { })) dw := NewDefaultDataway() - dw.URLs[0] = fmt.Sprintf("%s?token=tkn_xxxxxxxx", ts.URL) dw.NTP = &ntp{ Interval: time.Second, SyncOnDiff: time.Second, } - assert.NoError(t, dw.Init()) + assert.NoError(t, dw.Init(WithURLs(fmt.Sprintf("%s?token=tkn_xxxxxxxx", ts.URL)))) diff, err := dw.doTimeDiff() @@ -52,7 +51,7 @@ func TestNTP(t *T.T) { func TestDWAPIs(t *T.T) { t.Run("apis-with-global-tags", func(t *T.T) { - var dw *Dataway + dw := NewDefaultDataway() ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { assert.Equalf(t, dw.globalTagsHTTPHeaderValue, r.Header.Get(HeaderXGlobalTags), "failed on request %s", r.URL.Path) @@ -64,14 +63,12 @@ func TestDWAPIs(t *T.T) { w.WriteHeader(200) })) - dw = &Dataway{ - URLs: []string{fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)}, - } - - assert.NoError(t, dw.Init(WithGlobalTags(map[string]string{ - "tag1": "value1", - "tag2": "value2", - }))) + assert.NoError(t, dw.Init( + 
WithURLs(fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)), + WithGlobalTags(map[string]string{ + "tag1": "value1", + "tag2": "value2", + }))) _, err := dw.Pull("some-args") assert.NoError(t, err) diff --git a/internal/io/dataway/endpoint.go b/internal/io/dataway/endpoint.go index c28c6c0d75..a5bab8e59f 100644 --- a/internal/io/dataway/endpoint.go +++ b/internal/io/dataway/endpoint.go @@ -8,7 +8,6 @@ package dataway import ( "bytes" "crypto/tls" - "errors" "fmt" "io" "net/http" @@ -25,7 +24,6 @@ import ( "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/httpcli" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/metrics" dnet "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/net" - pb "google.golang.org/protobuf/proto" ) type endPoint struct { @@ -141,7 +139,7 @@ func withHTTPHeaders(headers map[string]string) endPointOption { if len(v) > 0 { // ignore empty header value ep.httpHeaders[k] = v } else { - log.Warnf("ignore empty value on header %q", k) + l.Warnf("ignore empty value on header %q", k) } } } @@ -150,7 +148,7 @@ func withHTTPHeaders(headers map[string]string) endPointOption { func newEndpoint(urlstr string, opts ...endPointOption) (*endPoint, error) { u, err := url.ParseRequestURI(urlstr) if err != nil { - log.Errorf("parse dataway url %s failed: %s", urlstr, err.Error()) + l.Errorf("parse dataway url %s failed: %s", urlstr, err.Error()) return nil, err } @@ -183,7 +181,7 @@ func newEndpoint(urlstr string, opts ...endPointOption) (*endPoint, error) { api) } - log.Infof("endpoint regist dataway API %q:%q ok", api, ep.categoryURL[api]) + l.Infof("endpoint regist dataway API %q:%q ok", api, ep.categoryURL[api]) } switch ep.scheme { @@ -201,7 +199,7 @@ func newEndpoint(urlstr string, opts ...endPointOption) (*endPoint, error) { func (ep *endPoint) getHTTPCliOpts() *httpcli.Options { dialContext, err := dnet.GetDNSCacheDialContext(defaultDNSCacheFreq, defaultDNSCacheLookUpTimeout) if err != nil { - log.Warnf("GetDNSCacheDialContext failed: %v", err) + l.Warnf("GetDNSCacheDialContext failed: %v", err) dialContext = nil // if failed, then not use dns cache. } @@ -218,13 +216,13 @@ func (ep *endPoint) getHTTPCliOpts() *httpcli.Options { if ep.proxy != "" { // set proxy if u, err := url.ParseRequestURI(ep.proxy); err != nil { - log.Warnf("parse http proxy %q failed err: %s, ignored and no proxy set", ep.proxy, err.Error()) + l.Warnf("parse http proxy %q failed err: %s, ignored and no proxy set", ep.proxy, err.Error()) } else { if ProxyURLOK(u) { cliOpts.ProxyURL = u - log.Infof("set dataway proxy to %q ok", ep.proxy) + l.Infof("set dataway proxy to %q ok", ep.proxy) } else { - log.Warnf("invalid proxy URL: %s, ignored", u) + l.Warnf("invalid proxy URL: %s, ignored", u) } } } @@ -252,79 +250,24 @@ func (ep *endPoint) Transport() *http.Transport { return httpcli.Transport(ep.getHTTPCliOpts()) } -func (ep *endPoint) writeBody(w *writer, b *body) (err error) { - w.gzip = b.gzon - - // if send failed, do nothing. - if err = ep.writePointData(w, b); err != nil { - // 4xx error do not cache data. - if errors.Is(err, errWritePoints4XX) { - writeDropPointsCounterVec.WithLabelValues(w.category.String(), err.Error()).Add(float64(b.npts)) - return - } - - if w.fc == nil { // no cache - writeDropPointsCounterVec.WithLabelValues(w.category.String(), err.Error()).Add(float64(b.npts)) - return - } - - // do cache: write them to disk. 
- if w.cacheAll { - if err := doCache(w, b); err != nil { - log.Errorf("doCache %d pts on %s: %s", b.npts, w.category, err) - } else { - log.Infof("ok on doCache %d pts on %s", b.npts, w.category) - } - } else { - //nolint:exhaustive - switch w.category { - case point.Metric, // these categories are not cache. - point.MetricDeprecated, - point.Object, - point.CustomObject, - point.DynamicDWCategory: - - writeDropPointsCounterVec.WithLabelValues(w.category.String(), err.Error()).Add(float64(b.npts)) - log.Warnf("drop %d pts on %s, not cached", b.npts, w.category) - - default: - if err := doCache(w, b); err != nil { - log.Errorf("doCache %v pts on %s: %s", b.npts, w.category, err) - } - } - } - } - - return err -} - func (ep *endPoint) writePoints(w *writer) error { - return w.buildPointsBody(ep.writeBody) -} - -func doCache(w *writer, b *body) error { - if cachedata, err := pb.Marshal(&CacheData{ - Category: int32(w.category), - PayloadType: int32(b.payloadEnc), - Payload: b.buf, - }); err != nil { - return err - } else { - return w.fc.Put(cachedata) - } + WithBodyCallback(ep.writePointData)(w) + return w.buildPointsBody() } func (ep *endPoint) writePointData(w *writer, b *body) error { httpCodeStr := "unknown" - requrl, catNotFound := ep.categoryURL[w.category.URL()] + requrl, catNotFound := ep.categoryURL[b.cat().URL()] if !catNotFound { + l.Debugf("cat %q not found, w.dynamicURL: %s", b.cat(), w.dynamicURL) + if w.dynamicURL != "" { // for dialtesting, there are dynamic URL to post if _, err := url.ParseRequestURI(w.dynamicURL); err != nil { return err } else { - log.Debugf("try use dynamic URL %s", w.dynamicURL) + l.Debugf("try use dynamic URL %s", w.dynamicURL) requrl = w.dynamicURL } } else { @@ -334,52 +277,55 @@ func (ep *endPoint) writePointData(w *writer, b *body) error { defer func() { if w.cacheClean { // ignore metrics on cache clean operation + l.Debug("on cache clean, no metric applied") return } // /v1/write/metric -> metric - cat := w.category.String() + cat := b.cat().String() - if w.category == point.DynamicDWCategory { + if b.cat() == point.DynamicDWCategory { // NOTE: datakit category deprecated, we use point category cat = point.DynamicDWCategory.String() } - bytesCounterVec.WithLabelValues(cat, "gzip", "total").Add(float64(len(b.buf))) - bytesCounterVec.WithLabelValues(cat, "gzip", httpCodeStr).Add(float64(len(b.buf))) - bytesCounterVec.WithLabelValues(cat, "raw", "total").Add(float64(b.rawLen)) - bytesCounterVec.WithLabelValues(cat, "raw", httpCodeStr).Add(float64(b.rawLen)) + bytesCounterVec.WithLabelValues(cat, "gzip", "total").Add(float64(len(b.buf()))) + bytesCounterVec.WithLabelValues(cat, "gzip", httpCodeStr).Add(float64(len(b.buf()))) + bytesCounterVec.WithLabelValues(cat, "raw", "total").Add(float64(b.rawLen())) + bytesCounterVec.WithLabelValues(cat, "raw", httpCodeStr).Add(float64(b.rawLen())) - if b.npts > 0 { - ptsCounterVec.WithLabelValues(cat, "total").Add(float64(b.npts)) - ptsCounterVec.WithLabelValues(cat, httpCodeStr).Add(float64(b.npts)) + if b.npts() > 0 { + ptsCounterVec.WithLabelValues(cat, "total").Add(float64(b.npts())) + ptsCounterVec.WithLabelValues(cat, httpCodeStr).Add(float64(b.npts())) } else { - log.Warnf("npts not set, should not been here") + l.Warnf("npts not set, body from %q", b.from) } }() - req, err := http.NewRequest("POST", requrl, bytes.NewBuffer(b.buf)) + l.Debugf("post %d bytes to %s...", len(b.buf()), requrl) + req, err := http.NewRequest("POST", requrl, bytes.NewBuffer(b.buf())) if err != nil { - log.Error("new request to 
%s: %s", requrl, err) + l.Error("new request to %s: %s", requrl, err) return err } - req.Header.Set("X-Points", fmt.Sprintf("%d", b.npts)) - req.Header.Set("Content-Length", fmt.Sprintf("%d", len(b.buf))) - req.Header.Set("Content-Type", w.httpEncoding.HTTPContentType()) - if w.gzip { + req.Header.Set("X-Points", fmt.Sprintf("%d", b.npts())) + req.Header.Set("Content-Length", fmt.Sprintf("%d", len(b.buf()))) + req.Header.Set("Content-Type", b.enc().HTTPContentType()) + if w.gzip == 1 { req.Header.Set("Content-Encoding", "gzip") } // Common HTTP headers appended, such as User-Agent, X-Global-Tags - log.Debugf("set %d endpoint HTTP headers", len(ep.httpHeaders)) for k, v := range ep.httpHeaders { + l.Debugf("set %s:%s HTTP header comes from endpoint", k, v) req.Header.Set(k, v) } // Append extra HTTP headers to request. // Here may attach X-Global-Tags again. for k, v := range w.httpHeaders { + l.Debugf("set %s:%s HTTP header comes from writer", k, v) req.Header.Set(k, v) } @@ -390,8 +336,8 @@ func (ep *endPoint) writePointData(w *writer, b *body) error { } if err != nil { - log.Errorf("sendReq: request url %s failed(proxy: %s): %s, resp: %v", requrl, ep.proxy, err, resp) - return err + l.Errorf("sendReq: request url %s failed(proxy: %s): %s, resp: %v", requrl, ep.proxy, err, resp) + // do not return here, we need more details about the fail from @resp. } if resp == nil { @@ -401,20 +347,18 @@ func (ep *endPoint) writePointData(w *writer, b *body) error { defer resp.Body.Close() //nolint:errcheck body, err := io.ReadAll(resp.Body) if err != nil { - log.Errorf("io.ReadAll: %s", err) + l.Errorf("io.ReadAll: %s", err) return err } - log.Debugf("post %d bytes to %s...", len(b.buf), requrl) - switch resp.StatusCode / 100 { case 2: - log.Debugf("post %d bytes to %s ok(gz: %v)", len(b.buf), requrl, w.gzip) + l.Debugf("post %d bytes to %s ok(gz: %v)", len(b.buf()), requrl, w.gzip) // Send data ok, it means the error `beyond-usage` error is cleared by kodo server, // we have to clear the hint in monitor too. if strings.Contains(requrl, "/v1/write/") && atomic.LoadInt64(&metrics.BeyondUsage) > 0 { - log.Info("clear BeyondUsage") + l.Info("clear BeyondUsage") atomic.StoreInt64(&metrics.BeyondUsage, 0) } @@ -422,8 +366,8 @@ func (ep *endPoint) writePointData(w *writer, b *body) error { case 4: strBody := string(body) - log.Errorf("post %d to %s failed(HTTP: %s): %s, data dropped", - len(b.buf), + l.Errorf("post %d to %s failed(HTTP: %s): %s, data dropped", + len(b.buf()), requrl, resp.Status, strBody) @@ -432,7 +376,7 @@ func (ep *endPoint) writePointData(w *writer, b *body) error { case http.StatusForbidden: if strings.Contains(strBody, "beyondDataUsage") { atomic.AddInt64(&metrics.BeyondUsage, time.Now().Unix()) // will set `beyond-usage' hint in monitor. 
- log.Info("set BeyondUsage") + l.Info("set BeyondUsage") } default: // pass @@ -441,8 +385,8 @@ func (ep *endPoint) writePointData(w *writer, b *body) error { return errWritePoints4XX default: // 5xx - log.Errorf("post %d to %s failed(HTTP: %s): %s", - len(b.buf), + l.Errorf("post %d to %s failed(HTTP: %s): %s", + len(b.buf()), requrl, resp.Status, string(body)) @@ -473,7 +417,7 @@ func (ep *endPoint) datakitPull(args string) ([]byte, error) { resp, err := ep.sendReq(req) if err != nil { - log.Errorf("datakitPull: %s", err.Error()) + l.Errorf("datakitPull: %s", err.Error()) return nil, err } @@ -484,7 +428,7 @@ func (ep *endPoint) datakitPull(args string) ([]byte, error) { body, err := io.ReadAll(resp.Body) if err != nil { - log.Error(err.Error()) + l.Error(err.Error()) return nil, err } @@ -501,7 +445,7 @@ func (ep *endPoint) sendReq(req *http.Request) (resp *http.Response, err error) // Generally, the req.GetBody in DK should not be nil, while we do this to avoid accidents. if ep.maxRetryCount > 1 && req.GetBody == nil && req.Body != nil { - log.Debugf("setup GetBody() on %q", req.URL.Path) + l.Debugf("setup GetBody() on %q", req.URL.Path) b, err := io.ReadAll(req.Body) if err != nil { @@ -524,7 +468,7 @@ func (ep *endPoint) sendReq(req *http.Request) (resp *http.Response, err error) maxRetry = DefaultRetryCount } - log.Debugf("retry %q with delay %s on %d retrying", req.URL.Path, delay, maxRetry) + l.Debugf("retry %q with delay %s on %d retrying", req.URL.Path, delay, maxRetry) if err := retry.Do( func() error { @@ -534,14 +478,14 @@ func (ep *endPoint) sendReq(req *http.Request) (resp *http.Response, err error) } if req.GetBody == nil { - log.Debugf("GetBody() not set for request %q, ignored", req.URL.Path) + l.Debugf("GetBody() not set for request %q, ignored", req.URL.Path) return } if body, ierr := req.GetBody(); ierr == nil { req.Body = body // reset body reader, then we can send the request again. } else { - log.Errorf("GetBody() on %q failed: %s", req.URL.Path, ierr) + l.Errorf("GetBody() on %q failed: %s", req.URL.Path, ierr) } }() @@ -554,7 +498,7 @@ func (ep *endPoint) sendReq(req *http.Request) (resp *http.Response, err error) // Terminate retry on global exit. 
select { case <-datakit.Exit.Wait(): - log.Info("retry abort on global exit") + l.Info("retry abort on global exit") return nil default: // pass @@ -570,7 +514,7 @@ func (ep *endPoint) sendReq(req *http.Request) (resp *http.Response, err error) retry.Delay(delay), retry.OnRetry(func(n uint, err error) { - log.Warnf("on %dth retry for %s, error: %s(%s)", n, req.URL, err, reflect.TypeOf(err)) + l.Warnf("on %dth retry for %s, error: %s(%s)", n, req.URL, err, reflect.TypeOf(err)) switch { // most of the error is Client.Timeout @@ -594,7 +538,7 @@ func (ep *endPoint) sendReq(req *http.Request) (resp *http.Response, err error) } func (ep *endPoint) doSendReq(req *http.Request) (*http.Response, error) { - log.Debugf("send request %q, proxy: %q, cli: %p, timeout: %s", + l.Debugf("send request %q, proxy: %q, cli: %p, timeout: %s", req.URL.String(), ep.proxy, ep.httpCli.Transport, ep.httpTimeout) var ( @@ -644,17 +588,19 @@ func (ep *endPoint) doSendReq(req *http.Request) (*http.Response, error) { httpCodeStr = "reset-by-pear" case strings.Contains(ue.Error(), "connection refused"): httpCodeStr = "connection-refused" + case strings.Contains(ue.Error(), "network is unreachable"): + httpCodeStr = "network-is-unreachable" default: - log.Warnf("unwrapped URL error: %s", err.Error()) + l.Warnf("unwrapped URL error: %s", err.Error()) httpCodeStr = "unwrapped-url-error" } } - log.Warnf("Do: %s, error type: %s", err.Error(), reflect.TypeOf(err)) + l.Warnf("Do: %s, error type: %s", err.Error(), reflect.TypeOf(err)) return nil, fmt.Errorf("httpCli.Do: %w, resp: %+#v", err, resp) } - log.Debugf("%s send req ok", req.URL) + l.Debugf("%s send req ok", req.URL) end: if resp != nil { diff --git a/internal/io/dataway/endpoint_test.go b/internal/io/dataway/endpoint_test.go index 8fcae1b61c..818f420844 100644 --- a/internal/io/dataway/endpoint_test.go +++ b/internal/io/dataway/endpoint_test.go @@ -19,7 +19,6 @@ import ( "time" "github.com/GuanceCloud/cliutils/metrics" - uhttp "github.com/GuanceCloud/cliutils/network/http" "github.com/GuanceCloud/cliutils/point" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" @@ -111,14 +110,11 @@ func TestEndpointMetrics(t *T.T) { t.Logf("%s: %s", k, r.Header.Get(k)) } - x, err := uhttp.Unzip(body) - assert.NoError(t, err) - assert.Equal(t, `test-1 f1=1i,f2=false 123 test-2 f1=1i,f2=false 123 -`, string(x)) +`, string(body)) - t.Logf("body: %q", x) + t.Logf("body: %q", body) time.Sleep(time.Second) // intended @@ -132,7 +128,7 @@ test-2 f1=1i,f2=false 123 ep, err := newEndpoint(urlstr, withAPIs([]string{datakit.Metric})) assert.NoError(t, err) - w := getWriter(WithGzip(true), + w := getWriter(WithGzip(1), WithPoints([]*point.Point{ point.NewPointV2("test-1", point.NewKVs(map[string]any{"f1": 1, "f2": false}), point.WithTime(time.Unix(0, 123))), point.NewPointV2("test-2", point.NewKVs(map[string]any{"f1": 1, "f2": false}), point.WithTime(time.Unix(0, 123))), @@ -182,14 +178,11 @@ test-2 f1=1i,f2=false 123 t.Logf("%s: %s", k, r.Header.Get(k)) } - x, err := uhttp.Unzip(body) - assert.NoError(t, err) - assert.Equal(t, []byte(`test-1 f1=1i,f2=false 123 test-2 f1=1i,f2=false 123 -`), x) +`), body) - t.Logf("body: %q", x) + t.Logf("body: %q", body) time.Sleep(time.Second) // intended @@ -208,7 +201,7 @@ test-2 f1=1i,f2=false 123 point.NewPointV2("test-1", point.NewKVs(map[string]any{"f1": 1, "f2": false}), point.WithTime(time.Unix(0, 123))), point.NewPointV2("test-2", point.NewKVs(map[string]any{"f1": 1, "f2": false}), point.WithTime(time.Unix(0, 
123))), }), - WithGzip(true), + WithGzip(1), ) defer putWriter(w) @@ -255,14 +248,11 @@ test-2 f1=1i,f2=false 123 defer r.Body.Close() - x, err := uhttp.Unzip(body) - assert.NoError(t, err) - assert.Equal(t, []byte(`test-1 f1=1i,f2=false 123 test-2 f1=1i,f2=false 123 -`), x) +`), body) - t.Logf("body: %q", x) + t.Logf("body: %q", body) time.Sleep(time.Second) // intended @@ -284,7 +274,7 @@ test-2 f1=1i,f2=false 123 WithPoints([]*point.Point{ point.NewPointV2("test-1", point.NewKVs(map[string]any{"f1": 1, "f2": false}), point.WithTime(time.Unix(0, 123))), point.NewPointV2("test-2", point.NewKVs(map[string]any{"f1": 1, "f2": false}), point.WithTime(time.Unix(0, 123))), - }), WithGzip(true)) + }), WithGzip(1)) defer putWriter(w) reg := prometheus.NewRegistry() diff --git a/internal/io/dataway/flush.go b/internal/io/dataway/flush.go new file mode 100644 index 0000000000..6cc638d1e1 --- /dev/null +++ b/internal/io/dataway/flush.go @@ -0,0 +1,269 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the MIT License. +// This product includes software developed at Guance Cloud (https://www.guance.com/). +// Copyright 2021-present Guance, Inc. + +package dataway + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/GuanceCloud/cliutils/point" + "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/datakit" +) + +type flusher struct { + cat point.Category + wal *WALQueue + dw *Dataway // refer to dataway instance + idx int + + // sendBuf and marshalBuf reused during read WAL from disk-queue. + // When read from mem-queue, these 2 buffer not used. + sendBuf, + marshalBuf []byte // buffer reusable during send & read HTTP body +} + +// StartFlushWorkers init wal-queue on each category. +func (dw *Dataway) StartFlushWorkers() error { + if err := dw.setupWAL(); err != nil { + return err + } + + worker := func(cat point.Category, n int) { + l.Infof("start %dth workers on %q", n, cat) + dwFlusher := datakit.G("dw-flusher/" + cat.Alias()) + for i := 0; i < n; i++ { + dwFlusher.Go(func(_ context.Context) error { + f := dw.newFlusher(cat) + f.idx = i + f.start() + return nil + }) + } + } + + // start wal-queue flush workers on each category + for cat := range dw.walq { + n := dw.WAL.Workers + // nolint: exhaustive + switch cat { // some category do not need much workers + case point.Metric, + point.Network, + point.Logging, + point.Tracing, + point.RUM: + + case point.DialTesting: + n = 0 // dial-testing are direct to point.DynamicDWCategory + default: + n = 1 + } + + l.Infof("start %d flush workers on %s...", n, cat.Alias()) + worker(cat, n) + } + + return nil +} + +func (dw *Dataway) newFlusher(cat point.Category) *flusher { + // we need extra spaces to read body and it's mata info from disk cache. 
+ extra := int(float64(dw.MaxRawBodySize) * .1) + return &flusher{ + cat: cat, + wal: dw.walq[cat], + sendBuf: make([]byte, dw.MaxRawBodySize), + marshalBuf: make([]byte, dw.MaxRawBodySize+extra), + dw: dw, + } +} + +func (dw *Dataway) enqueueBody(w *writer, b *body) error { + q := dw.walq[w.category] + + l.Debugf("walq pub %s to %s(q: %+#v)", b, w.category.Alias(), q) + + walQueueMemLenVec.WithLabelValues(w.category.Alias()).Set(float64(len(q.mem))) + + return q.Put(b) +} + +func (f *flusher) start() { + cleanFailCacheTick := time.NewTicker(f.dw.WAL.FailCacheCleanInterval) + defer cleanFailCacheTick.Stop() + + l.Infof("flushWorker on %s starting...", f.cat.Alias()) + + for { + select { + case <-datakit.Exit.Wait(): + l.Infof("dataway flush worker(%dth) on %s exit", f.idx, f.cat.Alias()) + return + + case <-cleanFailCacheTick.C: + // try clean fail-cached data if any + if err := f.cleanFailCache(); err != nil { + l.Warnf("cleanFailCache: %s, ignored", err) + } + + default: // get from WAL queue(form chan or diskcache) + b, err := f.wal.Get(withReusableBuffer(f.sendBuf, f.marshalBuf)) + if err != nil { + l.Warnf("Get() from wal-queue: %s, ignored", err) + } + + if b == nil { // sleep when there is nothing to flush. + time.Sleep(time.Second) + } else { + l.Debugf("walq get on %s, got body %s(from %s) payload", f.cat.Alias(), b, b.from) + + if err := f.do(b); err != nil { + l.Warnf("do: %s, b: %s, ignored", err, b) + } + } + } + } +} + +func (f *flusher) do(b *body, opts ...WriteOption) error { + gzOn := 0 + if f.dw.GZip { + gzOn = 1 + } + + w := getWriter( + WithHTTPEncoding(b.enc()), + WithGzip(gzOn), + // cache all data into fail-cache + WithCacheAll(true), + WithCategory(b.cat()), + ) + defer putWriter(w) + + return f.dw.doFlush(w, b, opts...) +} + +func (dw *Dataway) doFlush(w *writer, b *body, opts ...WriteOption) error { + for _, opt := range opts { + opt(w) + } + + // Append extra headers if exist. + // + // These headers comes from fail-cache, so we reuse them, it's import for sinked body. + for _, h := range b.headers() { + WithHTTPHeader(h.Key, h.Value)(w) + } + + isGzip := "F" + if dw.GZip { + var ( + zstart = time.Now() + gz = getZipper() + ) + + isGzip = "T" + defer putZipper(gz) + + if zbuf, err := gz.zip(b.buf()); err != nil { + l.Errorf("gzip: %s", err.Error()) + return err + } else { + ncopy := copy(b.sendBuf, zbuf) + l.Debugf("copy %d(origin: %d) zipped bytes to buf", ncopy, len(b.buf())) + b.CacheData.Payload = b.sendBuf[:ncopy] + } + + buildBodyCostVec.WithLabelValues( + w.category.String(), + w.httpEncoding.String(), + "gzip", + ).Observe(float64(time.Since(zstart)) / float64(time.Second)) + } + + defer func() { + // NOTE: for multiple dw.eps, here only 1 flush metric. + walWorkerFlush.WithLabelValues( + b.cat().Alias(), + isGzip, + b.from.String()).Observe(float64(len(b.buf()))) + + // b always comes from pool, no matter from disk queue or mem queue. + l.Debugf("put back %s", b) + putBody(b) + }() + + for _, ep := range dw.eps { + if err := ep.writePointData(w, b); err != nil { + // 4xx error do not cache data. + if errors.Is(err, errWritePoints4XX) { + writeDropPointsCounterVec.WithLabelValues(w.category.String(), err.Error()).Add(float64(b.npts())) + continue // current endpoint POST 4xx ignored, but other endpoint maybe ok. + } + + l.Errorf("writePointData: %s", err) + + // For a exist failed-cache, we do not need to re-cache it. + // and make it fail, the diskcache will rollback and Get() the same data again. 
+ if w.cacheClean { + return fmt.Errorf("clean fail-cache failed: %w", err) + } + + //nolint:exhaustive + switch b.cat() { + case point.Metric, // these categories are not default cached. + point.MetricDeprecated, + point.Object, + point.CustomObject, + point.DynamicDWCategory: + + if !w.cacheAll { + writeDropPointsCounterVec.WithLabelValues(w.category.String(), err.Error()).Add(float64(b.npts())) + l.Warnf("drop %d pts on %s, not cached", b.npts, w.category) + continue + } + + default: // other categories are default cached. + } + + if err := dw.dumpFailCache(b); err != nil { + l.Errorf("dumpFailCache %v pts on %s: %s", b.npts, w.category, err) + } else { + l.Debugf("dumping %q to failcache ok", b) + } + } + } + + return nil +} + +func (dw *Dataway) dumpFailCache(b *body) error { + if x, err := b.dump(); err != nil { + return err + } else { + return dw.walFail.disk.Put(x) // directly put dumpped body to disk-queue, not mem-queue. + } +} + +func (f *flusher) cleanFailCache() error { + return f.dw.walFail.DiskGet(func(b *body) error { + l.Debugf("clean body %q", b) + + var ( // @b will reset within f.do(), we cache it's meta for metric update. + cat = b.cat() + size = len(b.buf()) + ) + + if err := f.do(b, WithCacheClean(true), WithHTTPHeader("X-Fail-Cache-Retry", "1")); err != nil { + return err + } + + // only update metric on clean-ok + flushFailCacheVec.WithLabelValues(cat.Alias()).Observe(float64(size)) + return nil + }, withReusableBuffer(f.sendBuf, f.marshalBuf)) +} diff --git a/internal/io/dataway/flush_test.go b/internal/io/dataway/flush_test.go new file mode 100644 index 0000000000..708a31ff6e --- /dev/null +++ b/internal/io/dataway/flush_test.go @@ -0,0 +1,226 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the MIT License. +// This product includes software developed at Guance Cloud (https://www.guance.com/). +// Copyright 2021-present Guance, Inc. 
+ +package dataway + +import ( + "crypto/md5" + "fmt" + "io" + "net/http" + "net/http/httptest" + "os" + sync "sync" + T "testing" + "time" + + "github.com/GuanceCloud/cliutils/metrics" + uhttp "github.com/GuanceCloud/cliutils/network/http" + "github.com/GuanceCloud/cliutils/point" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestFlush(t *T.T) { + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, err := io.ReadAll(r.Body) + require.NoError(t, err) + + t.Logf("body md5: %s", fmt.Sprintf("%x", md5.Sum(body))) + + if r.Header.Get("Content-Encoding") == "gzip" { + t.Logf("is gzip body") + unz, err := uhttp.Unzip(body) + require.NoError(t, err) + body = unz + } else { + t.Logf("not gzip body") + } + + encoding := point.HTTPContentType(r.Header.Get("Content-Type")) + var dec *point.Decoder + + switch encoding { // nolint: exhaustive + case point.Protobuf: + dec = point.GetDecoder(point.WithDecEncoding(point.Protobuf)) + defer point.PutDecoder(dec) + + case point.LineProtocol: + dec = point.GetDecoder(point.WithDecEncoding(point.LineProtocol)) + defer point.PutDecoder(dec) + + default: // not implemented + t.Logf("[ERROR] unknown encoding %s", encoding) + return + } + + if dec != nil { + pts, err := dec.Decode(body) + assert.NoError(t, err) + + nwarns := 0 + for _, pt := range pts { + if len(pt.Warns()) > 0 { + nwarns++ + } + + t.Logf(pt.LineProto()) + } + + t.Logf("decode %d points, %d with warnnings", len(pts), nwarns) + } + + w.WriteHeader(200) + })) + + defer ts.Close() + time.Sleep(time.Second) + + t.Run("conc", func(t *T.T) { + dw := NewDefaultDataway() + + walDir, err := os.MkdirTemp("", "dw-wal") + require.NoError(t, err) + dw.WAL.Path = walDir + + defer os.RemoveAll(walDir) // clean up + + dw.ContentEncoding = "v2" + + assert.NoError(t, dw.Init(WithURLs(fmt.Sprintf("%s?token=tkn_xxxxxxxxxxxxxxxxxxx", ts.URL)))) + + t.Logf("dataway: %s", dw) + + cat := point.Logging + + // setup WAL queue + dw.setupWAL() + + rnd := point.NewRander(point.WithFixedTags(true), point.WithRandText(3)) + + cases := []struct { + name string + pts []*point.Point + }{ + // { + // `1pt`, + // rnd.Rand(1), + // }, + { + `100pt`, + rnd.Rand(100), + }, + } + + reg := prometheus.NewRegistry() + reg.MustRegister(Metrics()...) 
+ for _, tc := range cases { + t.Run(tc.name, func(t *T.T) { + wg := sync.WaitGroup{} + nworker := 1 + njob := 10 + + wg.Add(nworker) + for i := 0; i < nworker; i++ { + go func() { + defer wg.Done() + for x := 0; x < njob; x++ { + assert.NoError(t, dw.Write( + WithPoints(tc.pts), + WithCategory(cat), + WithMaxBodyCap(10*(1<<20)), // 1MB buffer + )) + } + }() + } + + time.Sleep(time.Second) // wait workers ok + + f := dw.newFlusher(cat) + + for { + b, err := f.wal.Get(withReusableBuffer(f.sendBuf, f.marshalBuf)) + assert.NoError(t, err) + if b == nil { + break + } + + raw := b.buf() + + dec := point.GetDecoder(point.WithDecEncoding(point.Protobuf)) + defer point.PutDecoder(dec) + + pts, err := dec.Decode(raw) + require.NoError(t, err) + require.Len(t, pts, len(tc.pts)) + + // each point equal + for idx := range pts { + require.Equal(t, pts[idx].LineProto(), tc.pts[idx].LineProto()) + } + + assert.NoError(t, f.do(b, WithCategory(cat))) + } + + wg.Wait() + + mfs, err := reg.Gather() + require.NoError(t, err) + t.Logf("get metrics:\n%s", metrics.MetricFamily2Text(mfs)) + + t.Cleanup(func() { + metricsReset() + }) + }) + } + }) + + t.Run(`basic`, func(t *T.T) { + t.Skip() + dw := NewDefaultDataway() + dw.WAL.Path = os.TempDir() + + assert.NoError(t, dw.Init(WithURLs(fmt.Sprintf("%s?token=tkn_xxxxxxxxxxxxxxxxxxx", ts.URL)))) + + cat := point.Metric + + // setup WAL queue + dw.setupWAL() + + var kvs point.KVs + kvs = kvs.AddV2("f1", 123, true). + AddV2("f2", "abc", true) + pt := point.NewPointV2(t.Name(), kvs, point.WithTimestamp(123)) + + t.Logf("pt: %s", pt.LineProto()) + + require.NoError(t, + dw.Write(WithPoints([]*point.Point{pt}), WithCategory(cat), WithMaxBodyCap(1<<20))) + + q := dw.walq[cat] + b, err := q.Get() + require.NoError(t, err) + + defer putBody(b) + + t.Logf("body: %s", b) + + raw, err := uhttp.Unzip(b.buf()) + require.NoError(t, err, "body chksum: %x", md5.Sum(b.buf())) + t.Logf("raw: %q", raw) + + dec := point.GetDecoder(point.WithDecEncoding(point.Protobuf)) + defer point.PutDecoder(dec) + + pts, err := dec.Decode(raw) + require.NoError(t, err) + require.Len(t, pts, 1) + require.Equal(t, pts[0].LineProto(), pt.LineProto()) + + f := dw.newFlusher(cat) + assert.NoError(t, f.do(b, WithCategory(cat))) + }) +} diff --git a/internal/io/dataway/metrics.go b/internal/io/dataway/metrics.go index fbae3cd055..d951da6be1 100644 --- a/internal/io/dataway/metrics.go +++ b/internal/io/dataway/metrics.go @@ -14,8 +14,10 @@ var ( ptsCounterVec, bytesCounterVec, writeDropPointsCounterVec, + walPointCounterVec, httpRetry *prometheus.CounterVec + walWorkerFlush, flushFailCacheVec, buildBodyCostVec, buildBodyBatchBytesVec, @@ -23,6 +25,8 @@ var ( buildBodyBatchCountVec, groupedRequestVec, apiSumVec *prometheus.SummaryVec + + walQueueMemLenVec *prometheus.GaugeVec ) func HTTPRetry() *prometheus.CounterVec { @@ -36,7 +40,9 @@ func APISumVec() *prometheus.SummaryVec { // Metrics get all metrics aboud dataway. 
func Metrics() []prometheus.Collector { return []prometheus.Collector{ + walWorkerFlush, ptsCounterVec, + walPointCounterVec, bytesCounterVec, writeDropPointsCounterVec, apiSumVec, @@ -47,17 +53,21 @@ func Metrics() []prometheus.Collector { buildBodyBatchCountVec, groupedRequestVec, flushFailCacheVec, + walQueueMemLenVec, } } func metricsReset() { + walWorkerFlush.Reset() ptsCounterVec.Reset() + walPointCounterVec.Reset() bytesCounterVec.Reset() writeDropPointsCounterVec.Reset() apiSumVec.Reset() httpRetry.Reset() flushFailCacheVec.Reset() + walQueueMemLenVec.Reset() buildBodyCostVec.Reset() buildBodyBatchBytesVec.Reset() buildBodyBatchPointsVec.Reset() @@ -67,12 +77,15 @@ func metricsReset() { func doRegister() { metrics.MustRegister( + walWorkerFlush, ptsCounterVec, + walPointCounterVec, bytesCounterVec, writeDropPointsCounterVec, apiSumVec, flushFailCacheVec, + walQueueMemLenVec, httpRetry, buildBodyCostVec, buildBodyBatchBytesVec, @@ -84,6 +97,16 @@ func doRegister() { // nolint:gochecknoinits func init() { + walQueueMemLenVec = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "datakit", + Subsystem: "io", + Name: "dataway_wal_mem_len", + Help: "Dataway WAL's memory queue length", + }, + []string{"category"}, + ) + flushFailCacheVec = prometheus.NewSummaryVec( prometheus.SummaryOpts{ Namespace: "datakit", @@ -113,7 +136,7 @@ func init() { 0.99: 0.001, }, }, - []string{"category", "encoding"}, + []string{"category", "encoding", "stage"}, ) buildBodyBatchCountVec = prometheus.NewSummaryVec( @@ -145,7 +168,7 @@ func init() { 0.99: 0.001, }, }, - []string{"category", "encoding", "gzip"}, + []string{"category", "encoding", "type"}, ) buildBodyBatchPointsVec = prometheus.NewSummaryVec( @@ -161,7 +184,26 @@ func init() { 0.99: 0.001, }, }, - []string{"category", "encoding", "gzip"}, + []string{"category", "encoding"}, + ) + + walWorkerFlush = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: "datakit", + Subsystem: "io", + Name: "dataway_wal_flush", + Help: "Dataway WAL worker flushed bytes", + Objectives: map[float64]float64{ + 0.5: 0.05, + 0.9: 0.01, + 0.99: 0.001, + }, + }, + []string{ + "category", + "gzip", + "queue", // from walqueue disk or mem + }, ) ptsCounterVec = prometheus.NewCounterVec( @@ -174,6 +216,16 @@ func init() { []string{"category", "status"}, ) + walPointCounterVec = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "datakit", + Subsystem: "io", + Name: "wal_point_total", + Help: "WAL queued points", + }, + []string{"category", "status"}, + ) + bytesCounterVec = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: "datakit", diff --git a/internal/io/dataway/pb.sh b/internal/io/dataway/pb.sh new file mode 100644 index 0000000000..c0408b6927 --- /dev/null +++ b/internal/io/dataway/pb.sh @@ -0,0 +1,3 @@ +protoc \ + -I=${GOPATH}/src -I=${GOPATH}/src/github.com/gogo/protobuf/protobuf -I. \ + --gogoslick_out=Mgoogle/protobuf/any.proto=github.com/gogo/protobuf/types:. 
cachedata.proto diff --git a/internal/io/dataway/pool.go b/internal/io/dataway/pool.go index f446dd6701..40c4fda667 100644 --- a/internal/io/dataway/pool.go +++ b/internal/io/dataway/pool.go @@ -6,12 +6,124 @@ package dataway import ( + "bytes" + "compress/gzip" sync "sync" "github.com/GuanceCloud/cliutils/point" ) -var wpool sync.Pool +var ( + newBufferBodyPool, reuseBufferBodyPool sync.Pool + + wpool sync.Pool + zippool sync.Pool + + defaultBatchSize = (1 << 20) // 1MB +) + +func getZipper() *gzipWriter { + if x := zippool.Get(); x == nil { + buf := bytes.Buffer{} + return &gzipWriter{ + buf: &buf, + w: gzip.NewWriter(&buf), + } + } else { + return x.(*gzipWriter) + } +} + +func putZipper(z *gzipWriter) { + if z != nil { + // reset zip buffer and the writer. + z.buf.Reset() + z.w.Reset(z.buf) + zippool.Put(z) + } +} + +type bodyOpt func(*body) + +func withNewBuffer(n int) bodyOpt { + return func(b *body) { + if n > 0 && b.sendBuf == nil && b.marshalBuf == nil { + b.sendBuf = make([]byte, n) + + // +10% on marshal buffer: we need more bytes for meta-info about the body + extra := int(float64(n) * .1) + b.marshalBuf = make([]byte, n+extra) + b.selfBuffer = 1 + } + } +} + +// withReusableBuffer assign outter buffer that not managed by body instance. +// if withNewBuffer() and withReusableBuffer() both passed, only 1 applied +// according to the order of bodyOpts. +func withReusableBuffer(send, marshal []byte) bodyOpt { + return func(b *body) { + if len(send) > 0 && len(marshal) > 0 { // sendBuf and marshalBuf should not nil + b.sendBuf = send + b.marshalBuf = marshal + b.selfBuffer = 0 // buffer not comes from new buffer + } + } +} + +func getNewBufferBody(opts ...bodyOpt) *body { + var b *body + if x := newBufferBodyPool.Get(); x == nil { + b = &body{ + selfBuffer: 1, + } + } else { + b = x.(*body) + } + + for _, opt := range opts { + opt(b) + } + + if len(b.sendBuf) == 0 || len(b.marshalBuf) == 0 { + panic("no buffer set for new-buffer-body") + } + + return b +} + +func getReuseBufferBody(opts ...bodyOpt) *body { + var b *body + if x := reuseBufferBodyPool.Get(); x == nil { + b = &body{ + selfBuffer: 0, + } + } else { + b = x.(*body) + } + + for _, opt := range opts { + opt(b) + } + + if len(b.sendBuf) == 0 || len(b.marshalBuf) == 0 { + panic("no buffer set for reuse-buffer-body") + } + + return b +} + +func putBody(b *body) { + if b != nil { + b.reset() + + if b.selfBuffer == 1 { + newBufferBodyPool.Put(b) + } else { + reuseBufferBodyPool.Put(b) + } + } +} func getWriter(opts ...WriteOption) *writer { var w *writer @@ -19,8 +131,7 @@ func getWriter(opts ...WriteOption) *writer { if x := wpool.Get(); x == nil { w = &writer{ httpHeaders: map[string]string{}, - batchBytesSize: 1 << 20, // 1MB - body: &body{}, + batchBytesSize: defaultBatchSize, } } else { w = x.(*writer) @@ -39,19 +150,12 @@ func putWriter(w *writer) { w.category = point.UnknownCategory w.dynamicURL = "" w.points = w.points[:0] - w.gzip = false + w.gzip = -1 w.cacheClean = false w.cacheAll = false w.batchBytesSize = 1 << 20 w.batchSize = 0 - w.fc = nil - w.parts = 0 - w.body.reset() - - if w.zipper != nil { - w.zipper.buf.Reset() - w.zipper.w.Reset(w.zipper.buf) - } + w.bcb = nil for k := range w.httpHeaders { delete(w.httpHeaders, k) diff --git a/internal/io/dataway/ptgroup_test.go b/internal/io/dataway/ptgroup_test.go index 363fdd2b0f..612d3694cc 100644 --- a/internal/io/dataway/ptgroup_test.go +++ b/internal/io/dataway/ptgroup_test.go @@ -51,16 +51,13 @@ func BenchmarkGroup(b *T.B) { func TestGroupPoint(t *T.T) { 
t.Run("duplicate-keys", func(t *T.T) { metricsReset() - dw := &Dataway{ - URLs: []string{"https://fake-dataway.com?token=tkn_xxxxxxxxxx"}, - GlobalCustomerKeys: []string{ - "category", - }, - EnableSinker: true, - GZip: true, - } - assert.NoError(t, dw.Init()) + dw := NewDefaultDataway() + dw.GlobalCustomerKeys = []string{"category"} + dw.EnableSinker = true + dw.GZip = true + + assert.NoError(t, dw.Init(WithURLs("https://fake-dataway.com?token=tkn_xxxxxxxxxx"))) pts := []*point.Point{ point.NewPointV2("some", @@ -82,21 +79,18 @@ func TestGroupPoint(t *T.T) { t.Run("customer-keys", func(t *T.T) { metricsReset() - dw := &Dataway{ - URLs: []string{ - "https://fake-dataway.com?token=tkn_xxxxxxxxxx", - }, - GlobalCustomerKeys: []string{ - "class", - "tag2", - "t1", "t2", "t3", "t4", - "t5", "t6", "t7", "t8", - }, - EnableSinker: true, - GZip: true, + + dw := NewDefaultDataway() + dw.GlobalCustomerKeys = []string{ + "class", + "tag2", + "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8", } + dw.EnableSinker = true + dw.GZip = true - assert.NoError(t, dw.Init()) + assert.NoError(t, dw.Init(WithURLs("https://fake-dataway.com?token=tkn_xxxxxxxxxx"))) pts := []*point.Point{ point.NewPointV2("some", @@ -157,19 +151,19 @@ func TestGroupPoint(t *T.T) { t.Run("random-pts-on-logging", func(t *T.T) { metricsReset() - dw := &Dataway{ - URLs: []string{ - "https://fake-dataway.com?token=tkn_xxxxxxxxxx", - }, - GlobalCustomerKeys: []string{"source"}, - EnableSinker: true, - GZip: true, - } - assert.NoError(t, dw.Init(WithGlobalTags(map[string]string{ - "tag1": "value1", - "tag2": "value2", - }))) + dw := NewDefaultDataway() + + dw.GlobalCustomerKeys = []string{"source"} + dw.EnableSinker = true + dw.GZip = true + + assert.NoError(t, dw.Init( + WithURLs("https://fake-dataway.com?token=tkn_xxxxxxxxxx"), + WithGlobalTags(map[string]string{ + "tag1": "value1", + "tag2": "value2", + }))) r := point.NewRander(point.WithFixedTags(true), point.WithRandText(3)) @@ -194,19 +188,18 @@ func TestGroupPoint(t *T.T) { t.Run("basic", func(t *T.T) { metricsReset() - dw := &Dataway{ - URLs: []string{ - "https://fake-dataway.com?token=tkn_xxxxxxxxxx", - }, - GlobalCustomerKeys: []string{"namespace", "app"}, - EnableSinker: true, - GZip: true, - } + dw := NewDefaultDataway() - assert.NoError(t, dw.Init(WithGlobalTags(map[string]string{ - "tag1": "value1", - "tag2": "value2", - }))) + dw.GlobalCustomerKeys = []string{"namespace", "app"} + dw.EnableSinker = true + dw.GZip = true + + assert.NoError(t, dw.Init( + WithURLs("https://fake-dataway.com?token=tkn_xxxxxxxxxx"), + WithGlobalTags(map[string]string{ + "tag1": "value1", + "tag2": "value2", + }))) pts := []*point.Point{ point.NewPointV2("some", @@ -256,17 +249,13 @@ func TestGroupPoint(t *T.T) { t.Run("no-global-tags", func(t *T.T) { metricsReset() - dw := &Dataway{ - URLs: []string{ - "https://fake-dataway.com?token=tkn_xxxxxxxxxx", - }, - - EnableSinker: true, - GlobalCustomerKeys: []string{"namespace", "app"}, - GZip: true, - } - assert.NoError(t, dw.Init()) + dw := NewDefaultDataway() + dw.EnableSinker = true + dw.GlobalCustomerKeys = []string{"namespace", "app"} + dw.GZip = true + + assert.NoError(t, dw.Init(WithURLs("https://fake-dataway.com?token=tkn_xxxxxxxxxx"))) pts := []*point.Point{ point.NewPointV2("some", @@ -316,16 +305,12 @@ func TestGroupPoint(t *T.T) { t.Run("no-global-tags-on-object", func(t *T.T) { metricsReset() - dw := &Dataway{ - URLs: []string{ - "https://fake-dataway.com?token=tkn_xxxxxxxxxx", - }, - GlobalCustomerKeys: []string{"class"}, - EnableSinker: 
true, - GZip: true, - } + dw := NewDefaultDataway() + dw.GlobalCustomerKeys = []string{"class"} + dw.EnableSinker = true + dw.GZip = true - assert.NoError(t, dw.Init()) + assert.NoError(t, dw.Init(WithURLs("https://fake-dataway.com?token=tkn_xxxxxxxxxx"))) pts := []*point.Point{ point.NewPointV2("some", @@ -378,15 +363,12 @@ func TestGroupPoint(t *T.T) { t.Run("no-global-tags-no-customer-tag-keys", func(t *T.T) { metricsReset() - dw := &Dataway{ - URLs: []string{ - "https://fake-dataway.com?token=tkn_xxxxxxxxxx", - }, - EnableSinker: true, - GZip: true, - } + dw := NewDefaultDataway() + + dw.EnableSinker = true + dw.GZip = true - assert.NoError(t, dw.Init()) + assert.NoError(t, dw.Init(WithURLs("https://fake-dataway.com?token=tkn_xxxxxxxxxx"))) pts := []*point.Point{ point.NewPointV2("some", diff --git a/internal/io/dataway/wal.go b/internal/io/dataway/wal.go new file mode 100644 index 0000000000..19388f4772 --- /dev/null +++ b/internal/io/dataway/wal.go @@ -0,0 +1,224 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the MIT License. +// This product includes software developed at Guance Cloud (https://www.guance.com/). +// Copyright 2021-present Guance, Inc. + +package dataway + +import ( + "errors" + "path/filepath" + "time" + + "github.com/GuanceCloud/cliutils/diskcache" + "github.com/GuanceCloud/cliutils/point" +) + +var defaultRotateAt = 3 * time.Second + +type Cache interface { + // NOTE: reuse callback in diskcache to keep interface ok + // it's better to define Get as + // Get() []byte + BufGet([]byte, diskcache.Fn) error + Put([]byte) error + Size() int64 + Close() error +} + +type WALConf struct { + MaxCapacityGB float64 `toml:"max_capacity_gb"` + Workers int `toml:"workers"` + MemCap int `toml:"mem_cap,omitempty"` + Path string `toml:"path,omitempty"` + FailCacheCleanInterval time.Duration `toml:"fail_cache_clean_interval"` +} + +type WALQueue struct { + disk Cache + mem chan *body + dw *Dataway // back-ref to dataway configures +} + +func NewWAL(dw *Dataway, c Cache) *WALQueue { + q := &WALQueue{ + disk: c, + dw: dw, + } + + if dw.WAL.Workers <= 0 { + dw.WAL.Workers = 1 // set minimal worker, or no flush worker running. + } + + // TIPS: set mem_cap = -1 to disable mem-queue + if dw.WAL.MemCap != -1 && dw.WAL.Workers > 0 { + dw.WAL.MemCap = dw.WAL.Workers + } + + if dw.WAL.MemCap >= 0 { + l.Infof("set wal mem queue cap to %d", dw.WAL.MemCap) + q.mem = make(chan *body, dw.WAL.MemCap) + } + + return q +} + +// Put put a ready-to-send Dataway body to the send queue. +func (q *WALQueue) Put(b *body) error { + select { + case q.mem <- b: // @b will reuse by flush worker + walPointCounterVec.WithLabelValues(b.cat().Alias(), "M").Add(float64(b.npts())) + return nil + default: // pass: put b into disk WAL + } + + putStatus := "" + + l.Debugf("dump body %s to disk queue", b) + + defer func() { + if putStatus != "" { + walPointCounterVec.WithLabelValues(b.cat().Alias(), putStatus).Add(float64(b.npts())) + } + defer putBody(b) // b has dump to disk, do not used any more. + }() + + if x, err := b.dump(); err != nil { + putStatus = "drop" + return err + } else { + if err := q.disk.Put(x); err != nil { + putStatus = "drop" + return err + } else { + // NOTE: do not set putStatus here, we'll update walPointCounterVec during Get(). + return nil + } + } +} + +// Get fetch a ready-to-send Dataway body from the send queue. 
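+// It first drains the in-memory channel (fast path); when that is empty it falls back to the
+// disk queue, reusing the buffers supplied via bodyOpt. A nil body with a nil error means no data.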
+func (q *WALQueue) Get(opts ...bodyOpt) (*body, error) { + var b *body + select { + case b = <-q.mem: + // fast path: we get body from WAL.mem + return b, nil // NOTE: no opts are applied to @b if comes from channel + default: // pass: then read from disk queue. + } + + // slow path: we get body from WAL.disk + b = getReuseBufferBody(opts...) + + defer func() { + if len(b.buf()) == 0 { // no data read from disk + putBody(b) + } else { + // Update the metric within Get,because after datakit start, there may be old + // cached data in WAL.disk, we'd add them to current running datakit's metric. + walPointCounterVec.WithLabelValues(b.cat().Alias(), "D").Add(float64(b.npts())) + } + }() + + var raw []byte + if err := q.disk.BufGet(b.marshalBuf, func(x []byte) error { + raw = x + // ASAP ok on Get: we should not occupy the Get lock here, and other flush workers + // need to read next raw body. + return nil + }); err != nil { + if errors.Is(err, diskcache.ErrNoData) { + return nil, nil + } + + l.Errorf("BufGet: %s", err) + return nil, err + } + + if len(raw) == 0 { // no job available + return nil, nil + } + + l.Debugf("from queue %d get bytes", len(raw)) + + if err := b.loadCache(raw); err != nil { + return nil, err + } + + b.from = walFromDisk + b.gzon = isGzip(b.buf()) + + return b, nil +} + +type walBodyCallback func(*body) error + +// DiskGet will fallback if callback failed. +func (q *WALQueue) DiskGet(fn walBodyCallback, opts ...bodyOpt) error { + b := getReuseBufferBody(opts...) + + if err := q.disk.BufGet(b.marshalBuf, func(x []byte) error { + if len(x) == 0 { + return nil + } + + if err := b.loadCache(x); err != nil { + l.Warnf("load cache failed: %s, ignored", err) + return nil + } + + b.from = walFromDisk + b.gzon = isGzip(b.buf()) + if err := fn(b); err != nil { + l.Warnf("walBodyCallback: %s, we try again, ignored", err) + return err + } else { + return nil + } + }); err != nil { + if errors.Is(err, diskcache.ErrNoData) { + return nil + } else { + return err + } + } + + return nil +} + +func (dw *Dataway) doSetupWAL(cacheDir string) (*WALQueue, error) { + dc, err := diskcache.Open( + diskcache.WithPath(cacheDir), + + // drop new data if cache full, no matter normal WAL or fail-cache WAL. + diskcache.WithFILODrop(true), + + diskcache.WithCapacity(int64(dw.WAL.MaxCapacityGB*float64(1<<30))), + diskcache.WithWakeup(defaultRotateAt), // short wakeup on wal queue + ) + if err != nil { + l.Errorf("NewWALCache %s with capacity %f GB: %s", cacheDir, dw.WAL.MaxCapacityGB, err.Error()) + return nil, err + } + + l.Infof("diskcache.New ok(%q) of %f GB", dw.WAL.Path, dw.WAL.MaxCapacityGB) + + return NewWAL(dw, dc), nil +} + +func (dw *Dataway) setupWAL() error { + for _, cat := range point.AllCategories() { + if wal, err := dw.doSetupWAL(filepath.Join(dw.WAL.Path, cat.String())); err != nil { + return err + } else { + dw.walq[cat] = wal + } + } + + if wal, err := dw.doSetupWAL(filepath.Join(dw.WAL.Path, "fc")); err != nil { + return err + } else { + dw.walFail = wal + } + return nil +} diff --git a/internal/io/dataway/wal_test.go b/internal/io/dataway/wal_test.go new file mode 100644 index 0000000000..5edd365233 --- /dev/null +++ b/internal/io/dataway/wal_test.go @@ -0,0 +1,176 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the MIT License. +// This product includes software developed at Guance Cloud (https://www.guance.com/). +// Copyright 2021-present Guance, Inc. 
+ +package dataway + +import ( + T "testing" + "time" + + "github.com/GuanceCloud/cliutils/diskcache" + "github.com/GuanceCloud/cliutils/point" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestWALLoad(t *T.T) { + t.Run(`basic`, func(t *T.T) { + dw := NewDefaultDataway() + + dw.WAL.Path = t.TempDir() + + assert.NoError(t, dw.Init()) + assert.NoError(t, dw.setupWAL()) + + cat := point.Logging + pts := point.RandPoints(100) + w := getWriter(WithPoints(pts), + WithCategory(cat), + WithBodyCallback(dw.enqueueBody), + WithHTTPEncoding(dw.contentEncoding)) + + w.buildPointsBody() + + b, err := dw.walq[cat].Get() + require.NoError(t, err) + require.NotNil(t, b) + assert.Equal(t, walFromMem, b.from) + + defer putBody(b) + + dec := point.GetDecoder(point.WithDecEncoding(dw.contentEncoding)) + defer point.PutDecoder(dec) + + // check if body in WAL are the same as @pts + got, err := dec.Decode(b.buf()) + assert.NoError(t, err) + assert.Equal(t, len(pts), len(got)) + }) + + t.Run(`no-mem-queue`, func(t *T.T) { + dw := NewDefaultDataway() + + dw.WAL.Path = t.TempDir() + dw.WAL.MemCap = -1 // disable mem-queue + + assert.NoError(t, dw.Init()) + assert.NoError(t, dw.setupWAL()) + + cat := point.Logging + pts := point.RandPoints(100) + w := getWriter(WithPoints(pts), + WithCategory(cat), + WithBodyCallback(dw.enqueueBody), + WithHTTPEncoding(dw.contentEncoding)) + + w.buildPointsBody() + + dc := dw.walq[cat].disk.(*diskcache.DiskCache) + assert.NoError(t, dc.Rotate()) // force rotate + + f := dw.newFlusher(cat) + + b, err := f.wal.Get(withReusableBuffer(f.sendBuf, f.marshalBuf)) + require.NoError(t, err) + require.NotNil(t, b) + assert.Equal(t, walFromDisk, b.from) + + defer putBody(b) + + dec := point.GetDecoder(point.WithDecEncoding(dw.contentEncoding)) + defer point.PutDecoder(dec) + + // check if body in WAL are the same as @pts + got, err := dec.Decode(b.buf()) + assert.NoError(t, err) + assert.Equal(t, len(pts), len(got)) + }) + + t.Run(`no-mem-queue-auto-rotate`, func(t *T.T) { + dw := NewDefaultDataway() + + dw.WAL.Path = t.TempDir() + dw.WAL.MemCap = -1 // disable mem-queue + + assert.NoError(t, dw.Init()) + assert.NoError(t, dw.setupWAL()) + + cat := point.Logging + pts := point.RandPoints(100) + w := getWriter(WithPoints(pts), + WithCategory(cat), + WithBodyCallback(dw.enqueueBody), + WithHTTPEncoding(dw.contentEncoding)) + + w.buildPointsBody() + + time.Sleep(time.Second * 4) // default auto rotate is 3sec + + f := dw.newFlusher(cat) + + b, err := f.wal.Get(withReusableBuffer(f.sendBuf, f.marshalBuf)) + require.NoError(t, err) + require.NotNil(t, b) + assert.Equal(t, walFromDisk, b.from) + + defer putBody(b) + + dec := point.GetDecoder(point.WithDecEncoding(dw.contentEncoding)) + defer point.PutDecoder(dec) + + // check if body in WAL are the same as @pts + got, err := dec.Decode(b.buf()) + assert.NoError(t, err) + assert.Equal(t, len(pts), len(got)) + }) + + t.Run(`full-mem-queue`, func(t *T.T) { + dw := NewDefaultDataway() + + dw.WAL.Path = t.TempDir() + + assert.NoError(t, dw.Init()) + assert.NoError(t, dw.setupWAL()) + + cat := point.Logging + pts := point.RandPoints(100) + w := getWriter(WithPoints(pts), + WithCategory(cat), + WithBodyCallback(dw.enqueueBody), + WithHTTPEncoding(dw.contentEncoding)) + + w.buildPointsBody() + w.buildPointsBody() // 2nd write will dump to disk + + time.Sleep(time.Second * 4) // default auto rotate is 3sec + + f := dw.newFlusher(cat) + + for i := 0; i < 2; i++ { + b, err := f.wal.Get(withReusableBuffer(f.sendBuf, 
f.marshalBuf)) + require.NoError(t, err) + require.NotNil(t, b) + + dec := point.GetDecoder(point.WithDecEncoding(dw.contentEncoding)) + defer point.PutDecoder(dec) + + // check if body in WAL are the same as @pts + got, err := dec.Decode(b.buf()) + assert.NoError(t, err) + assert.Equal(t, len(pts), len(got)) + if i == 0 { // from mem + assert.Equal(t, walFromMem, b.from) + } else { // from disk + assert.Equal(t, walFromDisk, b.from) + } + + putBody(b) + } + + b, err := f.wal.Get(withReusableBuffer(f.sendBuf, f.marshalBuf)) // no data any more + assert.Nil(t, b) + assert.NoError(t, err) + }) +} diff --git a/internal/io/dataway/write.go b/internal/io/dataway/write.go index 1d6ac848f1..78dada02be 100644 --- a/internal/io/dataway/write.go +++ b/internal/io/dataway/write.go @@ -8,13 +8,8 @@ package dataway import ( "bytes" "compress/gzip" - "errors" - "fmt" - "github.com/GuanceCloud/cliutils/diskcache" "github.com/GuanceCloud/cliutils/point" - "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/failcache" - pb "google.golang.org/protobuf/proto" ) var MaxKodoBody = 10 * 1000 * 1000 @@ -45,12 +40,6 @@ func WithDynamicURL(urlStr string) WriteOption { } } -func WithFailCache(fc failcache.Cache) WriteOption { - return func(w *writer) { - w.fc = fc - } -} - func WithCacheAll(on bool) WriteOption { return func(w *writer) { w.cacheAll = on @@ -63,17 +52,9 @@ func WithCacheClean(on bool) WriteOption { } } -func WithGzip(on bool) WriteOption { +func WithGzip(on int) WriteOption { return func(w *writer) { w.gzip = on - - if on && w.zipper == nil { - buf := bytes.Buffer{} - w.zipper = &gzipWriter{ - buf: &buf, - w: gzip.NewWriter(&buf), - } - } } } @@ -83,32 +64,23 @@ func WithBatchSize(n int) WriteOption { } } -func WithBatchBytesSize(n int) WriteOption { +func WithHTTPEncoding(t point.Encoding) WriteOption { return func(w *writer) { - if n > 0 { - w.batchBytesSize = n - if w.sendBuffer == nil { - w.sendBuffer = make([]byte, w.batchBytesSize) - } - } + w.httpEncoding = t } } -func WithHTTPEncoding(t point.Encoding) WriteOption { +func WithMaxBodyCap(x int) WriteOption { return func(w *writer) { - w.httpEncoding = t + if x > 0 { + w.batchBytesSize = x + } } } -// WithReusable reset some fields of the writer. -// This make it able to use the writer multiple times before put back to pool. -func WithReusable() WriteOption { +func WithBodyCallback(cb bodyCallback) WriteOption { return func(w *writer) { - w.parts = 0 - if w.zipper != nil { - w.zipper.buf.Reset() - w.zipper.w.Reset(w.zipper.buf) - } + w.bcb = cb } } @@ -121,73 +93,20 @@ type writer struct { category point.Category dynamicURL string - body *body - points []*point.Point - sendBuffer []byte - // if bothe batch limit set, prefer batchBytesSize. 
batchBytesSize int // limit point pyaload bytes approximately batchSize int // limit point count - parts int - - zipper *gzipWriter httpEncoding point.Encoding - gzip bool + gzip int cacheClean, cacheAll bool httpHeaders map[string]string - fc failcache.Cache -} - -func isGzip(data []byte) bool { - if len(data) < 2 { - return false - } - - // See: https://stackoverflow.com/a/6059342/342348 - return data[0] == 0x1f && data[1] == 0x8b -} - -func loadCache(data []byte) (*CacheData, error) { - pd := &CacheData{} - if err := pb.Unmarshal(data, pd); err != nil { - return nil, fmt.Errorf("loadCache: %w", err) - } - - return pd, nil -} - -func (dw *Dataway) cleanCache(w *writer, data []byte) error { - pd, err := loadCache(data) - if err != nil { - log.Warnf("pb.Unmarshal(%d bytes -> %s): %s, ignored", len(data), w.category, err) - return nil - } - - cat := point.Category(pd.Category) - enc := point.Encoding(pd.PayloadType) - - WithGzip(isGzip(pd.Payload))(w) // check if bytes is gzipped - WithCategory(cat)(w) // use category in cached data - WithHTTPEncoding(enc)(w) - - for _, ep := range dw.eps { - // If some of endpoint send ok, any failed write will cause re-write on these ok ones. - // So, do NOT configure multiple endpoint in dataway URL list. - if err := ep.writePointData(w, &body{buf: pd.Payload}); err != nil { - log.Warnf("cleanCache: %s", err) - return err - } - } - - // only set metric on clean-ok - flushFailCacheVec.WithLabelValues(cat.String()).Observe(float64(len(pd.Payload))) - return nil + bcb bodyCallback } func (dw *Dataway) doGroupPoints(ptg *ptGrouper, cat point.Category, points []*point.Point) { @@ -201,7 +120,7 @@ func (dw *Dataway) doGroupPoints(ptg *ptGrouper, cat point.Category, points []*p tv := ptg.sinkHeaderValue(dw.globalTags, dw.GlobalCustomerKeys) - log.Debugf("add point to group %q", tv) + l.Debugf("add point to group %q", tv) ptg.groupedPts[tv] = append(ptg.groupedPts[tv], pt) } @@ -216,14 +135,20 @@ func (dw *Dataway) groupPoints(ptg *ptGrouper, } func (dw *Dataway) Write(opts ...WriteOption) error { + gzOn := 0 + if dw.GZip { + gzOn = 1 + } + w := getWriter( // set content encoding(protobuf/line-protocol/json) WithHTTPEncoding(dw.contentEncoding), // setup gzip on or off - WithGzip(dw.GZip), + WithGzip(gzOn), // set raw body size limit - WithBatchBytesSize(dw.MaxRawBodySize), + WithMaxBodyCap(dw.MaxRawBodySize), ) + defer putWriter(w) // Append extra wirte options from caller @@ -233,35 +158,14 @@ func (dw *Dataway) Write(opts ...WriteOption) error { } } - if w.cacheClean { - if w.fc == nil { - return nil - } - - if err := w.fc.Get(func(x []byte) error { - if len(x) == 0 { - return nil - } - - log.Debugf("try flush %d bytes on %q", len(x), w.category) - - return dw.cleanCache(w, x) - }); err != nil { - if !errors.Is(err, diskcache.ErrEOF) { - log.Warnf("on %s failcache.Get: %s, ignored", w.category, err) - } - } - - // always ok on clean-cache - return nil + if w.bcb == nil { // set default callback + w.bcb = dw.enqueueBody } // split single point array into multiple part according to // different X-Global-Tags. 
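+	// sinkEnabled(): sinker switched on, at least one global tag or customer key, and at least one endpoint.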
- if dw.EnableSinker && - (len(dw.globalTags) > 0 || len(dw.GlobalCustomerKeys) > 0) && - len(dw.eps) > 0 { - log.Debugf("under sinker...") + if dw.sinkEnabled() { + l.Debugf("under sinker...") ptg := getGrouper() defer putGrouper(ptg) @@ -269,25 +173,24 @@ func (dw *Dataway) Write(opts ...WriteOption) error { dw.groupPoints(ptg, w.category, w.points) for k, points := range ptg.groupedPts { - WithReusable()(w) WithHTTPHeader(HeaderXGlobalTags, k)(w) WithPoints(points)(w) - // only apply to 1st dataway address - if err := dw.eps[0].writePoints(w); err != nil { + if err := w.buildPointsBody(); err != nil { return err } } } else { - // write points to multiple endpoints - for _, ep := range dw.eps { - WithReusable()(w) - - if err := ep.writePoints(w); err != nil { - return err - } + if err := w.buildPointsBody(); err != nil { + return err } } return nil } + +func (dw *Dataway) sinkEnabled() bool { + return dw.EnableSinker && + (len(dw.globalTags) > 0 || len(dw.GlobalCustomerKeys) > 0) && + len(dw.eps) > 0 +} diff --git a/internal/io/dataway/write_test.go b/internal/io/dataway/write_test.go index ce71a4a027..3eb75f1f11 100644 --- a/internal/io/dataway/write_test.go +++ b/internal/io/dataway/write_test.go @@ -6,8 +6,6 @@ package dataway import ( - "bytes" - "compress/gzip" "fmt" "io" "net/http" @@ -16,12 +14,12 @@ import ( "time" "github.com/GuanceCloud/cliutils/diskcache" - lp "github.com/GuanceCloud/cliutils/lineproto" "github.com/GuanceCloud/cliutils/metrics" uhttp "github.com/GuanceCloud/cliutils/network/http" "github.com/GuanceCloud/cliutils/point" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/datakit" ) @@ -32,7 +30,7 @@ func TestIsGZip(t *T.T) { gz, err := datakit.GZip(data) assert.NoError(t, err) - assert.True(t, isGzip(gz)) + assert.Equal(t, int8(1), isGzip(gz)) }) } @@ -40,65 +38,108 @@ func TestFailCache(t *T.T) { t.Run(`test-failcache-data`, func(t *T.T) { // server to accept not-sinked points(2 pts) ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - t.Logf("category: %s", r.URL.Path) + t.Logf("%s category: %s", time.Now(), r.URL.Path) + for k, v := range r.Header { + t.Logf("%s: %s", k, v) + } w.WriteHeader(http.StatusInternalServerError) // mocked dataway fail })) + defer ts.Close() + time.Sleep(time.Second) + + reg := prometheus.NewRegistry() + + reg.MustRegister(diskcache.Metrics()...) + reg.MustRegister(Metrics()...) 
+ t.Cleanup(func() { - ts.Close() + metricsReset() + diskcache.ResetMetrics() }) - p := t.TempDir() - fc, err := diskcache.Open(diskcache.WithPath(p)) - assert.NoError(t, err) + cat := point.Logging - dw := &Dataway{ - URLs: []string{fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)}, - GZip: true, - } + dw := NewDefaultDataway() + dw.WAL.Path = t.TempDir() + dw.MaxRetryCount = 1 - assert.NoError(t, dw.Init()) + require.NoError(t, dw.Init(WithURLs(ts.URL))) + require.NoError(t, dw.setupWAL()) pts := point.RandPoints(100) // write logging - assert.NoError(t, dw.Write(WithCategory(point.Logging), - WithFailCache(fc), - WithPoints(pts))) + w := getWriter(WithPoints(pts), + WithCategory(cat), + WithBodyCallback(func(w *writer, b *body) error { + return dw.doFlush(w, b) + }), + WithHTTPEncoding(dw.contentEncoding)) + defer putWriter(w) + + // POST body to @ts + assert.NoError(t, w.buildPointsBody()) + + // POST fail and the request dumpped to diskcache + dc := dw.walFail.disk.(*diskcache.DiskCache) + assert.NoError(t, dc.Rotate()) // force rotate + require.True(t, dc.Size() > 0) - assert.NoError(t, fc.Rotate()) // force rotate + // check if BufGet ok + assert.Equal(t, "always error", dc.BufGet(nil, func(x []byte) error { // not EOF + t.Logf("get %d bytes", len(x)) + return fmt.Errorf("always error") + }).Error()) - assert.NoError(t, fc.Get(func(x []byte) error { - if len(x) == 0 { + assert.Equal(t, "error again", dc.BufGet(nil, func(x []byte) error { // auto fallback and not EOF + t.Logf("get %d bytes", len(x)) + return fmt.Errorf("error again") + }).Error()) + + t.Logf("diskcache:\n%s", dc.Pretty()) + + f := dw.newFlusher(cat) + + assert.Error(t, f.cleanFailCache()) // clean cache retry will fail: @ts still return 5XX + + // we can also get the cache from fail-cache: the diskcache will rollback the failed Get(). + assert.NoError(t, dw.walFail.DiskGet(func(b *body) error { + defer putBody(b) + + if len(b.buf()) == 0 { return nil } - pd, err := loadCache(x) - assert.NoError(t, err) - // check cached data - assert.True(t, isGzip(pd.Payload)) - assert.Equal(t, point.Logging, point.Category(pd.Category)) - assert.Equal(t, point.LineProtocol, point.Encoding(pd.PayloadType)) + assert.Equal(t, int8(1), isGzip(b.buf())) + assert.Equal(t, cat, b.cat()) + assert.Equal(t, point.Protobuf, b.enc()) // unmarshal payload - r, err := gzip.NewReader(bytes.NewBuffer(pd.Payload)) + x, err := uhttp.Unzip(b.buf()) assert.NoError(t, err) - buf := bytes.NewBuffer(nil) - _, err = io.Copy(buf, r) - assert.NoError(t, err) - - dec := point.GetDecoder(point.WithDecEncoding(point.LineProtocol)) + dec := point.GetDecoder(point.WithDecEncoding(dw.contentEncoding)) defer point.PutDecoder(dec) - got, err := dec.Decode(buf.Bytes()) + got, err := dec.Decode(x) assert.NoError(t, err) assert.Len(t, got, len(pts)) return nil + }, withReusableBuffer(f.sendBuf, f.marshalBuf))) + + assert.Equal(t, diskcache.ErrNoData, dc.BufGet(nil, func([]byte) error { + return nil // make sure no data available in fail-cache })) + + t.Logf("diskcache: %s", dc.Pretty()) + + mfs, err := reg.Gather() + assert.NoError(t, err) + t.Logf("metrics:\n%s", metrics.MetricFamily2Text(mfs)) }) } @@ -117,31 +158,45 @@ func TestWriteWithCache(t *T.T) { reg.MustRegister(diskcache.Metrics()...) reg.MustRegister(Metrics()...) 
- p := t.TempDir() - fc, err := diskcache.Open(diskcache.WithPath(p)) - assert.NoError(t, err) + dw := NewDefaultDataway() + dw.EnableHTTPTrace = true + dw.HTTPTimeout = 10 * time.Millisecond // easy timeout + dw.GZip = true + dw.WAL.Path = t.TempDir() + dw.MaxRetryCount = 1 - dw := &Dataway{ - URLs: []string{fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)}, - EnableHTTPTrace: true, - HTTPTimeout: 10 * time.Millisecond, // easy timeout - GZip: true, - } - assert.NoError(t, dw.Init()) + assert.NoError(t, dw.Init(WithURLs(fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)))) + require.NoError(t, dw.setupWAL()) pts := point.RandPoints(100) - - // write dialtesting on category logging - assert.NoError(t, dw.Write( - WithCategory(point.DynamicDWCategory), - WithFailCache(fc), - WithPoints(pts), WithDynamicURL(fmt.Sprintf("%s/v1/write/logging?token=tkn_for_dialtesting", ts.URL)))) - - // write metric - assert.NoError(t, dw.Write(WithCategory(point.Metric), WithPoints(pts))) + cat := point.DynamicDWCategory + + dynURL := fmt.Sprintf("%s/v1/write/logging?token=tkn_for_dialtesting", ts.URL) + + // write dial-testing + w := getWriter(WithPoints(pts), + WithCategory(cat), + WithDynamicURL(dynURL), + WithBodyCallback(func(w *writer, b *body) error { + t.Logf("doFlush to %s", w.dynamicURL) + return dw.doFlush(w, b, WithDynamicURL(dynURL)) + }), + WithHTTPEncoding(dw.contentEncoding)) + defer putWriter(w) + + // POST dial-testing body to @ts + assert.NoError(t, w.buildPointsBody()) + + WithCategory(point.Metric)(w) // try send metric, we need to reset w + WithBodyCallback(func(w *writer, b *body) error { // change callback + return dw.doFlush(w, b) + })(w) + // POST metric body to @ts + assert.NoError(t, w.buildPointsBody()) // check cache content - assert.NoError(t, fc.Rotate()) // force rotate + dc := dw.walFail.disk.(*diskcache.DiskCache) + assert.NoError(t, dc.Rotate()) // force rotate mfs, err := reg.Gather() assert.NoError(t, err) @@ -161,7 +216,6 @@ func TestWriteWithCache(t *T.T) { assert.Equal(t, float64(100), m.GetCounter().GetValue(), metrics.MetricFamily2Text(mfs)) t.Cleanup(func() { - assert.NoError(t, fc.Close()) metricsReset() diskcache.ResetMetrics() }) @@ -173,6 +227,9 @@ func TestWriteWithCache(t *T.T) { t.Logf("category: %s", r.URL.Path) w.WriteHeader(http.StatusInternalServerError) // mocked dataway fail })) + defer ts.Close() + + time.Sleep(time.Second) reg := prometheus.NewRegistry() reg.MustRegister(diskcache.Metrics()...) 
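For orientation, the pattern these reworked tests follow (and which later subtests repeat) is roughly the sketch below; it only uses identifiers introduced in this patch, assumes it runs inside a test in the dataway package, and elides error handling:

	dw := NewDefaultDataway()
	dw.WAL.Path = t.TempDir() // per-category disk queues (plus the fail-cache queue) live under this dir
	_ = dw.Init(WithURLs("https://fake-dataway.com?token=tkn_xxxxxxxxxx"))
	_ = dw.setupWAL()

	pts := point.RandPoints(100)

	// A writer turns points into bodies; the body callback decides what happens to each body,
	// e.g. dw.enqueueBody (queue into the WAL) or a closure around dw.doFlush (POST immediately).
	w := getWriter(
		WithPoints(pts),
		WithCategory(point.Logging),
		WithHTTPEncoding(dw.contentEncoding),
		WithBodyCallback(dw.enqueueBody),
	)
	defer putWriter(w)
	_ = w.buildPointsBody() // bodies go to the memory queue first; overflow is dumped to disk

	// A flusher (or the test itself) later drains the queue and sends or inspects each body.
	b, _ := dw.walq[point.Logging].Get()
	defer putBody(b)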
@@ -180,52 +237,57 @@ func TestWriteWithCache(t *T.T) { defer ts.Close() - p := t.TempDir() - fc, err := diskcache.Open(diskcache.WithPath(p)) - assert.NoError(t, err) + dw := NewDefaultDataway() + dw.EnableHTTPTrace = true + dw.GZip = true + dw.WAL.Path = t.TempDir() + dw.MaxRetryCount = 1 - dw := &Dataway{ - URLs: []string{fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)}, - EnableHTTPTrace: true, - GZip: true, - } - assert.NoError(t, dw.Init()) + assert.NoError(t, dw.Init(WithURLs(fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)))) + require.NoError(t, dw.setupWAL()) + cat := point.Logging pts := point.RandPoints(100) - // write logging - assert.NoError(t, dw.Write(WithCategory(point.Logging), - WithFailCache(fc), - WithPoints(pts))) + w := getWriter(WithPoints(pts), + WithCategory(cat), + WithBodyCallback(func(w *writer, b *body) error { + return dw.doFlush(w, b) + }), + WithHTTPEncoding(dw.contentEncoding)) + defer putWriter(w) + + assert.NoError(t, w.buildPointsBody()) // check cache content - assert.NoError(t, fc.Rotate()) + dc := dw.walFail.disk.(*diskcache.DiskCache) + dc.Size() + assert.NoError(t, dc.Rotate()) // force rotate + + f := dw.newFlusher(cat) // try clean cache, but API still failed, and again put to cache - assert.NoError(t, dw.Write(WithCategory(point.Logging), - WithFailCache(fc), - WithCacheClean(true))) + assert.Contains(t, f.cleanFailCache().Error(), "internal error") mfs, err := reg.Gather() assert.NoError(t, err) t.Logf("metrics: %s", metrics.MetricFamily2Text(mfs)) - m := metrics.GetMetricOnLabels(mfs, "diskcache_get_total", p) + m := metrics.GetMetricOnLabels(mfs, "diskcache_get_bytes", dc.Path()) // only 1 get(in dw.Write with-cache-clean) - assert.Equal(t, 1.0, m.GetCounter().GetValue()) + assert.Equal(t, uint64(1), m.GetSummary().GetSampleCount()) // 1 put(dw.Write with-cache-clean failed do not add another Put) - m = metrics.GetMetricOnLabels(mfs, "diskcache_put_total", p) - assert.Equal(t, 1.0, m.GetCounter().GetValue()) + m = metrics.GetMetricOnLabels(mfs, "diskcache_put_bytes", dc.Path()) + assert.Equal(t, uint64(1), m.GetSummary().GetSampleCount()) // put-bytes same as get-bytes: 2 puts only trigger 1 cache,the 2nd do nothing - mput := metrics.GetMetricOnLabels(mfs, "diskcache_put_bytes_total", p).GetCounter().GetValue() - mget := metrics.GetMetricOnLabels(mfs, "diskcache_get_bytes_total", p).GetCounter().GetValue() + mput := metrics.GetMetricOnLabels(mfs, "diskcache_put_bytes", dc.Path()).GetSummary().GetSampleSum() + mget := metrics.GetMetricOnLabels(mfs, "diskcache_get_bytes", dc.Path()).GetSummary().GetSampleSum() assert.Equal(t, 1.0, mput/mget) t.Cleanup(func() { - assert.NoError(t, fc.Close()) metricsReset() diskcache.ResetMetrics() }) @@ -234,8 +296,11 @@ func TestWriteWithCache(t *T.T) { func TestX(t *T.T) { t.Run("write-100pts-with-group", func(t *T.T) { + cat := point.Logging + npts := 100 + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - assert.Equal(t, datakit.Logging, r.URL.Path) + assert.Equal(t, cat.URL(), r.URL.Path) body, err := io.ReadAll(r.Body) defer r.Body.Close() @@ -249,9 +314,13 @@ func TestX(t *T.T) { x, err = uhttp.Unzip(body) assert.NoError(t, err) - pts, err := lp.ParsePoints(x, nil) + dec := point.GetDecoder(point.WithDecEncoding(point.Protobuf)) + defer point.PutDecoder(dec) + + pts, err := dec.Decode(x) assert.NoError(t, err) - assert.Len(t, pts, 100) + + assert.Len(t, pts, npts) for k := range r.Header { t.Logf("%s: %s", k, r.Header.Get(k)) @@ -267,28 +336,39 @@ 
func TestX(t *T.T) { reg.MustRegister(diskcache.Metrics()...) reg.MustRegister(Metrics()...) - pts := point.RandPoints(100) + pts := point.RandPoints(npts) // add extra tags to match group tag key/value for _, pt := range pts { pt.MustAddTag("tag1", "value1") pt.MustAddTag("tag2", "value2") + // NOTE: tag3 not added } - dw := &Dataway{ - URLs: []string{fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)}, - EnableSinker: true, - GZip: true, - // GlobalCustomerKeys: []string{"tag1", "tag2"}, - } + dw := NewDefaultDataway() + dw.EnableSinker = true + dw.GZip = true + dw.WAL.Path = t.TempDir() + dw.MaxRetryCount = 1 + dw.GlobalCustomerKeys = nil // no customer keys, all sinker group based on global-tags + assert.NoError(t, dw.Init( + WithURLs(fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)), WithGlobalTags(map[string]string{ // add global tag as match group tag key/value "tag1": "value1", "tag2": "value2", - "tag3": "value3", // not used + "tag3": "value3", // not used in random point, so we should not sink on this tag. }))) + require.NoError(t, dw.setupWAL()) - assert.NoError(t, dw.Write(WithCategory(point.Logging), WithPoints(pts))) + assert.NoError(t, dw.Write(WithCategory(point.Logging), + WithPoints(pts), + WithHTTPEncoding(dw.contentEncoding), + WithBodyCallback(func(w *writer, b *body) error { + t.Logf("body: %s", b) + return dw.doFlush(w, b) + }), + )) t.Cleanup(func() { ts.Close() @@ -315,7 +395,10 @@ func TestWritePoints(t *T.T) { x, err = uhttp.Unzip(body) assert.NoError(t, err) - pts, err := lp.ParsePoints(x, nil) + dec := point.GetDecoder(point.WithDecEncoding(point.Protobuf)) + defer point.PutDecoder(dec) + + pts, err := dec.Decode(x) assert.NoError(t, err) assert.Len(t, pts, 100) @@ -328,14 +411,21 @@ func TestWritePoints(t *T.T) { reg.MustRegister(Metrics()...) 
pts := point.RandPoints(100) + dw := NewDefaultDataway() + dw.GZip = true + dw.WAL.Path = t.TempDir() + dw.MaxRetryCount = 1 - dw := &Dataway{ - URLs: []string{fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)}, - GZip: true, - } - assert.NoError(t, dw.Init()) + assert.NoError(t, dw.Init(WithURLs(fmt.Sprintf("%s?token=tkn_11111111111111111111", ts.URL)))) + require.NoError(t, dw.setupWAL()) - assert.NoError(t, dw.Write(WithCategory(point.Logging), WithPoints(pts))) + assert.NoError(t, dw.Write( + WithBodyCallback(func(w *writer, b *body) error { + return dw.doFlush(w, b) + }), + WithHTTPEncoding(dw.contentEncoding), + WithCategory(point.Logging), + WithPoints(pts))) t.Cleanup(func() { ts.Close() @@ -344,7 +434,7 @@ func TestWritePoints(t *T.T) { }) }) - t.Run("write.with.pb", func(t *T.T) { + t.Run("write-with-pb", func(t *T.T) { r := point.NewRander() origin := r.Rand(10) @@ -371,14 +461,22 @@ func TestWritePoints(t *T.T) { t.Logf("body size: %d/%d, pts: %d", len(body), len(x), len(pts)) })) - dw := &Dataway{ - URLs: []string{fmt.Sprintf("%s?token=tkn_some", ts.URL)}, - ContentEncoding: "protobuf", - GZip: true, - } - assert.NoError(t, dw.Init()) + dw := NewDefaultDataway() + dw.ContentEncoding = "protobuf" + dw.GZip = true + dw.WAL.Path = t.TempDir() + dw.MaxRetryCount = 1 + + assert.NoError(t, dw.Init(WithURLs(fmt.Sprintf("%s?token=tkn_some", ts.URL)))) + require.NoError(t, dw.setupWAL()) - assert.NoError(t, dw.Write(WithCategory(point.Logging), WithPoints(origin))) + assert.NoError(t, dw.Write( + WithBodyCallback(func(w *writer, b *body) error { + return dw.doFlush(w, b) + }), + WithHTTPEncoding(dw.contentEncoding), + WithCategory(point.Logging), + WithPoints(origin))) t.Cleanup(func() { ts.Close() metricsReset() @@ -386,7 +484,7 @@ func TestWritePoints(t *T.T) { }) }) - t.Run("write.with.large.pb", func(t *T.T) { + t.Run("write-with-large-pb", func(t *T.T) { var ( r = point.NewRander(point.WithRandText(3)) origin = r.Rand(1000) @@ -412,15 +510,22 @@ func TestWritePoints(t *T.T) { t.Logf("body size: %d/%d, pts: %d", len(body), len(x), len(pts)) })) - dw := &Dataway{ - URLs: []string{fmt.Sprintf("%s?token=tkn_some", ts.URL)}, - ContentEncoding: "protobuf", - MaxRawBodySize: 512 * 1024, - GZip: true, - } + dw := NewDefaultDataway() + dw.ContentEncoding = "protobuf" + dw.MaxRawBodySize = 512 * 1024 + dw.GZip = true + dw.WAL.Path = t.TempDir() + dw.MaxRetryCount = 1 - assert.NoError(t, dw.Init()) - assert.NoError(t, dw.Write(WithCategory(point.Logging), WithPoints(origin))) + assert.NoError(t, dw.Init(WithURLs(fmt.Sprintf("%s?token=tkn_some", ts.URL)))) + require.NoError(t, dw.setupWAL()) + assert.NoError(t, dw.Write( + WithBodyCallback(func(w *writer, b *body) error { + return dw.doFlush(w, b) + }), + WithHTTPEncoding(dw.contentEncoding), + WithCategory(point.Logging), + WithPoints(origin))) assert.Len(t, get, len(origin)) for idx, pt := range get { @@ -461,16 +566,24 @@ func TestWritePoints(t *T.T) { t.Logf("body size: %d/%d, pts: %d", len(body), len(x), len(pts)) })) - dw := &Dataway{ - URLs: []string{fmt.Sprintf("%s?token=tkn_some", ts.URL)}, - ContentEncoding: "protobuf", - EnableSinker: true, - GlobalCustomerKeys: []string{"tag1", "tag2"}, - GZip: true, - } - assert.NoError(t, dw.Init()) + dw := NewDefaultDataway() + dw.ContentEncoding = "protobuf" + dw.EnableSinker = true + dw.GlobalCustomerKeys = []string{"tag1", "tag2"} + dw.GZip = true + dw.WAL.Path = t.TempDir() + dw.MaxRetryCount = 1 - assert.NoError(t, dw.Write(WithPoints(pts), WithCategory(point.Logging))) + 
assert.NoError(t, dw.Init(WithURLs(fmt.Sprintf("%s?token=tkn_some", ts.URL)))) + require.NoError(t, dw.setupWAL()) + + assert.NoError(t, dw.Write( + WithBodyCallback(func(w *writer, b *body) error { + return dw.doFlush(w, b) + }), + WithHTTPEncoding(dw.contentEncoding), + WithPoints(pts), + WithCategory(point.Logging))) // len(pts) == 2, sinked into 2 requests according to the tags. assert.Equal(t, 2, requests) diff --git a/internal/io/failcache/cache.go b/internal/io/failcache/cache.go deleted file mode 100644 index 9b2ca30d1a..0000000000 --- a/internal/io/failcache/cache.go +++ /dev/null @@ -1,20 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the MIT License. -// This product includes software developed at Guance Cloud (https://www.guance.com/). -// Copyright 2021-present Guance, Inc. - -// Package failcache implements local cache for datakit. -package failcache - -import ( - "github.com/GuanceCloud/cliutils/diskcache" -) - -type Cache interface { - // NOTE: reuse callback in diskcache to keep interface ok - // it's better to define Get as - // Get() []byte - Get(diskcache.Fn) error - Put([]byte) error - Close() error -} diff --git a/internal/io/feed_aws_lambda.go b/internal/io/feed_aws_lambda.go index 6d6f051e46..713b7e96d5 100644 --- a/internal/io/feed_aws_lambda.go +++ b/internal/io/feed_aws_lambda.go @@ -44,7 +44,7 @@ func (a *awsLambdaOutput) flush() { if len(pts) == 0 { continue } - err := defIO.doFlush(pts, cat, nil) + err := defIO.doCompact(pts, cat) if err != nil { log.Warnf("post %d points to %s failed: %s, ignored", len(pts), cat, err) } diff --git a/internal/io/feed_debug.go b/internal/io/feed_debug.go index 38f9aec112..9fdd244f50 100644 --- a/internal/io/feed_debug.go +++ b/internal/io/feed_debug.go @@ -21,6 +21,7 @@ type debugOutput struct{} var _ FeederOutputer = new(debugOutput) func (fo *debugOutput) Reader(cat point.Category) <-chan *feedOption { + // not implemented return nil } diff --git a/internal/io/feed_dw.go b/internal/io/feed_dw.go index c55a266fae..0fa71a3e7b 100644 --- a/internal/io/feed_dw.go +++ b/internal/io/feed_dw.go @@ -61,11 +61,7 @@ func (fo *datawayOutput) Write(data *feedOption) error { if data.syncSend { defIO.recordPoints(data) - fc, ok := defIO.fcs[data.cat.String()] - if !ok { - log.Infof("IO local cache not set for %q", data.cat.String()) - } - err := defIO.doFlush(data.pts, data.cat, fc) + err := defIO.doCompact(data.pts, data.cat) if err != nil { log.Warnf("post %d points to %s failed: %s, ignored", len(data.pts), data.cat, err) } diff --git a/internal/io/io.go b/internal/io/io.go index 75770f8231..8b652c53d8 100644 --- a/internal/io/io.go +++ b/internal/io/io.go @@ -15,7 +15,6 @@ import ( "github.com/GuanceCloud/cliutils/point" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/datakit" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/dataway" - "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/failcache" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/filter" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/recorder" ) @@ -34,13 +33,8 @@ type dkIO struct { dw dataway.IDataway filters map[string]filter.FilterConditions - cacheSizeGB int - cacheCleanInterval time.Duration - enableCache, - cacheAll bool - withFilter, - withConsumer bool + withCompactor bool recorder *recorder.Recorder @@ -48,15 +42,10 @@ type dkIO struct { availableCPUs, flushWorkers int - maxCacheCount int + compactAt int - ////////////////////////// - // inner fields - 
////////////////////////// fo FeederOutputer - fcs map[string]failcache.Cache - lock sync.RWMutex } @@ -75,17 +64,11 @@ func Start(opts ...IOOption) { func getIO() *dkIO { x := &dkIO{ - cacheSizeGB: 1 * 1024 * 1024, - cacheCleanInterval: 30 * time.Second, - enableCache: false, - - withFilter: true, - withConsumer: true, + withFilter: true, + withCompactor: true, flushInterval: time.Second * 10, - maxCacheCount: 1024, - - fcs: map[string]failcache.Cache{}, + compactAt: 1024, lock: sync.RWMutex{}, } @@ -109,13 +92,13 @@ func (x *dkIO) start() { }) } - if x.withConsumer { - fn := func(cat point.Category, n int) { - log.Infof("start %d workers on %q", n, cat) - g := datakit.G("io/consumer/" + cat.Alias()) + if x.withCompactor { + compactorWorker := func(cat point.Category, n int) { + log.Infof("start %dth workers on %q", n, cat) + g := datakit.G("io/compactor/" + cat.Alias()) for i := 0; i < n; i++ { g.Go(func(_ context.Context) error { - x.runConsumer(cat) + x.runCompactor(cat) return nil }) } @@ -131,16 +114,17 @@ func (x *dkIO) start() { //nolint:exhaustive switch c { - case point.Metric, + case + point.Metric, point.Network, point.Logging, point.Tracing, point.RUM: - fn(c, nworker) + compactorWorker(c, nworker) flushWorkersVec.WithLabelValues(c.String()).Set(float64(nworker)) default: - fn(c, 1) + compactorWorker(c, 1) flushWorkersVec.WithLabelValues(c.String()).Set(1) } } diff --git a/internal/io/option.go b/internal/io/option.go index 5a3c8f5376..a93bfbf31f 100644 --- a/internal/io/option.go +++ b/internal/io/option.go @@ -6,12 +6,8 @@ package io import ( - "path/filepath" "time" - "github.com/GuanceCloud/cliutils/diskcache" - "github.com/GuanceCloud/cliutils/point" - "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/datakit" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/dataway" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/filter" "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/recorder" @@ -40,10 +36,10 @@ func WithRecorder(r *recorder.Recorder) IOOption { } } -// WithConsumer disble consumer on IO feed. -func WithConsumer(on bool) IOOption { +// WithCompactor disble consumer on IO feed. +func WithCompactor(on bool) IOOption { return func(x *dkIO) { - x.withConsumer = on + x.withCompactor = on } } @@ -70,64 +66,8 @@ func WithFilters(filters map[string]filter.FilterConditions) IOOption { } } -// WithDiskCacheCleanInterval used to control clean(retry-on-failed-data) -// interval of disk cache. -func WithDiskCacheCleanInterval(du time.Duration) IOOption { - return func(x *dkIO) { - if int64(du) > 0 { - x.cacheCleanInterval = du - } - } -} - -// WithDiskCacheSize used to set max disk cache(in GB bytes). -func WithDiskCacheSize(gb int) IOOption { - return func(x *dkIO) { - if gb > 0 { - x.cacheSizeGB = gb - } - } -} - -// WithCacheAll will cache all categories. -// By default, metric(M), object(CO/O) and dial-testing data point not cached. -func WithCacheAll(on bool) IOOption { - return func(x *dkIO) { - x.cacheAll = on - } -} - -// WithDiskCache used to set/unset disk cache on failed data. 
-func WithDiskCache(on bool) IOOption { - return func(x *dkIO) { - x.enableCache = on - if !on { - log.Infof("io diskcache not set") - return - } - - for _, c := range point.AllCategories() { - p := filepath.Join(datakit.CacheDir, c.String()) - capacity := int64(x.cacheSizeGB * 1024 * 1024 * 1024) - - cache, err := diskcache.Open( - diskcache.WithPath(p), - diskcache.WithCapacity(capacity), - diskcache.WithWakeup(30*time.Second), // to disable generate too many files under cache - ) - if err != nil { - log.Warnf("NewWALCache to %s with capacity %d: %s", p, capacity, err.Error()) - continue - } else { - log.Infof("diskcache.New ok on category %q on path %q, cap %d", c.String(), p, capacity) - x.fcs[c.String()] = cache - } - } - } -} - -// WithFlushWorkers set IO flush workers. -func WithFlushWorkers(n int) IOOption { +// WithCompactWorkers set IO flush workers. +func WithCompactWorkers(n int) IOOption { return func(x *dkIO) { if n > 0 { x.flushWorkers = n @@ -135,8 +75,8 @@ func WithFlushWorkers(n int) IOOption { } } -// WithFlushInterval used to contol when to flush cached data. -func WithFlushInterval(d time.Duration) IOOption { +// WithCompactInterval used to contol when to flush cached data. +func WithCompactInterval(d time.Duration) IOOption { return func(x *dkIO) { if int64(d) > 0 { x.flushInterval = d @@ -144,13 +84,13 @@ func WithFlushInterval(d time.Duration) IOOption { } } -// WithMaxCacheCount used to set max cache size. +// WithCompactAt used to set max cache size. // The count used to control when to send the cached data. -func WithMaxCacheCount(count int) IOOption { +func WithCompactAt(count int) IOOption { return func(x *dkIO) { if count > 0 { log.Debugf("set max cache count to %d", count) - x.maxCacheCount = count + x.compactAt = count } } } diff --git a/internal/io/oss.go b/internal/io/oss.go deleted file mode 100644 index d479ea390b..0000000000 --- a/internal/io/oss.go +++ /dev/null @@ -1,75 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the MIT License. -// This product includes software developed at Guance Cloud (https://www.guance.com/). -// Copyright 2021-present Guance, Inc. 
- -package io - -import ( - "fmt" - "io" - "net/http" - - "github.com/aliyun/aliyun-oss-go-sdk/oss" -) - -type OSSClient struct { - EndPoint string `toml:"endpoint"` - AccessKeyID string `toml:"access_key_id"` - AccessKeySecret string `toml:"access_key_secret"` - BucketName string `toml:"bucket_name"` - DomainName string `toml:"domain_name,omitempty"` - - Cli *oss.Client -} - -func NewOSSClient(endpoint, ak, sk, bucket string) (*OSSClient, error) { - oc := &OSSClient{ - EndPoint: endpoint, - AccessKeyID: ak, - AccessKeySecret: sk, - BucketName: bucket, - } - cli, err := oc.GetOSSCli() - if err != nil { - return nil, err - } - oc.Cli = cli - return oc, nil -} - -func (oc *OSSClient) GetOSSCli() (*oss.Client, error) { - cli, err := oss.New(oc.EndPoint, oc.AccessKeyID, oc.AccessKeySecret) - if err != nil { - return nil, err - } - return cli, nil -} - -func (oc *OSSClient) OSSUPLoad(objectName string, reader io.Reader) error { - bucket, err := oc.Cli.Bucket(oc.BucketName) - if err != nil { - return err - } - err = bucket.PutObject(objectName, reader) - return err -} - -func (oc *OSSClient) GetOSSUrl(remotePath string) string { - if oc.DomainName == "" { - return fmt.Sprintf("https://%s.%s/%s", oc.BucketName, oc.EndPoint, remotePath) - } - return fmt.Sprintf("https://%s/%s", oc.DomainName, remotePath) -} - -func (oc *OSSClient) ObjectExist(remotePath string) (http.Header, error) { - bucket, err := oc.Cli.Bucket(oc.BucketName) - if err != nil { - return nil, err - } - header, err := bucket.GetObjectMeta(remotePath) - if err != nil { - return nil, err - } - return header, nil -} diff --git a/internal/io/toml.go b/internal/io/toml.go index 63a24c58f8..a8734d005c 100644 --- a/internal/io/toml.go +++ b/internal/io/toml.go @@ -5,7 +5,11 @@ package io -import "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/filter" +import ( + "time" + + "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/io/filter" +) // IOConf configure io module in datakit.conf. 
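+// The old disk-cache knobs (enable_cache, cache_all, cache_max_size_gb, cache_clean_interval) are
+// dropped; flush_interval/flush_workers now map to CompactInterval (time.Duration) and CompactWorkers.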
type IOConf struct { @@ -15,13 +19,8 @@ type IOConf struct { MaxCacheCount int `toml:"max_cache_count"` MaxDynamicCacheCountDeprecated int `toml:"max_dynamic_cache_count,omitzero"` - FlushInterval string `toml:"flush_interval"` - FlushWorkers int `toml:"flush_workers"` - - EnableCache bool `toml:"enable_cache"` - CacheAll bool `toml:"cache_all"` - CacheSizeGB int `toml:"cache_max_size_gb"` - CacheCleanInterval string `toml:"cache_clean_interval"` + CompactInterval time.Duration `toml:"flush_interval"` + CompactWorkers int `toml:"flush_workers"` Filters map[string]filter.FilterConditions `toml:"filters"` } diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 8aec8f0655..a311b033a0 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -41,13 +41,11 @@ func init() { type runtimeInfo struct { goroutines int - heapAlloc uint64 - sys uint64 - cpuUsage float64 - gcPauseTotal uint64 - gcNum uint32 - memStats *process.MemoryInfoStat + cpuUsage float64 + + goMemStats *runtime.MemStats + osMemStats *process.MemoryInfoStat ioCountersstats *process.IOCountersStat numCtxSwitch *process.NumCtxSwitchesStat @@ -62,20 +60,19 @@ func getRuntimeInfo() *runtimeInfo { usage = u } - ms, _ := resourcelimit.MyMemStat() - - return &runtimeInfo{ - goroutines: runtime.NumGoroutine(), - heapAlloc: m.HeapAlloc, - sys: m.Sys, - cpuUsage: usage, - memStats: ms, - - gcPauseTotal: m.PauseTotalNs, - gcNum: m.NumGC, + ri := &runtimeInfo{ + goroutines: runtime.NumGoroutine(), + goMemStats: &m, + cpuUsage: usage, ioCountersstats: resourcelimit.MyIOCountersStat(), numCtxSwitch: resourcelimit.MyCtxSwitch(), } + + if ms, err := resourcelimit.MyMemStat(); err == nil { + ri.osMemStats = ms + } + + return ri } // collector for basic runtime info. 
@@ -92,16 +89,23 @@ var ( riHeapAllocDesc = p8s.NewDesc( "datakit_heap_alloc_bytes", - "Datakit memory heap bytes", + "Datakit memory heap bytes(Deprecated by `datakit_golang_mem_usage`)", nil, nil, ) riSysAllocDesc = p8s.NewDesc( "datakit_sys_alloc_bytes", - "Datakit memory system bytes", + "Datakit memory system bytes(Deprecated by `datakit_golang_mem_usage`)", + nil, nil, ) + riGolangMemDesc = p8s.NewDesc( + "datakit_golang_mem_usage", + "Datakit golang memory usage stats", + []string{"type"}, nil, + ) + riMemStatDesc = p8s.NewDesc( "datakit_mem_stat", "Datakit memory system bytes", @@ -199,18 +203,38 @@ func (rc runtimeInfoCollector) Describe(ch chan<- *p8s.Desc) { func (rc runtimeInfoCollector) Collect(ch chan<- p8s.Metric) { ri := getRuntimeInfo() - ch <- p8s.MustNewConstSummary(riGCPauseDesc, uint64(ri.gcNum), float64(ri.gcPauseTotal)/float64(time.Second), nil) ch <- p8s.MustNewConstMetric(riGoroutineDesc, p8s.GaugeValue, float64(ri.goroutines)) - ch <- p8s.MustNewConstMetric(riHeapAllocDesc, p8s.GaugeValue, float64(ri.heapAlloc)) - ch <- p8s.MustNewConstMetric(riSysAllocDesc, p8s.GaugeValue, float64(ri.sys)) - - if ri.memStats != nil { - ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.memStats.RSS), "rss") - ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.memStats.VMS), "vms") - ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.memStats.HWM), "hwm") - ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.memStats.Data), "data") - ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.memStats.Stack), "stack") - ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.memStats.Locked), "locked") + + if ri.osMemStats != nil { + ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.osMemStats.RSS), "rss") + ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.osMemStats.VMS), "vms") + ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.osMemStats.HWM), "hwm") + ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.osMemStats.Data), "data") + ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.osMemStats.Stack), "stack") + ch <- p8s.MustNewConstMetric(riMemStatDesc, p8s.GaugeValue, float64(ri.osMemStats.Locked), "locked") + } + + if ri.goMemStats != nil { + ch <- p8s.MustNewConstSummary(riGCPauseDesc, uint64(ri.goMemStats.NumGC), float64(ri.goMemStats.PauseTotalNs)/float64(time.Second), nil) + + // the 2 deprecated by following `heap_alloc' and `total' + ch <- p8s.MustNewConstMetric(riHeapAllocDesc, p8s.GaugeValue, float64(ri.goMemStats.HeapAlloc)) + ch <- p8s.MustNewConstMetric(riSysAllocDesc, p8s.GaugeValue, float64(ri.goMemStats.Sys-ri.goMemStats.HeapReleased)) + + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.Sys-ri.goMemStats.HeapReleased), "total") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.HeapAlloc), "heap_alloc") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.HeapInuse-ri.goMemStats.HeapAlloc), "heap_unused") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.HeapIdle-ri.goMemStats.HeapReleased), "heap_free") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.HeapReleased), "heap_released") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.StackInuse), 
"goroutine_stack") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.StackSys-ri.goMemStats.StackInuse), "thread_stack") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.GCSys), "gc") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.MSpanInuse), "mspan_inuse") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.MSpanSys-ri.goMemStats.MSpanInuse), "mspan_free") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.MCacheInuse), "mcache_inuse") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.MCacheSys-ri.goMemStats.MCacheInuse), "mcache_free") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.OtherSys), "other") + ch <- p8s.MustNewConstMetric(riGolangMemDesc, p8s.GaugeValue, float64(ri.goMemStats.BuckHashSys), "buckets") } ch <- p8s.MustNewConstMetric(riCPUUsageDesc, p8s.GaugeValue, ri.cpuUsage) diff --git a/internal/monitor/app.go b/internal/monitor/app.go index fb323ec8e0..d842b5871e 100644 --- a/internal/monitor/app.go +++ b/internal/monitor/app.go @@ -25,11 +25,12 @@ var ( inputsFeedCols = strings.Split(`Input|Cat|Feeds|P90Lat|P90Pts|Filtered|LastFeed|AvgCost|Errors`, "|") plStatsCols = strings.Split("Script|Cat|Namespace|TotalPts|DropPts|ErrPts|PLUpdate|AvgCost", "|") + walStatsCols = strings.Split("Cat|Points(mem/disk/drop/total)", "|") enabledInputCols = strings.Split(`Input|Count|Crashed`, "|") goroutineCols = strings.Split(`Name|Running|Done|TotalCost`, "|") httpAPIStatCols = strings.Split(`API|Status|Total|Latency|BodySize`, "|") filterRuleCols = strings.Split("Cat|Total|Filtered(%)|Cost", "|") - ioStatCols = strings.Split(`Cat|ChanUsage|Points(ok/total)|Bytes(ok/total/gz)`, "|") + dwptsStatCols = strings.Split(`Cat|Points(ok/total)|Bytes(ok/total/gz)`, "|") dwCols = strings.Split(`API|Status|Count|Latency|Retry`, "|") moduleGoroutine = []string{"G", "goroutine"} @@ -41,22 +42,26 @@ var ( modulePipeline = []string{"P", "pipeline"} moduleIO = []string{"IO", "io_stats"} moduleDataway = []string{"W", "dataway"} + moduleWAL = []string{"WAL", "wal"} + + labelCategory = "category" + labelName = "name" ) type monitorAPP struct { app *tview.Application // UI elements - basicInfoTable *tview.Table - golangRuntime *tview.Table - inputsStatTable *tview.Table - plStatTable *tview.Table - enabledInputTable *tview.Table - goroutineStatTable *tview.Table - httpServerStatTable *tview.Table - ioStatTable *tview.Table - dwTable *tview.Table - + basicInfoTable *tview.Table + golangRuntime *tview.Table + inputsStatTable *tview.Table + plStatTable *tview.Table + walStatTable *tview.Table + enabledInputTable *tview.Table + goroutineStatTable *tview.Table + httpServerStatTable *tview.Table + dwTable *tview.Table + dwptsTable *tview.Table filterStatsTable *tview.Table filterRulesStatsTable *tview.Table diff --git a/internal/monitor/flex.go b/internal/monitor/flex.go index 296ee7ea1f..326a6893cb 100644 --- a/internal/monitor/flex.go +++ b/internal/monitor/flex.go @@ -28,10 +28,14 @@ func (app *monitorAPP) setupFlex() { AddItem(app.filterStatsTable, 0, 2, false). // filter stats AddItem(app.filterRulesStatsTable, 0, 8, false), // filter rules stats 0, 10, false). - AddItem(app.plStatTable, 0, 15, false). AddItem(tview.NewFlex().SetDirection(tview.FlexColumn). - AddItem(app.ioStatTable, 0, 10, false). - AddItem(app.dwTable, 0, 10, false), 0, 10, false). 
+ AddItem(app.plStatTable, 0, 10, false). // pipeline stats + AddItem(app.walStatTable, 0, 10, false), // WAL stats + 0, 10, false). + AddItem(tview.NewFlex().SetDirection(tview.FlexColumn). + AddItem(app.dwptsTable, 0, 10, false). + AddItem(app.dwTable, 0, 10, false), + 0, 10, false). AddItem(app.anyErrorPrompt, 0, 1, false). AddItem(app.exitPrompt, 0, 1, false) return @@ -70,13 +74,17 @@ func (app *monitorAPP) setupFlex() { } if exitsStr(app.onlyModules, moduleIO) { - flex.AddItem(tview.NewFlex().SetDirection(tview.FlexColumn).AddItem(app.ioStatTable, 0, 10, false), 0, 10, false) + flex.AddItem(tview.NewFlex().SetDirection(tview.FlexColumn).AddItem(app.dwptsTable, 0, 10, false), 0, 10, false) } if exitsStr(app.onlyModules, moduleDataway) { flex.AddItem(tview.NewFlex().SetDirection(tview.FlexColumn).AddItem(app.dwTable, 0, 10, false), 0, 10, false) } + if exitsStr(app.onlyModules, moduleWAL) { + flex.AddItem(tview.NewFlex().SetDirection(tview.FlexColumn).AddItem(app.walStatTable, 0, 10, false), 0, 10, false) + } + flex.AddItem(app.anyErrorPrompt, 0, 1, false).AddItem(app.exitPrompt, 0, 1, false) return diff --git a/internal/monitor/render.go b/internal/monitor/render.go index a4b77672fb..cd3ca170e8 100644 --- a/internal/monitor/render.go +++ b/internal/monitor/render.go @@ -23,24 +23,25 @@ func (app *monitorAPP) render() { app.inputsStatTable.Clear() app.enabledInputTable.Clear() app.plStatTable.Clear() + app.walStatTable.Clear() app.goroutineStatTable.Clear() - app.ioStatTable.Clear() + app.dwptsTable.Clear() app.dwTable.Clear() + app.httpServerStatTable.Clear() app.filterStatsTable.Clear() app.filterRulesStatsTable.Clear() app.renderBasicInfoTable(app.mfs) app.renderGolangRuntimeTable(app.mfs) app.renderEnabledInputTable(app.mfs, enabledInputCols) - app.renderInputsFeedTable(app.mfs, inputsFeedCols) app.renderGoroutineTable(app.mfs, goroutineCols) app.renderHTTPStatTable(app.mfs, httpAPIStatCols) - app.renderFilterStatsTable(app.mfs) app.renderFilterRulesStatsTable(app.mfs, filterRuleCols) app.renderPLStatTable(app.mfs, plStatsCols) - app.renderIOTable(app.mfs, ioStatCols) + app.renderWALStatTable(app.mfs, walStatsCols) + app.renderDWPointsTable(app.mfs, dwptsStatCols) app.renderDatawayTable(app.mfs, dwCols) end: diff --git a/internal/monitor/setup.go b/internal/monitor/setup.go index c4b5bb7307..c5617441cf 100644 --- a/internal/monitor/setup.go +++ b/internal/monitor/setup.go @@ -20,39 +20,96 @@ func (app *monitorAPP) setup() { app.golangRuntime.SetBorder(true).SetTitle("[red]R[white]untime Info").SetTitleAlign(tview.AlignLeft) // inputs running stats - app.inputsStatTable = tview.NewTable().SetFixed(1, 1).SetSelectable(true, false).SetBorders(false).SetSeparator(tview.Borders.Vertical) + app.inputsStatTable = tview.NewTable(). + SetFixed(1, 1). + SetSelectable(true, false). + SetBorders(false). + SetSeparator(tview.Borders.Vertical) app.inputsStatTable.SetBorder(true).SetTitle("[red]In[white]puts Info").SetTitleAlign(tview.AlignLeft) // pipeline running stats - app.plStatTable = tview.NewTable().SetFixed(1, 1).SetSelectable(true, false).SetBorders(false).SetSeparator(tview.Borders.Vertical) - app.plStatTable.SetBorder(true).SetTitle("[red]P[white]ipeline Info").SetTitleAlign(tview.AlignLeft) + app.plStatTable = tview.NewTable(). + SetFixed(1, 1). + SetSelectable(true, false). + SetBorders(false). + SetSeparator(tview.Borders.Vertical) + app.plStatTable. + SetBorder(true). + SetTitle("[red]P[white]ipeline Info"). 
+ SetTitleAlign(tview.AlignLeft) + + // WAL running stats + app.walStatTable = tview.NewTable(). + SetFixed(1, 1). + SetSelectable(true, false). + SetBorders(false). + SetSeparator(tview.Borders.Vertical) + app.walStatTable. + SetBorder(true). + SetTitle("[red]WAL[white] Info"). + SetTitleAlign(tview.AlignLeft) // enabled inputs app.enabledInputTable = tview.NewTable().SetFixed(1, 1).SetSelectable(true, false).SetBorders(false) - app.enabledInputTable.SetBorder(true).SetTitle("Enabled [red]In[white]puts").SetTitleAlign(tview.AlignLeft) + app.enabledInputTable. + SetBorder(true). + SetTitle("Enabled [red]In[white]puts"). + SetTitleAlign(tview.AlignLeft) // goroutine stats - app.goroutineStatTable = tview.NewTable().SetFixed(1, 1).SetSelectable(true, false).SetBorders(false).SetSeparator(tview.Borders.Vertical) - app.goroutineStatTable.SetBorder(true).SetTitle("[red]G[white]oroutine Groups").SetTitleAlign(tview.AlignLeft) + app.goroutineStatTable = tview.NewTable(). + SetFixed(1, 1). + SetSelectable(true, false). + SetBorders(false). + SetSeparator(tview.Borders.Vertical) + app.goroutineStatTable. + SetBorder(true). + SetTitle("[red]G[white]oroutine Groups"). + SetTitleAlign(tview.AlignLeft) // 9592 http stats - app.httpServerStatTable = tview.NewTable().SetFixed(1, 1).SetSelectable(true, false).SetBorders(false).SetSeparator(tview.Borders.Vertical) - app.httpServerStatTable.SetBorder(true).SetTitle("[red]H[white]TTP APIs").SetTitleAlign(tview.AlignLeft) - - // IO stats - app.ioStatTable = tview.NewTable().SetFixed(1, 1).SetSelectable(true, false).SetBorders(false).SetSeparator(tview.Borders.Vertical) - app.ioStatTable.SetBorder(true).SetTitle("[red]IO[white] Info").SetTitleAlign(tview.AlignLeft) + app.httpServerStatTable = tview.NewTable(). + SetFixed(1, 1). + SetSelectable(true, false). + SetBorders(false). + SetSeparator(tview.Borders.Vertical) + app.httpServerStatTable. + SetBorder(true). + SetTitle("[red]H[white]TTP APIs"). + SetTitleAlign(tview.AlignLeft) + + // dataway points stats + app.dwptsTable = tview.NewTable(). + SetFixed(1, 1). + SetSelectable(true, false). + SetBorders(false). + SetSeparator(tview.Borders.Vertical) + app.dwptsTable. + SetBorder(true). + SetTitle("[red]IO[white] Info"). + SetTitleAlign(tview.AlignLeft) // dataway stats - app.dwTable = tview.NewTable().SetFixed(1, 1).SetSelectable(true, false).SetBorders(false).SetSeparator(tview.Borders.Vertical) + app.dwTable = tview.NewTable(). + SetFixed(1, 1). + SetSelectable(true, false). + SetBorders(false). + SetSeparator(tview.Borders.Vertical) app.dwTable.SetBorder(true).SetTitle("Data[red]W[white]ay APIs").SetTitleAlign(tview.AlignLeft) // filter stats app.filterStatsTable = tview.NewTable().SetFixed(1, 1).SetSelectable(true, false).SetBorders(false) app.filterStatsTable.SetBorder(true).SetTitle("[red]F[white]ilter").SetTitleAlign(tview.AlignLeft) - app.filterRulesStatsTable = tview.NewTable().SetFixed(1, 1).SetSelectable(true, false).SetBorders(false).SetSeparator(tview.Borders.Vertical) - app.filterRulesStatsTable.SetBorder(true).SetTitle("[red]F[white]ilter Rules").SetTitleAlign(tview.AlignLeft) + app.filterRulesStatsTable = tview.NewTable(). + SetFixed(1, 1). + SetSelectable(true, false). + SetBorders(false). + SetSeparator(tview.Borders.Vertical) + app.filterRulesStatsTable. + SetBorder(true). + SetTitle("[red]F[white]ilter Rules"). 
+ SetTitleAlign(tview.AlignLeft) // bottom prompt app.exitPrompt = tview.NewTextView().SetDynamicColors(true) diff --git a/internal/monitor/view_dwpts.go b/internal/monitor/view_dwpts.go new file mode 100644 index 0000000000..181108b0f7 --- /dev/null +++ b/internal/monitor/view_dwpts.go @@ -0,0 +1,117 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the MIT License. +// This product includes software developed at Guance Cloud (https://www.guance.com/). +// Copyright 2021-present Guance, Inc. + +package monitor + +import ( + "fmt" + "net/http" + + "github.com/GuanceCloud/cliutils/point" + "github.com/gdamore/tcell/v2" + dto "github.com/prometheus/client_model/go" + "github.com/rivo/tview" + "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/strarr" +) + +func (app *monitorAPP) renderDWPointsTable(mfs map[string]*dto.MetricFamily, colArr []string) { + table := app.dwptsTable + + if app.anyError != nil { + return + } + + if mfs == nil { + table.SetTitle("Point [red]U[white]pload Info(no data collected)") + return + } + + table.SetTitle("Point [red]U[white]pload Info") + + // set table header + for idx := range colArr { + table.SetCell(0, idx, tview.NewTableCell(colArr[idx]). + SetMaxWidth(app.maxTableWidth). + SetTextColor(tcell.ColorGreen).SetAlign(tview.AlignRight)) + } + + dwPtsTotal := mfs["datakit_io_dataway_point_total"] + dwBytesTotal := mfs["datakit_io_dataway_point_bytes_total"] + + if dwPtsTotal == nil { + return + } + + var ( + ptsTotal, + ptsOK float64 + row = 1 + cats = []string{} + ) + + for _, m := range dwPtsTotal.Metric { + for _, lp := range m.GetLabel() { + val := lp.GetValue() + if lp.GetName() == labelCategory && !strarr.Contains(cats, val) { + cats = append(cats, val) + break + } + } + } + + for _, cat := range cats { + table.SetCell(row, + 0, + tview.NewTableCell(point.CatString(cat).Alias()). + SetMaxWidth(app.maxTableWidth). + SetAlign(tview.AlignRight)) + + // ok points + if x := metricWithLabel(dwPtsTotal, + point.CatString(cat).String(), http.StatusText(http.StatusOK)); x != nil { + ptsOK = x.GetCounter().GetValue() + } + + // total points + if x := metricWithLabel(dwPtsTotal, + point.CatString(cat).String(), "total"); x != nil { + ptsTotal = x.GetCounter().GetValue() + } + + // only show ok points and total points. + table.SetCell(row, 1, + tview.NewTableCell(fmt.Sprintf("%s/%s", + number(ptsOK), number(ptsTotal))). + SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignRight)) + + if dwBytesTotal != nil { + var bytesOk, bytesTotal, bytesGzipTotal float64 + if x := metricWithLabel(dwBytesTotal, + point.CatString(cat).String(), "raw", http.StatusText(http.StatusOK)); x != nil { + bytesOk = x.GetCounter().GetValue() + } + + // total raw bytes + if x := metricWithLabel(dwBytesTotal, + point.CatString(cat).String(), "raw", "total"); x != nil { + bytesTotal = x.GetCounter().GetValue() + } + + // total gzip bytes + if x := metricWithLabel(dwBytesTotal, + point.CatString(cat).String(), "gzip", "total"); x != nil { + bytesGzipTotal = x.GetCounter().GetValue() + } + + // only show ok points and total points. + table.SetCell(row, 2, + tview.NewTableCell(fmt.Sprintf("%s/%s(%s)", + number(bytesOk), number(bytesTotal), number(bytesGzipTotal))). 
+ SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignRight)) + } + + row++ + } +} diff --git a/internal/monitor/view_feed.go b/internal/monitor/view_feed.go index 41407b20bf..25c943a8c7 100644 --- a/internal/monitor/view_feed.go +++ b/internal/monitor/view_feed.go @@ -81,10 +81,10 @@ func (app *monitorAPP) renderInputsFeedTable(mfs map[string]*dto.MetricFamily, c val := lp.GetValue() switch lp.GetName() { - case "name": + case labelName: inputName = val - case "category": //nolint:goconst + case labelCategory: //nolint:goconst cat = val default: @@ -114,10 +114,10 @@ func (app *monitorAPP) renderInputsFeedTable(mfs map[string]*dto.MetricFamily, c col++ // P90Lat - feedSum := metricWithLabel(feedCost, cat, inputName).GetSummary() + feedSum := metricWithLabel(feedCost, cat, inputName) feedLat := "-" if feedSum != nil { - q := feedSum.GetQuantile()[1] // p90 + q := feedSum.GetSummary().GetQuantile()[1] // p90 if v := q.GetValue(); math.IsNaN(v) { feedLat = "NaN" } else { diff --git a/internal/monitor/view_filter.go b/internal/monitor/view_filter.go index 10f6498412..e37a96b883 100644 --- a/internal/monitor/view_filter.go +++ b/internal/monitor/view_filter.go @@ -57,7 +57,7 @@ func (app *monitorAPP) renderFilterRulesStatsTable(mfs map[string]*dto.MetricFam filter = val case "source": source = val - case "category": + case labelCategory: cat = val table.SetCell(row, 0, tview.NewTableCell(point.CatString(val).Alias()). SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignCenter)) diff --git a/internal/monitor/view_io.go b/internal/monitor/view_io.go deleted file mode 100644 index 3136908225..0000000000 --- a/internal/monitor/view_io.go +++ /dev/null @@ -1,134 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the MIT License. -// This product includes software developed at Guance Cloud (https://www.guance.com/). -// Copyright 2021-present Guance, Inc. - -package monitor - -import ( - "fmt" - "net/http" - - "github.com/GuanceCloud/cliutils/point" - "github.com/gdamore/tcell/v2" - dto "github.com/prometheus/client_model/go" - "github.com/rivo/tview" -) - -func (app *monitorAPP) renderIOTable(mfs map[string]*dto.MetricFamily, colArr []string) { - table := app.ioStatTable - - if app.anyError != nil { - return - } - - if mfs == nil { - table.SetTitle("[red]IO[white] Info(no data collected)") - return - } - - table.SetTitle("[red]IO[white] Info") - - // set table header - for idx := range colArr { - table.SetCell(0, idx, tview.NewTableCell(colArr[idx]). - SetMaxWidth(app.maxTableWidth). - SetTextColor(tcell.ColorGreen).SetAlign(tview.AlignRight)) - } - - chanCap := mfs["datakit_io_chan_capacity"] - chanUsage := mfs["datakit_io_chan_usage"] - dwPtsTotal := mfs["datakit_io_dataway_point_total"] - dwBytesTotal := mfs["datakit_io_dataway_point_bytes_total"] - - if chanUsage == nil { - return - } - - row := 1 - for _, m := range chanUsage.Metric { - lps := m.GetLabel() - - var ( - cat string - - used, - capacity int64 - - ptsTotal, ptsOK float64 - ) - - for _, lp := range lps { - val := lp.GetValue() - if lp.GetName() == "category" { - cat = val - - table.SetCell(row, 0, tview.NewTableCell(point.CatString(cat).Alias()). 
- SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignRight)) - used = int64(m.GetGauge().GetValue()) - } - } - - if chanCap != nil { - x := metricWithLabel(chanCap, "all-the-same") - if x == nil { - capacity = 0 // should not been here - } else { - capacity = int64(x.GetGauge().GetValue()) - } - - table.SetCell(row, 1, tview.NewTableCell(fmt.Sprintf("%d/%d", used, capacity)). - SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignRight)) - } - - if dwPtsTotal != nil { - if x := metricWithLabel(dwPtsTotal, - point.CatString(cat).String(), http.StatusText(http.StatusOK)); x != nil { - ptsOK = x.GetCounter().GetValue() - } - - if x := metricWithLabel(dwPtsTotal, - point.CatString(cat).String(), "total"); x != nil { - ptsTotal = x.GetCounter().GetValue() - } - - // only show ok points and total points. - table.SetCell(row, 2, - tview.NewTableCell(fmt.Sprintf("%s/%s", - number(ptsOK), number(ptsTotal))). - SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignRight)) - - // For failed points, there maybe more reasons(more tags), so do not - // show here, we can see them via /metrics API. - } - - if dwBytesTotal != nil { - var bytesOk, bytesTotal, bytesGzipTotal float64 - if x := metricWithLabel(dwBytesTotal, - point.CatString(cat).String(), "raw", http.StatusText(http.StatusOK)); x != nil { - bytesOk = x.GetCounter().GetValue() - } - - if x := metricWithLabel(dwBytesTotal, - point.CatString(cat).String(), "raw", "total"); x != nil { - bytesTotal = x.GetCounter().GetValue() - } - - if x := metricWithLabel(dwBytesTotal, - point.CatString(cat).String(), "gzip", "total"); x != nil { - bytesGzipTotal = x.GetCounter().GetValue() - } - - // only show ok points and total points. - table.SetCell(row, 3, - tview.NewTableCell(fmt.Sprintf("%s/%s(%s)", - number(bytesOk), number(bytesTotal), number(bytesGzipTotal))). - SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignRight)) - - // For failed points, there maybe more reasons(more tags), so do not - // show here, we can see them via /metrics API. - } - - row++ - } -} diff --git a/internal/monitor/view_pl.go b/internal/monitor/view_pl.go index f05cdee4ba..230050b81e 100644 --- a/internal/monitor/view_pl.go +++ b/internal/monitor/view_pl.go @@ -50,12 +50,12 @@ func (app *monitorAPP) renderPLStatTable(mfs map[string]*dto.MetricFamily, colAr for _, lp := range lps { val := lp.GetValue() switch lp.GetName() { - case "name": + case labelName: name = val table.SetCell(row, 0, tview.NewTableCell(val). SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignRight)) - case "category": + case labelCategory: cat = val table.SetCell(row, 1, tview.NewTableCell(point.CatString(val).Alias()). 
diff --git a/internal/monitor/view_runtime.go b/internal/monitor/view_runtime.go index 2a89c50a73..7085a13971 100644 --- a/internal/monitor/view_runtime.go +++ b/internal/monitor/view_runtime.go @@ -22,9 +22,10 @@ func (app *monitorAPP) renderGolangRuntimeTable(mfs map[string]*dto.MetricFamily } goroutines := app.mfs["datakit_goroutines"] - heapAlloc := app.mfs["datakit_heap_alloc_bytes"] - sysAlloc := app.mfs["datakit_sys_alloc_bytes"] - memStat := app.mfs["datakit_mem_stat"] + + osMemStat := app.mfs["datakit_mem_stat"] + goMemStat := app.mfs["datakit_golang_mem_usage"] + cpuUsage := app.mfs["datakit_cpu_usage"] gcSummary := app.mfs["datakit_gc_summary_seconds"] openFiles := app.mfs["datakit_open_files"] @@ -39,26 +40,31 @@ func (app *monitorAPP) renderGolangRuntimeTable(mfs map[string]*dto.MetricFamily row++ } - var heap, sysMem float64 - if heapAlloc != nil && len(heapAlloc.Metric) == 1 { - m := heapAlloc.Metric[0] - heap = m.GetGauge().GetValue() - } + var heap, memTotal float64 // golang runtime heap and total memory + for _, m := range goMemStat.Metric { + lps := m.GetLabel() + if len(lps) != 1 { + continue + } - if sysAlloc != nil && len(sysAlloc.Metric) == 1 { - m := sysAlloc.Metric[0] - sysMem = m.GetGauge().GetValue() + switch lps[0].GetValue() { + case "total": + memTotal = m.GetGauge().GetValue() + case "heap_alloc": + heap = m.GetGauge().GetValue() + default: // pass + } } table.SetCell(row, 0, - tview.NewTableCell("Sys/Heap").SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignRight)) + tview.NewTableCell("Total/Heap").SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignRight)) table.SetCell(row, 1, - tview.NewTableCell(fmt.Sprintf("%s/%s", number(sysMem), number(heap))). + tview.NewTableCell(fmt.Sprintf("%s/%s", number(memTotal), number(heap))). SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignLeft)) row++ - var rss, vms float64 - for _, m := range memStat.Metric { + var rss, vms float64 // OS RSS and VMS memory + for _, m := range osMemStat.Metric { lps := m.GetLabel() if len(lps) != 1 { continue diff --git a/internal/monitor/view_wal.go b/internal/monitor/view_wal.go new file mode 100644 index 0000000000..ab54b487f7 --- /dev/null +++ b/internal/monitor/view_wal.go @@ -0,0 +1,92 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the MIT License. +// This product includes software developed at Guance Cloud (https://www.guance.com/). +// Copyright 2021-present Guance, Inc. + +package monitor + +import ( + "fmt" + + "github.com/gdamore/tcell/v2" + dto "github.com/prometheus/client_model/go" + "github.com/rivo/tview" + "gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/strarr" +) + +func (app *monitorAPP) renderWALStatTable(mfs map[string]*dto.MetricFamily, colArr []string) { + table := app.walStatTable + + if app.anyError != nil { + return + } + + if mfs == nil { + table.SetTitle("[red]WAL[white] Info(no data collected)") + return + } + + table.SetTitle("[red]WAL[white] Info") + + // set table header + for idx := range colArr { + table.SetCell(0, idx, tview.NewTableCell(colArr[idx]). + SetMaxWidth(app.maxTableWidth). 
+ SetTextColor(tcell.ColorGreen).SetAlign(tview.AlignRight)) + } + + ptsTotal := mfs["datakit_io_wal_point_total"] + + if ptsTotal == nil { + return + } + + var ( + row = 1 + cats []string + ) + + for _, m := range ptsTotal.Metric { + for _, lp := range m.GetLabel() { + val := lp.GetValue() + if lp.GetName() == labelCategory && !strarr.Contains(cats, val) { + cats = append(cats, val) + break + } + } + } + + for _, cat := range cats { + var total, + mem, + disk, + drop float64 + + table.SetCell(row, + 0, + tview.NewTableCell(cat). + SetMaxWidth(app.maxTableWidth). + SetAlign(tview.AlignRight)) + if x := metricWithLabel(ptsTotal, cat, "M"); x != nil { + mem = x.GetCounter().GetValue() + } + + if x := metricWithLabel(ptsTotal, cat, "D"); x != nil { + disk = x.GetCounter().GetValue() + } + + if x := metricWithLabel(ptsTotal, cat, "drop"); x != nil { + drop = x.GetCounter().GetValue() + } + + total = mem + disk + drop + + // only show ok points and total points. + table.SetCell(row, 1, + tview.NewTableCell(fmt.Sprintf("%s/%s/%s/%s", + number(mem), number(disk), number(drop), number(total))). + SetMaxWidth(app.maxTableWidth).SetAlign(tview.AlignRight)) + + row++ + } +} diff --git a/internal/plugins/inputs/profile/input.go b/internal/plugins/inputs/profile/input.go index 83a08fef26..c7eed8a485 100644 --- a/internal/plugins/inputs/profile/input.go +++ b/internal/plugins/inputs/profile/input.go @@ -795,7 +795,7 @@ func (ipt *Input) InitDiskQueueIO() error { reqData = msg return nil }); err != nil { - if errors.Is(err, diskcache.ErrEOF) { + if errors.Is(err, diskcache.ErrNoData) { log.Debugf("disk queue is empty: %s", err) time.Sleep(time.Second * 3) } else { diff --git a/internal/plugins/inputs/profile/pyroscope_test.go b/internal/plugins/inputs/profile/pyroscope_test.go index fa16ce38b1..e8bf668386 100644 --- a/internal/plugins/inputs/profile/pyroscope_test.go +++ b/internal/plugins/inputs/profile/pyroscope_test.go @@ -27,28 +27,29 @@ func checkDevHost() bool { // go test -v -timeout 30s -run ^TestPyroscopeRun$ gitlab.jiagouyun.com/cloudcare-tools/datakit/internal/plugins/inputs/profile // func TestPyroscopeRun(t *testing.T) { -// if !checkDevHost() { -// return -// } +// if !checkDevHost() { +// return +// } // -// pyro := pyroscopeOpts{ -// URL: "0.0.0.0:4040", -// } +// pyro := pyroscopeOpts{ +// URL: "0.0.0.0:4040", +// } // -// config.Cfg.Dataway = &dataway.Dataway{URLs: []string{"http://?token="}} +// config.Cfg.Dataway = dataway.NewDefaultDataway() +// config.Cfg.Dataway.URLs = []string{"http://?token="} // -// err := config.Cfg.SetupDataway() -// if err != nil { -// panic(err) -// } -// ipt := &Input{ -// semStop: cliutils.NewSem(), -// } -// err = pyro.run(ipt) -// if err != nil { -// panic(err) -// } -//} +// err := config.Cfg.SetupDataway() +// if err != nil { +// panic(err) +// } +// ipt := &Input{ +// semStop: cliutils.NewSem(), +// } +// err = pyro.run(ipt) +// if err != nil { +// panic(err) +// } +// } // go test -v -timeout 30s -run ^Test_getReportCacheKeyName$ gitlab.jiagouyun.com/cloudcare-tools/datakit/plugins/inputs/profile func Test_getReportCacheKeyName(t *testing.T) { diff --git a/internal/plugins/inputs/prom/input.go b/internal/plugins/inputs/prom/input.go index 2ce7639752..aa501c5cb6 100644 --- a/internal/plugins/inputs/prom/input.go +++ b/internal/plugins/inputs/prom/input.go @@ -166,55 +166,6 @@ func (i *Input) tryInit() { i.l.Errorf("Init %s", err) return } - - // Callback func. 
- if i.StreamSize > 0 { - i.callbackFunc = func(pts []*point.Point) error { - // Append tags to points - for _, v := range i.urlTags[i.currentURL] { - for _, pt := range pts { - pt.AddTag(v.key, v.value) - } - } - - if len(pts) < 1 { - return nil - } - - if i.AsLogging != nil && i.AsLogging.Enable { - // Feed measurement as logging. - for _, pt := range pts { - // We need to feed each point separately because - // each point might have different measurement name. - if err := i.Feeder.FeedV2(point.Logging, []*point.Point{pt}, - dkio.WithCollectCost(time.Since(i.startTime)), - dkio.WithElection(i.Election), - dkio.WithInputName(pt.Name()), - ); err != nil { - i.Feeder.FeedLastError(err.Error(), - metrics.WithLastErrorInput(inputName), - metrics.WithLastErrorSource(inputName+"/"+i.Source), - ) - i.l.Errorf("feed logging: %s", err) - } - } - } else if err := i.Feeder.FeedV2(point.Metric, pts, - dkio.WithCollectCost(time.Since(i.startTime)), - dkio.WithElection(i.Election), - dkio.WithInputName(inputName+"/"+i.Source), - ); err != nil { - i.Feeder.FeedLastError(err.Error(), - metrics.WithLastErrorInput(inputName), - metrics.WithLastErrorSource(inputName+"/"+i.Source), - ) - i.l.Errorf("feed measurement: %s", err) - } - i.FeedUpMetric(i.currentURL) - return nil - } - } - - i.isInitialized = true } func (i *Input) collect() error { @@ -402,6 +353,54 @@ func (i *Input) Resume() error { } } +type promHandleCallback func([]*point.Point) error + +func (i *Input) defaultHandleCallback() promHandleCallback { + return func(pts []*point.Point) error { + // Append tags to points + for _, v := range i.urlTags[i.currentURL] { + for _, pt := range pts { + pt.AddTag(v.key, v.value) + } + } + + if len(pts) < 1 { + return nil + } + + if i.AsLogging != nil && i.AsLogging.Enable { + // Feed measurement as logging. + for _, pt := range pts { + // We need to feed each point separately because + // each point might have different measurement name. 
+ if err := i.Feeder.FeedV2(point.Logging, []*point.Point{pt}, + dkio.WithCollectCost(time.Since(i.startTime)), + dkio.WithElection(i.Election), + dkio.WithInputName(pt.Name()), + ); err != nil { + i.Feeder.FeedLastError(err.Error(), + metrics.WithLastErrorInput(inputName), + metrics.WithLastErrorSource(inputName+"/"+i.Source), + ) + i.l.Errorf("feed logging: %s", err) + } + } + } else if err := i.Feeder.FeedV2(point.Metric, pts, + dkio.WithCollectCost(time.Since(i.startTime)), + dkio.WithElection(i.Election), + dkio.WithInputName(inputName+"/"+i.Source), + ); err != nil { + i.Feeder.FeedLastError(err.Error(), + metrics.WithLastErrorInput(inputName), + metrics.WithLastErrorSource(inputName+"/"+i.Source), + ) + i.l.Errorf("feed measurement: %s", err) + } + i.FeedUpMetric(i.currentURL) + return nil + } +} + func (i *Input) Init() error { i.l = logger.SLogger(inputName + "/" + i.Source) @@ -443,6 +442,10 @@ func (i *Input) Init() error { i.urlTags[u] = tempTags } + if i.StreamSize > 0 && i.callbackFunc == nil { // set callback on streamming-mode + i.callbackFunc = i.defaultHandleCallback() + } + opts := []iprom.PromOption{ iprom.WithLogger(i.l), // WithLogger must in the first iprom.WithSource(i.Source), diff --git a/internal/plugins/inputs/prom/input_test.go b/internal/plugins/inputs/prom/input_test.go index b677ae3f88..2863ead11e 100644 --- a/internal/plugins/inputs/prom/input_test.go +++ b/internal/plugins/inputs/prom/input_test.go @@ -57,6 +57,7 @@ promhttp_metric_handler_errors_total{cause="encoding"} 0 inp := NewProm() inp.URLs = []string{srv.URL} + inp.StreamSize = 0 inp.Tagger = &taggerMock{ hostTags: map[string]string{ @@ -108,6 +109,7 @@ some_info{info1="data1"} 0 t.Logf("url: %s", srv.URL) inp := NewProm() + inp.StreamSize = 0 inp.URLs = []string{srv.URL} inp.tryInit() @@ -154,6 +156,7 @@ promhttp_metric_handler_errors_total{cause="encoding",ignore_me="some"} 0 t.Logf("url: %s", srv.URL) inp := NewProm() + inp.StreamSize = 0 inp.URLs = []string{srv.URL} inp.TagsIgnore = []string{"ignore_me"} @@ -188,6 +191,7 @@ promhttp_metric_handler_errors_total{cause="encoding"} 0 t.Logf("url: %s", srv.URL) inp := NewProm() + inp.StreamSize = 0 inp.URLs = []string{srv.URL} inp.TagsRename = &iprom.RenameTags{ Mapping: map[string]string{ @@ -227,6 +231,7 @@ promhttp_metric_handler_errors_total{cause="encoding"} 0 t.Logf("url: %s", srv.URL) inp := NewProm() + inp.StreamSize = 0 inp.URLs = []string{srv.URL} inp.AsLogging = &iprom.AsLogging{ Enable: true, @@ -266,6 +271,8 @@ promhttp_metric_handler_errors_total{cause="encoding"} 0 t.Logf("url: %s", srv.URL) inp := NewProm() + inp.StreamSize = 0 + inp.StreamSize = 0 inp.URLs = []string{srv.URL} inp.MeasurementName = "some" @@ -301,6 +308,7 @@ promhttp_metric_handler_errors_total{cause="encoding"} 0 t.Logf("url: %s", srv.URL) inp := NewProm() + inp.StreamSize = 0 inp.URLs = []string{srv.URL} inp.MeasurementPrefix = "some_" @@ -337,6 +345,7 @@ promtcp_metric_handler_errors_total{cause="encoding"} 0 t.Logf("url: %s", srv.URL) inp := NewProm() + inp.StreamSize = 0 inp.URLs = []string{srv.URL} inp.Measurements = []iprom.Rule{ { @@ -377,6 +386,7 @@ promhttp_metric_handler_errors_total{cause="encoding"} 0 t.Logf("url: %s", srv.URL) inp := NewProm() + inp.StreamSize = 0 inp.URLs = []string{srv.URL} inp.DisableInstanceTag = true @@ -413,6 +423,7 @@ promhttp_metric_handler_errors_total{cause="encoding-3",some="foo-3"} 0 t.Logf("url: %s", srv.URL) inp := NewProm() + inp.StreamSize = 0 inp.URLs = []string{srv.URL} inp.IgnoreTagKV = map[string][]string{ "cause": 
{"encoding-1", "encoding-2"}, // keep `encoding-3' @@ -479,7 +490,6 @@ func TestInputBatch(t *T.T) { }, } - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -538,7 +548,6 @@ promhttp_metric_handler_errors_total{cause="encoding"} 0 inp := NewProm() inp.URLs = []string{srv.URL} - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -596,7 +605,6 @@ some_info{info1="data1"} 0 // info type disabled inp.DisableInfoTag = true - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -650,7 +658,6 @@ some_info{info1="data1"} 0 inp.URLs = []string{srv.URL} inp.TagsIgnore = []string{"ignore_me"} - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -708,7 +715,6 @@ some_info{info1="data1"} 0 }, } - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -766,7 +772,6 @@ some_info{info1="data1"} 0 Service: "as-logging", } - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -822,7 +827,6 @@ some_info{info1="data1"} 0 inp.URLs = []string{srv.URL} inp.MeasurementName = "some" - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -877,7 +881,6 @@ some_info{info1="data1"} 0 inp.URLs = []string{srv.URL} inp.MeasurementPrefix = "some_" - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -938,7 +941,6 @@ some_info{info1="data1"} 0 }, } - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -993,7 +995,6 @@ some_info{info1="data1"} 0 inp.URLs = []string{srv.URL} inp.DisableInstanceTag = true - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -1052,7 +1053,6 @@ some_info{info1="data1"} 0 "some": {"foo-1", "foo-3"}, // keep `foo-2' } - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -1173,7 +1173,6 @@ func TestBatchParser(t *T.T) { inp := NewProm() inp.URLs = []string{srv.URL} - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -1286,7 +1285,6 @@ process_runtime_jvm_buffer_count{pool="mapped - 'non-volatile memory'"} 0.0 1680 inp := NewProm() inp.URLs = []string{srv.URL} - inp.StreamSize = 1 ptCh := make(chan []*point.Point, 1) points := []*point.Point{} wg := sync.WaitGroup{} @@ -1360,7 +1358,6 @@ func TestLargeBatch(t *T.T) { inp := NewProm() inp.Feeder = feeder inp.URLs = []string{srv.URL} - inp.StreamSize = 1 start := time.Now() stopCh := make(chan bool) @@ -1465,7 +1462,6 @@ func TestLargeFileBatch(t *T.T) { inp := NewProm() inp.Feeder = feeder inp.URLs = []string{"large-metrics.txt"} - inp.StreamSize = 1 start := time.Now() stopCh := make(chan bool) diff --git a/internal/plugins/inputs/rum/sessionreplay.go b/internal/plugins/inputs/rum/sessionreplay.go index 5641a06943..3d6325dde5 100644 --- a/internal/plugins/inputs/rum/sessionreplay.go +++ b/internal/plugins/inputs/rum/sessionreplay.go @@ -228,7 +228,7 @@ func (ipt *Input) initSessionReplayWorkers() error { msgData = msg return nil }); err != nil { - if errors.Is(err, diskcache.ErrEOF) { + if errors.Is(err, diskcache.ErrNoData) { log.Debugf("disk queue is empty: %s", err) 
time.Sleep(time.Millisecond * 1500) } else { diff --git a/internal/plugins/inputs/rum/sessionreplay_test.go b/internal/plugins/inputs/rum/sessionreplay_test.go index fcb9b1a884..b7608c1812 100644 --- a/internal/plugins/inputs/rum/sessionreplay_test.go +++ b/internal/plugins/inputs/rum/sessionreplay_test.go @@ -107,7 +107,7 @@ func TestSessionReplayHandler(t *testing.T) { return nil }) - assert.ErrorIs(t, err, diskcache.ErrEOF) + assert.ErrorIs(t, err, diskcache.ErrNoData) assert.NoError(t, ipt.replayDiskQueue.Close()) } @@ -141,7 +141,7 @@ func TestReplayDiskQueue(t *testing.T) { return nil }) - assert.ErrorIs(t, err, diskcache.ErrEOF) + assert.ErrorIs(t, err, diskcache.ErrNoData) assert.NoError(t, ipt.replayDiskQueue.Close()) } diff --git a/internal/plugins/inputs/rum/sourcemap_test.go b/internal/plugins/inputs/rum/sourcemap_test.go index aa45941b35..16e2d6316d 100644 --- a/internal/plugins/inputs/rum/sourcemap_test.go +++ b/internal/plugins/inputs/rum/sourcemap_test.go @@ -29,9 +29,10 @@ import ( func TestHandleSourcemapUpload(t *testing.T) { const Token = "xxxxxxxxxx" - dw := &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + Token}} - err := dw.Init() - assert.NoError(t, err) + dw := dataway.NewDefaultDataway() + dw.URLs = []string{"http://localhost:9529?token=" + Token} + assert.NoError(t, dw.Init()) + config.Cfg.Dataway = dw dir, err := ioutil.TempDir("./", "tmp") @@ -276,9 +277,10 @@ func TestHandleSourcemapDelete(t *testing.T) { platform = "web" ) - dw := &dataway.Dataway{URLs: []string{"http://localhost:9529?token=" + Token}} - err := dw.Init() - assert.NoError(t, err) + dw := dataway.NewDefaultDataway() + dw.URLs = []string{"http://localhost:9529?token=" + Token} + assert.NoError(t, dw.Init()) + config.Cfg.Dataway = dw ipt := defaultInput() diff --git a/internal/storage/storage.go b/internal/storage/storage.go index 5e6b09873b..93ad925101 100644 --- a/internal/storage/storage.go +++ b/internal/storage/storage.go @@ -90,7 +90,7 @@ func (s *Storage) RunConsumeWorker() error { key, buf, err := s.Get() if err != nil { - if errors.Is(err, dc.ErrEOF) { + if errors.Is(err, dc.ErrNoData) { s.log.Debug("local-cache empty") time.Sleep(time.Second) continue diff --git a/scripts/v1-data.go b/scripts/v1-data.go index 97eef7f990..3c02c36254 100644 --- a/scripts/v1-data.go +++ b/scripts/v1-data.go @@ -7,6 +7,7 @@ package main import ( "flag" + "fmt" "os" "github.com/GuanceCloud/cliutils" @@ -14,7 +15,8 @@ import ( var ( flagLen = flag.Int64("len", 32, "generated data length(kb)") - flagCount = flag.Int("count", 1, "generated data count") + flagCount = flag.Int("count", 3, "generated data count") + flagP8s = flag.Bool("p8s", false, "generate promethues metric text") flagFile = flag.String("output", "v1.data", "data output to file") ) @@ -50,8 +52,80 @@ func genLargeLog() { } } +func genLargeP8sMetric() { + countComment := `# HELP some counter +# TYPE my_count counter` + countMetricTemplate := `my_count{tag1="%05d",tag2="%05d"} 5.739356555004705 +` + + gaugeComment := `# HELP some gauge +# TYPE my_gauge gauge` + + gaugeMetricTemplate := `my_gauge{tag1="%05d",tag2="%05d"} 5.739356555004705 +` + + summaryComment := `# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. 
+# TYPE go_gc_duration_seconds summary` + + summaryMetricTemplate := `go_gc_duration_seconds{tag1="%05d",tag2="%05d",quantile="0"} 0.000159 +go_gc_duration_seconds{tag1="%05d",tag2="%05d",quantile="0.25"} 0.000346833 +go_gc_duration_seconds{tag1="%05d",tag2="%05d",quantile="0.5"} 0.000542084 +go_gc_duration_seconds{tag1="%05d",tag2="%05d",quantile="0.75"} 0.000859208 +go_gc_duration_seconds{tag1="%05d",tag2="%05d",quantile="1"} 0.010519458 +go_gc_duration_seconds_sum{tag1="%05d",tag2="%05d"} 0.76161121 +go_gc_duration_seconds_count{tag1="%05d",tag2="%05d"} 1234 +` + + histogramComment := `# HELP go_gc_heap_allocs_by_size_bytes Distribution of heap allocations by approximate size. Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects, only tiny blocks. +# TYPE go_gc_heap_allocs_by_size_bytes histogram` + + histogramMetricTemplate := `go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="8.999999999999998"} 1.9150407e+07 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="24.999999999999996"} 2.72283031e+08 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="64.99999999999999"} 3.66184003e+08 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="144.99999999999997"} 4.1888465e+08 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="320.99999999999994"} 4.30278691e+08 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="704.9999999999999"} 4.3276057e+08 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="1536.9999999999998"} 4.33416083e+08 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="3200.9999999999995"} 4.33837422e+08 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="6528.999999999999"} 4.34562466e+08 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="13568.999999999998"} 4.3479673e+08 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="27264.999999999996"} 4.34931507e+08 +go_gc_heap_allocs_by_size_bytes_bucket{tag1="%05d",tag2="%05d",le="+Inf"} 4.35160484e+08 +go_gc_heap_allocs_by_size_bytes_sum{tag1="%05d",tag2="%05d"} 2.325903862e+11 +go_gc_heap_allocs_by_size_bytes_count{tag1="%05d",tag2="%05d"} 4.35160484e+08 +` + + // generate n count point + fmt.Println(countComment) + for i := 0; i < *flagCount; i++ { + fmt.Printf(countMetricTemplate, i, i) + } + + // generate n gauge point + fmt.Println(gaugeComment) + for i := 0; i < *flagCount; i++ { + fmt.Printf(gaugeMetricTemplate, i, i) + } + + // generate n summary point + fmt.Println(summaryComment) + for i := 0; i < *flagCount; i++ { + fmt.Printf(summaryMetricTemplate, i, i, i, i, i, i, i, i, i, i, i, i, i, i) + } + + // generate n histogram point + fmt.Println(histogramComment) + for i := 0; i < *flagCount; i++ { + fmt.Printf(histogramMetricTemplate, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i) + } +} + // nolint: typecheck func main() { flag.Parse() + if *flagP8s { + genLargeP8sMetric() + return + } genLargeLog() } diff --git a/scripts/v1-write.go b/scripts/v1-write.go index 8beafc9927..c9c6a8b8d1 100644 --- a/scripts/v1-write.go +++ b/scripts/v1-write.go @@ -12,6 +12,7 @@ import ( "log" "net/http" "os" + "strings" "sync/atomic" "syscall" "time" @@ -28,6 +29,7 @@ var ( flagMaxBody = flag.Int("max-body", 0, "set max body size(kb)") flagDecode = flag.Bool("decode", false, "try decode request") flag5XXRatio = flag.Int("5xx-ratio", 0, "fail request ratio(minimal is 1/1000)") + flagLatency = 
flag.Duration("latency", time.Millisecond*10, "latency used on API cost") MPts, LPts, TPts, totalReq, req5xx atomic.Int64 ) @@ -55,6 +57,10 @@ func benchHTTPServer() { func(c *gin.Context) { log.Printf("************************************************") + if *flagLatency > 0 { + time.Sleep(*flagLatency) + } + totalReq.Add(1) if *flag5XXRatio > 0 { @@ -73,9 +79,10 @@ func benchHTTPServer() { } var ( - start = time.Now() - encoding point.Encoding - dec *point.Decoder + start = time.Now() + encoding point.Encoding + dec *point.Decoder + headerArr []string ) if body, err := io.ReadAll(c.Request.Body); err != nil { @@ -84,7 +91,6 @@ func benchHTTPServer() { } else { elapsed := time.Since(start) if len(body) > 0 { - log.Printf("************************************************") log.Printf("copy elapsed %s, bandwidth %fKB/S", elapsed, float64(len(body))/(float64(elapsed)/float64(time.Second))/1024.0) } @@ -93,19 +99,22 @@ func benchHTTPServer() { } for k, _ := range c.Request.Header { - log.Printf("%s: %s", k, c.Request.Header.Get(k)) + headerArr = append(headerArr, fmt.Sprintf("%s: %s", k, c.Request.Header.Get(k))) } + log.Printf("URL: %s", c.Request.URL) + log.Printf("headers:\n%s", strings.Join(headerArr, "\n")) + if c.Request.Header.Get("Content-Encoding") == "gzip" { unzipbody, err := uhttp.Unzip(body) if err != nil { //log.Printf("unzip: %s, body: %q", err, body) - log.Printf("unzip: %s", err) + log.Printf("[ERROR] unzip(header %q): %s", body[:10], err) c.Data(http.StatusBadRequest, "", []byte(err.Error())) return } - log.Printf("unzip body: %d => %d(%.4f)", len(body), len(unzipbody), float64(len(body))/float64(len(unzipbody))) + log.Printf("[INFO] unzip body: %d => %d(%.4f)", len(body), len(unzipbody), float64(len(body))/float64(len(unzipbody))) body = unzipbody } @@ -121,19 +130,21 @@ func benchHTTPServer() { defer point.PutDecoder(dec) default: // not implemented - log.Printf("unknown encoding %s", encoding) + log.Printf("[ERROR] unknown encoding %s", encoding) + return } if dec != nil { if pts, err := dec.Decode(body); err != nil { - log.Printf("decode on %s error: %s", encoding, err) + log.Printf("[ERROR] decode on %s error: %s", encoding, err) } else { nwarns := 0 for _, pt := range pts { if len(pt.Warns()) > 0 { - //fmt.Printf(pt.Pretty()) nwarns++ } + + log.Println(pt.LineProto()) } cat := point.CatURL(c.Request.URL.Path) @@ -147,7 +158,7 @@ func benchHTTPServer() { TPts.Add(int64(len(pts))) } - log.Printf("decode %d points, %d with warnnings", len(pts), nwarns) + log.Printf("[INFO] decode %d points, %d with warnnings", len(pts), nwarns) } showInfo() @@ -190,6 +201,7 @@ func showENVs() { // nolint: typecheck func main() { + log.SetFlags(log.LstdFlags | log.Lshortfile) showENVs() var rLimit syscall.Rlimit diff --git a/vendor/github.com/GuanceCloud/cliutils/Makefile b/vendor/github.com/GuanceCloud/cliutils/Makefile index 311e96ef05..c17d1d1a27 100644 --- a/vendor/github.com/GuanceCloud/cliutils/Makefile +++ b/vendor/github.com/GuanceCloud/cliutils/Makefile @@ -20,3 +20,6 @@ copyright_check_auto_fix: test: LOGGER_PATH=nul CGO_CFLAGS=-Wno-undef-prefix go test -test.v -timeout 99999m -cover ./... + +show_metrics: + @promlinter list . 
--add-help -o md --with-vendor diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/README.md b/vendor/github.com/GuanceCloud/cliutils/diskcache/README.md index 8f67a8b22b..9ada6d7797 100644 --- a/vendor/github.com/GuanceCloud/cliutils/diskcache/README.md +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/README.md @@ -78,51 +78,50 @@ log.Println(m.LineProto()) // get line-protocol format of metrics 支持通过如下环境变量来覆盖默认的缓存配置: -| 环境变量 | 描述 | -| --- | --- | -| ENV_DISKCACHE_BATCH_SIZE | 设置单个磁盘文件大小,单位字节,默认 64MB | -| ENV_DISKCACHE_MAX_DATA_SIZE | 限制单次写入的字节大小,避免意料之外的巨量数据写入,单位字节,默认不限制 | -| ENV_DISKCACHE_CAPACITY | 限制缓存能使用的磁盘上限,一旦用量超过该限制,老数据将被移除掉。默认不限制 | -| ENV_DISKCACHE_NO_SYNC | 禁用磁盘写入的 sync 同步,默认不开启。一旦开启,可能导致磁盘数据丢失问题 | -| ENV_DISKCACHE_NO_LOCK | 禁用文件目录夹锁。默认是加锁状态,一旦不加锁,在同一个目录多开(`Open`)可能导致文件混乱 | -| ENV_DISKCACHE_NO_POS | 禁用磁盘写入位置记录,默认带有位置记录。一旦不记录,程序重启会导致部分数据重复消费(`Get`) | -| ENV_DISKCACHE_NO_FALLBACK_ON_ERROR | 禁用错误回退机制 | +| 环境变量 | 单位 | 描述 | +| --- | --- | --- | +| ENV_DISKCACHE_BATCH_SIZE | byte | 设置单个磁盘文件大小,单位字节,默认 64MB | +| ENV_DISKCACHE_MAX_DATA_SIZE | byte | 限制单次写入的字节大小,避免意料之外的巨量数据写入,单位字节,默认不限制 | +| ENV_DISKCACHE_CAPACITY | byte | 限制缓存能使用的磁盘上限,一旦用量超过该限制,老数据将被移除掉。默认不限制 | +| ENV_DISKCACHE_NO_SYNC | N/A | 禁用磁盘写入的 sync 同步,默认不开启。一旦开启,可能导致磁盘数据丢失问题 | +| ENV_DISKCACHE_NO_LOCK | N/A | 禁用文件目录夹锁。默认是加锁状态,一旦不加锁,在同一个目录多开(`Open`)可能导致文件混乱 | +| ENV_DISKCACHE_NO_POS | N/A | 禁用磁盘写入位置记录,默认带有位置记录。一旦不记录,程序重启会导致部分数据重复消费(`Get`) | +| ENV_DISKCACHE_NO_FALLBACK_ON_ERROR | N/A | 禁用错误回退机制 | ## Prometheus 指标 所有指标可选的 label 列表如下: -| label | 取值 | 说明 | -| --- | --- | --- | -| no_fallback_on_error | true/false | 是否关闭错误回退(即禁止 Get() 回调失败时,再次读到老的数据) | -| no_lock | true/false | 是否关闭加锁功能(即允许一个 cache 目录同时被多次 `Open()`) | -| no_pos | true/false | 是否关闭 pos 功能 | -| no_sync | true/false | 是否关闭同步写入功能 | -| path | cache 所在磁盘目录 | cache 所在磁盘目录 | +| label | 取值 | 说明 | +| --- | --- | --- | +| `no_fallback_on_error` | true/false | 是否关闭错误回退(即禁止 Get() 回调失败时,再次读到老的数据) | +| `no_lock` | true/false | 是否关闭加锁功能(即允许一个 cache 目录同时被多次 `Open()`) | +| `no_pos` | true/false | 是否关闭 pos 功能 | +| `no_sync` | true/false | 是否关闭同步写入功能 | +| `path` | cache 所在磁盘目录 | cache 所在磁盘目录 | 指标列表如下: -| TYPE | NAME | LABELS | HELP | -| --- | --- | --- | --- | -| COUNTER | `diskcache_put_bytes_total` | `path` | Cache Put() bytes count | -| COUNTER | `diskcache_get_total` | `path` | Cache Get() count | -| COUNTER | `diskcache_wakeup_total` | `path` | Wakeup count on sleeping write file | -| COUNTER | `diskcache_get_bytes_total` | `path` | Cache Get() bytes count | -| GAUGE | `diskcache_capacity` | `path` | Current capacity(in bytes) | -| GAUGE | `diskcache_max_data` | `path` | Max data to Put(in bytes), default 0 | -| GAUGE | `diskcache_batch_size` | `path` | Data file size(in bytes) | -| GAUGE | `diskcache_size` | `path` | Current cache size(in bytes) | -| GAUGE | `diskcache_open_time` | `no_fallback_on_error,no_lock,no_pos,no_sync,path` | Current cache Open time in unix timestamp(second) | -| GAUGE | `diskcache_last_close_time` | `path` | Current cache last Close time in unix timestamp(second) | -| GAUGE | `diskcache_datafiles` | `path` | Current un-readed data files | -| SUMMARY | `diskcache_get_latency` | `path` | Get() time cost(micro-second) | -| SUMMARY | `diskcache_put_latency` | `path` | Put() time cost(micro-second) | -| COUNTER | `diskcache_dropped_bytes_total` | `path` | Dropped bytes during Put() when capacity reached. | -| COUNTER | `diskcache_dropped_total` | `path` | Dropped files during Put() when capacity reached. 
| -| COUNTER | `diskcache_rotate_total` | `path` | Cache rotate count, mean file rotate from data to data.0000xxx | -| COUNTER | `diskcache_remove_total` | `path` | Removed file count, if some file read EOF, remove it from un-readed list | -| COUNTER | `diskcache_put_total` | `path` | Cache Put() count | +|TYPE|NAME|LABELS|HELP| +|---|---|---|---| +|SUMMARY|`diskcache_dropped_data`|`path,reason`|Dropped data during Put() when capacity reached.| +|COUNTER|`diskcache_rotate_total`|`path`|Cache rotate count, mean file rotate from data to data.0000xxx| +|COUNTER|`diskcache_remove_total`|`path`|Removed file count, if some file read EOF, remove it from un-read list| +|COUNTER|`diskcache_wakeup_total`|`path`|Wakeup count on sleeping write file| +|COUNTER|`diskcache_seek_back_total`|`path`|Seek back when Get() got any error| +|GAUGE|`diskcache_capacity`|`path`|Current capacity(in bytes)| +|GAUGE|`diskcache_max_data`|`path`|Max data to Put(in bytes), default 0| +|GAUGE|`diskcache_batch_size`|`path`|Data file size(in bytes)| +|GAUGE|`diskcache_size`|`path`|Current cache size(in bytes)| +|GAUGE|`diskcache_open_time`|`no_fallback_on_error,no_lock,no_pos,no_sync,path`|Current cache Open time in unix timestamp(second)| +|GAUGE|`diskcache_last_close_time`|`path`|Current cache last Close time in unix timestamp(second)| +|GAUGE|`diskcache_datafiles`|`path`|Current un-read data files| +|SUMMARY|`diskcache_stream_put`|`path`|Stream put times| +|SUMMARY|`diskcache_get_latency`|`path`|Get() cost seconds| +|SUMMARY|`diskcache_put_latency`|`path`|Put() cost seconds| +|SUMMARY|`diskcache_put_bytes`|`path`|Cache Put() bytes| +|SUMMARY|`diskcache_get_bytes`|`path`|Cache Get() bytes| ## 性能估算 diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/diskcache.go b/vendor/github.com/GuanceCloud/cliutils/diskcache/diskcache.go index 2f4b59586f..b6c4b7b47c 100644 --- a/vendor/github.com/GuanceCloud/cliutils/diskcache/diskcache.go +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/diskcache.go @@ -20,7 +20,9 @@ import ( "errors" "fmt" "os" + "strings" "sync" + "sync/atomic" "time" ) @@ -36,11 +38,16 @@ var ( // Invalid read size. ErrUnexpectedReadSize = errors.New("unexpected read size") + ErrTooSmallReadBuf = errors.New("too small read buffer") + // Data send to Put() exceed the maxDataSize. ErrTooLargeData = errors.New("too large data") // Get on no data cache. - ErrEOF = errors.New("EOF") + ErrNoData = errors.New("no data") + + // Diskcache full, no data can be write now + ErrCacheFull = errors.New("cache full") // Invalid cache filename. ErrInvalidDataFileName = errors.New("invalid datafile name") @@ -72,20 +79,24 @@ type DiskCache struct { // how long to wakeup a sleeping write-file wakeup time.Duration - wlock, // used to exclude concurrent Put. - rlock *sync.Mutex // used to exclude concurrent Get. - rwlock *sync.Mutex // used to exclude switch/rotate/drop/Close + wlock, // write-lock: used to exclude concurrent Put to the header file. + rlock *sync.Mutex // read-lock: used to exclude concurrent Get on the tail file. + rwlock *sync.Mutex // used to exclude switch/rotate/drop/Close on current disk cache instance. 
flock *flock // disabled multi-Open on same path pos *pos // current read fd position info // specs of current diskcache - size, // current byte size + size atomic.Int64 // current byte size curBatchSize, // current writing file's size + curReadSize, // current reading file's size batchSize, // current batch size(static) capacity int64 // capacity of the diskcache maxDataSize int32 // max data size of single Put() + batchHeader []byte + streamBuf []byte + // File permission, default 0750/0640 dirPerms, filePerms os.FileMode @@ -94,6 +105,7 @@ type DiskCache struct { noSync, // NoSync if enabled, may cause data missing, default false noFallbackOnError, // ignore Fn() error noPos, // no position + filoDrop, // first-in-last-out drop, meas we chooes to drop the new-comming data first noLock bool // no file lock // labels used to export prometheus flags @@ -108,12 +120,40 @@ func (c *DiskCache) String() string { // if there too many files(>10), only print file count if n := len(c.dataFiles); n > 10 { return fmt.Sprintf("%s/[size: %d][fallback: %v][nosync: %v][nopos: %v][nolock: %v][files: %d][maxDataSize: %d][batchSize: %d][capacity: %d][dataFiles: %d]", - c.path, c.size, c.noFallbackOnError, c.noSync, c.noPos, c.noLock, len(c.dataFiles), c.maxDataSize, c.batchSize, c.capacity, n, + c.path, c.size.Load(), c.noFallbackOnError, c.noSync, c.noPos, c.noLock, len(c.dataFiles), c.maxDataSize, c.batchSize, c.capacity, n, ) } else { // nolint: lll return fmt.Sprintf("%s/[size: %d][fallback: %v][nosync: %v][nopos: %v][nolock: %v][files: %d][maxDataSize: %d][batchSize: %d][capacity: %d][dataFiles: %v]", - c.path, c.size, c.noFallbackOnError, c.noSync, c.noLock, c.noPos, len(c.dataFiles), c.maxDataSize, c.batchSize, c.capacity, c.dataFiles, + c.path, c.size.Load(), c.noFallbackOnError, c.noSync, c.noLock, c.noPos, len(c.dataFiles), c.maxDataSize, c.batchSize, c.capacity, c.dataFiles, ) } } + +func (c *DiskCache) Pretty() string { + c.rwlock.Lock() + defer c.rwlock.Unlock() + + arr := []string{} + + arr = append(arr, "path: "+c.path) + arr = append(arr, fmt.Sprintf("size: %d", c.size.Load())) + arr = append(arr, fmt.Sprintf("max-data-size: %d", c.maxDataSize)) + arr = append(arr, fmt.Sprintf("capacity: %d", c.capacity)) + arr = append(arr, fmt.Sprintf("data-files(%d):", len(c.dataFiles))) + + for i, df := range c.dataFiles { + arr = append(arr, "\t"+df) + if i > 10 { + arr = append(arr, fmt.Sprintf("omitted %d files...", len(c.dataFiles)-i)) + } + } + + if c.rfd != nil { + arr = append(arr, fmt.Sprintf("cur-read: %s", c.rfd.Name())) + } else { + arr = append(arr, "no-Get()") + } + + return strings.Join(arr, "\n") +} diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/drop.go b/vendor/github.com/GuanceCloud/cliutils/diskcache/drop.go index 678173e782..ac456192e6 100644 --- a/vendor/github.com/GuanceCloud/cliutils/diskcache/drop.go +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/drop.go @@ -21,6 +21,7 @@ func (c *DiskCache) dropBatch() error { return nil } + // FILO drop: accept new data, and drop old data. 
fname := c.dataFiles[0] if c.rfd != nil && c.curReadfile == fname { @@ -36,14 +37,13 @@ func (c *DiskCache) dropBatch() error { return err } - c.size -= fi.Size() + c.size.Add(-fi.Size()) c.dataFiles = c.dataFiles[1:] - droppedBatchVec.WithLabelValues(c.path, reasonExceedCapacity).Inc() - droppedBytesVec.WithLabelValues(c.path).Add(float64(fi.Size())) + droppedDataVec.WithLabelValues(c.path, reasonExceedCapacity).Observe(float64(fi.Size())) datafilesVec.WithLabelValues(c.path).Set(float64(len(c.dataFiles))) - sizeVec.WithLabelValues(c.path).Set(float64(c.size)) + sizeVec.WithLabelValues(c.path).Set(float64(c.size.Load())) } return nil diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/get.go b/vendor/github.com/GuanceCloud/cliutils/diskcache/get.go index 11b982d39d..5ec72dca23 100644 --- a/vendor/github.com/GuanceCloud/cliutils/diskcache/get.go +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/get.go @@ -35,19 +35,26 @@ func (c *DiskCache) switchNextFile() error { func (c *DiskCache) skipBadFile() error { defer func() { - droppedBatchVec.WithLabelValues(c.path, reasonBadDataFile).Inc() + droppedDataVec.WithLabelValues(c.path, reasonBadDataFile).Observe(float64(c.curReadSize)) }() return c.switchNextFile() } // Get fetch new data from disk cache, then passing to fn -// if any error occurred during call fn, the reading data is -// dropped, and will not read again. // // Get is safe to call concurrently with other operations and will // block until all other operations finish. func (c *DiskCache) Get(fn Fn) error { + return c.doGet(nil, fn) +} + +// BufGet fetch new data from disk cache, and read into buf +func (c *DiskCache) BufGet(buf []byte, fn Fn) error { + return c.doGet(buf, fn) +} + +func (c *DiskCache) doGet(buf []byte, fn Fn) error { var ( n, nbytes int err error @@ -60,11 +67,10 @@ func (c *DiskCache) Get(fn Fn) error { defer func() { if uint32(nbytes) != EOFHint { - getBytesVec.WithLabelValues(c.path).Add(float64(nbytes)) + getBytesVec.WithLabelValues(c.path).Observe(float64(nbytes)) // get on EOF not counted as a real Get - getVec.WithLabelValues(c.path).Inc() - getLatencyVec.WithLabelValues(c.path).Observe(float64(time.Since(start) / time.Microsecond)) + getLatencyVec.WithLabelValues(c.path).Observe(float64(time.Since(start)) / float64(time.Second)) } }() @@ -89,14 +95,11 @@ func (c *DiskCache) Get(fn Fn) error { retry: if c.rfd == nil { - return ErrEOF + return ErrNoData } - hdr := make([]byte, dataHeaderLen) - if n, err = c.rfd.Read(hdr); err != nil || n != dataHeaderLen { - // + if n, err = c.rfd.Read(c.batchHeader); err != nil || n != dataHeaderLen { // On bad datafile, just ignore and delete the file. - // if err = c.skipBadFile(); err != nil { return err } @@ -105,7 +108,7 @@ retry: } // how many bytes of current data? - nbytes = int(binary.LittleEndian.Uint32(hdr[0:])) + nbytes = int(binary.LittleEndian.Uint32(c.batchHeader)) if uint32(nbytes) == EOFHint { // EOF if err := c.switchNextFile(); err != nil { @@ -115,9 +118,21 @@ retry: goto retry // read next new file to save another Get() calling. 
} - databuf := make([]byte, nbytes) + if buf == nil { + buf = make([]byte, nbytes) + } + + if len(buf) < nbytes { + // seek to next read position + if _, err := c.rfd.Seek(int64(nbytes), io.SeekCurrent); err != nil { + return err + } + + droppedDataVec.WithLabelValues(c.path, reasonTooSmallReadBuffer).Observe(float64(nbytes)) + return ErrTooSmallReadBuf + } - if n, err = c.rfd.Read(databuf); err != nil { + if n, err := c.rfd.Read(buf[:nbytes]); err != nil { return err } else if n != nbytes { return ErrUnexpectedReadSize @@ -127,7 +142,7 @@ retry: goto __updatePos } - if err = fn(databuf); err != nil { + if err = fn(buf[:nbytes]); err != nil { // seek back if !c.noFallbackOnError { if _, serr := c.rfd.Seek(-int64(dataHeaderLen+nbytes), io.SeekCurrent); serr != nil { diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/meta.go b/vendor/github.com/GuanceCloud/cliutils/diskcache/meta.go new file mode 100644 index 0000000000..ad9f44c2c1 --- /dev/null +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/meta.go @@ -0,0 +1,47 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the MIT License. +// This product includes software developed at Guance Cloud (https://www.guance.com/). +// Copyright 2021-present Guance, Inc. + +package diskcache + +// Size return current size of the cache. +func (c *DiskCache) Size() int64 { + c.rwlock.Lock() + defer c.rwlock.Unlock() + + if len(c.dataFiles) > 0 { // there are files waiting to be read + return c.size.Load() + } else { + return 0 + } +} + +// RawSize return current size plus current writing file(`data') of the cache. +func (c *DiskCache) RawSize() int64 { + return c.size.Load() +} + +// Capacity return max capacity of the cache. +func (c *DiskCache) Capacity() int64 { + return c.capacity +} + +// MaxDataSize return max single data piece size of the cache. +func (c *DiskCache) MaxDataSize() int32 { + return c.maxDataSize +} + +// MaxBatchSize return max single data file size of the cache. +// +// With proper data file size(default is 20MB), we can make the switch/rotate +// and garbage collection more quickly when all piece of data wthin the data +// file has been Get() out of the file. +func (c *DiskCache) MaxBatchSize() int64 { + return c.batchSize +} + +// Path return dir of current diskcache. 
+func (c *DiskCache) Path() string { + return c.path +} diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/metric.go b/vendor/github.com/GuanceCloud/cliutils/diskcache/metric.go index 86fdf5430a..3a10b5517d 100644 --- a/vendor/github.com/GuanceCloud/cliutils/diskcache/metric.go +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/metric.go @@ -11,16 +11,10 @@ import ( ) var ( - droppedBatchVec, - droppedBytesVec, rotateVec, removeVec, - putVec, - getVec, - putBytesVec, wakeupVec, - seekBackVec, - getBytesVec *prometheus.CounterVec + seekBackVec *prometheus.CounterVec sizeVec, openTimeVec, @@ -30,6 +24,10 @@ var ( batchSizeVec, datafilesVec *prometheus.GaugeVec + droppedDataVec, + putBytesVec, + getBytesVec, + streamPutVec, getLatencyVec, putLatencyVec *prometheus.SummaryVec @@ -37,11 +35,30 @@ var ( ) func setupMetrics() { + streamPutVec = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: ns, + Name: "stream_put", + Help: "Stream put times", + Objectives: map[float64]float64{ + 0.5: 0.05, + 0.9: 0.01, + 0.99: 0.001, + }, + }, + []string{"path"}, + ) + getLatencyVec = prometheus.NewSummaryVec( prometheus.SummaryOpts{ Namespace: ns, Name: "get_latency", - Help: "Get() time cost(micro-second)", + Help: "Get() cost seconds", + Objectives: map[float64]float64{ + 0.5: 0.05, + 0.9: 0.01, + 0.99: 0.001, + }, }, []string{"path"}, ) @@ -50,25 +67,54 @@ func setupMetrics() { prometheus.SummaryOpts{ Namespace: ns, Name: "put_latency", - Help: "Put() time cost(micro-second)", + Help: "Put() cost seconds", + Objectives: map[float64]float64{ + 0.5: 0.05, + 0.9: 0.01, + 0.99: 0.001, + }, }, []string{"path"}, ) - droppedBytesVec = prometheus.NewCounterVec( - prometheus.CounterOpts{ + putBytesVec = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ Namespace: ns, - Name: "dropped_bytes_total", - Help: "Dropped bytes during Put() when capacity reached.", + Name: "put_bytes", + Help: "Cache Put() bytes", + Objectives: map[float64]float64{ + 0.5: 0.05, + 0.9: 0.01, + 0.99: 0.001, + }, }, []string{"path"}, ) - droppedBatchVec = prometheus.NewCounterVec( - prometheus.CounterOpts{ + getBytesVec = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: ns, + Name: "get_bytes", + Help: "Cache Get() bytes", + Objectives: map[float64]float64{ + 0.5: 0.05, + 0.9: 0.01, + 0.99: 0.001, + }, + }, + []string{"path"}, + ) + + droppedDataVec = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ Namespace: ns, - Name: "dropped_total", - Help: "Dropped files during Put() when capacity reached.", + Name: "dropped_data", + Help: "Dropped data during Put() when capacity reached.", + Objectives: map[float64]float64{ + 0.5: 0.05, + 0.9: 0.01, + 0.99: 0.001, + }, }, []string{"path", "reason"}, ) @@ -91,33 +137,6 @@ func setupMetrics() { []string{"path"}, ) - putVec = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: ns, - Name: "put_total", - Help: "Cache Put() count", - }, - []string{"path"}, - ) - - putBytesVec = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: ns, - Name: "put_bytes_total", - Help: "Cache Put() bytes count", - }, - []string{"path"}, - ) - - getVec = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: ns, - Name: "get_total", - Help: "Cache Get() count", - }, - []string{"path"}, - ) - wakeupVec = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: ns, @@ -136,15 +155,6 @@ func setupMetrics() { []string{"path"}, ) - getBytesVec = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: ns, - Name: "get_bytes_total", 
- Help: "Cache Get() bytes count", - }, - []string{"path"}, - ) - capVec = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: ns, @@ -215,63 +225,16 @@ func setupMetrics() { []string{"path"}, ) - metrics.MustRegister( - droppedBatchVec, - droppedBytesVec, - rotateVec, - putVec, - getVec, - putBytesVec, - wakeupVec, - seekBackVec, - getBytesVec, - - openTimeVec, - lastCloseTimeVec, - capVec, - batchSizeVec, - maxDataVec, - sizeVec, - datafilesVec, - - getLatencyVec, - putLatencyVec) -} - -// register to specified registry for testing. -func register(reg *prometheus.Registry) { - reg.MustRegister( - droppedBatchVec, - droppedBytesVec, - rotateVec, - putVec, - getVec, - putBytesVec, - wakeupVec, - seekBackVec, - getBytesVec, - - capVec, - batchSizeVec, - maxDataVec, - sizeVec, - datafilesVec, - - getLatencyVec, - putLatencyVec) + metrics.MustRegister(Metrics()...) } // ResetMetrics used to cleanup exist metrics of diskcache. func ResetMetrics() { - droppedBatchVec.Reset() - droppedBytesVec.Reset() + streamPutVec.Reset() + droppedDataVec.Reset() rotateVec.Reset() - putVec.Reset() - getVec.Reset() - putBytesVec.Reset() wakeupVec.Reset() seekBackVec.Reset() - getBytesVec.Reset() capVec.Reset() batchSizeVec.Reset() maxDataVec.Reset() @@ -279,25 +242,17 @@ func ResetMetrics() { datafilesVec.Reset() getLatencyVec.Reset() putLatencyVec.Reset() + putBytesVec.Reset() + getBytesVec.Reset() } -// Labels export cache's labels used to query prometheus metrics. -// func (c *DiskCache) Labels() []string { -// return c.labels -//} - func Metrics() []prometheus.Collector { return []prometheus.Collector{ - droppedBatchVec, - droppedBytesVec, + droppedDataVec, rotateVec, removeVec, - putVec, - getVec, - putBytesVec, wakeupVec, seekBackVec, - getBytesVec, sizeVec, openTimeVec, @@ -309,6 +264,8 @@ func Metrics() []prometheus.Collector { getLatencyVec, putLatencyVec, + getBytesVec, + putBytesVec, } } diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/open.go b/vendor/github.com/GuanceCloud/cliutils/diskcache/open.go index c9f346e631..db794ce850 100644 --- a/vendor/github.com/GuanceCloud/cliutils/diskcache/open.go +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/open.go @@ -48,6 +48,9 @@ func defaultInstance() *DiskCache { return &DiskCache{ noSync: false, + streamBuf: make([]byte, 4*1024), + batchHeader: make([]byte, dataHeaderLen), + batchSize: 20 * 1024 * 1024, maxDataSize: 0, // not set @@ -129,9 +132,9 @@ func (c *DiskCache) doOpen() error { switch filepath.Base(path) { case ".lock", ".pos": // ignore them case "data": // count on size - c.size += fi.Size() + c.size.Add(fi.Size()) default: - c.size += fi.Size() + c.size.Add(fi.Size()) c.dataFiles = append(c.dataFiles, path) } diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/options.go b/vendor/github.com/GuanceCloud/cliutils/diskcache/options.go index ecd5182cbf..ccc829fa5d 100644 --- a/vendor/github.com/GuanceCloud/cliutils/diskcache/options.go +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/options.go @@ -26,6 +26,18 @@ func WithNoFallbackOnError(on bool) CacheOption { } } +// WithFILODrop set drop policy during Put() when cache's size +// almost reached it's capacity(). When set FILO drop, the Put() +// will fail immediately with a error. +// +// Default drop policy is FIFO, means all Put() will OK and the +// cache drop old data automatically. +func WithFILODrop(on bool) CacheOption { + return func(c *DiskCache) { + c.filoDrop = on + } +} + // WithNoLock set .lock on or off. 
// // File '.lock' used to exclude Open() on same path. @@ -128,3 +140,11 @@ func WithPath(x string) CacheOption { c.path = filepath.Clean(x) } } + +func WithStreamSize(x int32) CacheOption { + return func(c *DiskCache) { + if x > 0 { + c.streamBuf = make([]byte, x) + } + } +} diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/put.go b/vendor/github.com/GuanceCloud/cliutils/diskcache/put.go index 5249d29c1c..31b4e4ffc6 100644 --- a/vendor/github.com/GuanceCloud/cliutils/diskcache/put.go +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/put.go @@ -7,6 +7,10 @@ package diskcache import ( "encoding/binary" + "errors" + "fmt" + "io" + "os" "time" ) @@ -20,13 +24,17 @@ func (c *DiskCache) Put(data []byte) error { defer c.wlock.Unlock() defer func() { - putVec.WithLabelValues(c.path).Inc() - putBytesVec.WithLabelValues(c.path).Add(float64(len(data))) - putLatencyVec.WithLabelValues(c.path).Observe(float64(time.Since(start) / time.Microsecond)) - sizeVec.WithLabelValues(c.path).Set(float64(c.size)) + putBytesVec.WithLabelValues(c.path).Observe(float64(len(data))) + putLatencyVec.WithLabelValues(c.path).Observe(float64(time.Since(start)) / float64(time.Second)) + sizeVec.WithLabelValues(c.path).Set(float64(c.size.Load())) }() - if c.capacity > 0 && c.size+int64(len(data)) > c.capacity { + if c.capacity > 0 && c.size.Load()+int64(len(data)) > c.capacity { + if c.filoDrop { // do not accept new data + droppedDataVec.WithLabelValues(c.path, reasonExceedCapacity).Observe(float64(len(data))) + return ErrCacheFull + } + if err := c.dropBatch(); err != nil { return err } @@ -54,7 +62,7 @@ func (c *DiskCache) Put(data []byte) error { } c.curBatchSize += int64(len(data) + dataHeaderLen) - c.size += int64(len(data) + dataHeaderLen) + c.size.Add(int64(len(data) + dataHeaderLen)) c.wfdLastWrite = time.Now() // rotate new file @@ -66,3 +74,83 @@ func (c *DiskCache) Put(data []byte) error { return nil } + +func (c *DiskCache) putPart(part []byte) error { + if _, err := c.wfd.Write(part); err != nil { + return err + } + + if !c.noSync { + if err := c.wfd.Sync(); err != nil { + return err + } + } + return nil +} + +// StreamPut reads bytes from r and writes them to storage. +// +// When the data comes from a network stream (such as an HTTP response body), +// StreamPut avoids an intermediate buffer for the possibly huge body.
+func (c *DiskCache) StreamPut(r io.Reader, size int) error { + var ( + n = 0 + total = 0 + err error + startOffset int64 + start = time.Now() + round = 0 + ) + + c.wlock.Lock() + defer c.wlock.Unlock() + + if c.capacity > 0 && c.size.Load()+int64(size) > c.capacity { + return ErrCacheFull + } + + if startOffset, err = c.wfd.Seek(0, os.SEEK_CUR); err != nil { + return fmt.Errorf("Seek(0, SEEK_CUR): %w", err) + } + + defer func() { + if total > 0 && err != nil && !errors.Is(err, io.EOF) { // on failure, fall back to the original position + if _, serr := c.wfd.Seek(startOffset, os.SEEK_SET); serr != nil { + } + } + + putBytesVec.WithLabelValues(c.path).Observe(float64(size)) + putLatencyVec.WithLabelValues(c.path).Observe(float64(time.Since(start)) / float64(time.Second)) + sizeVec.WithLabelValues(c.path).Set(float64(c.size.Load())) + streamPutVec.WithLabelValues(c.path).Observe(float64(round)) + }() + + binary.LittleEndian.PutUint32(c.batchHeader, uint32(size)) + if _, err := c.wfd.Write(c.batchHeader); err != nil { + return err + } + + for { + n, err = r.Read(c.streamBuf) + if err != nil { + if errors.Is(err, io.EOF) { + break + } else { + return err + } + } + + if n == 0 { + break + } + + if err = c.putPart(c.streamBuf[:n]); err != nil { + return err + } else { + total += n + round++ + } + } + + return nil +} diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/rotate.go b/vendor/github.com/GuanceCloud/cliutils/diskcache/rotate.go index cdea726f4b..acf77ac877 100644 --- a/vendor/github.com/GuanceCloud/cliutils/diskcache/rotate.go +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/rotate.go @@ -31,7 +31,7 @@ func (c *DiskCache) rotate() error { defer func() { rotateVec.WithLabelValues(c.path).Inc() - sizeVec.WithLabelValues(c.path).Set(float64(c.size)) + sizeVec.WithLabelValues(c.path).Set(float64(c.size.Load())) datafilesVec.WithLabelValues(c.path).Set(float64(len(c.dataFiles))) }() @@ -91,7 +91,7 @@ func (c *DiskCache) removeCurrentReadingFile() error { defer c.rwlock.Unlock() defer func() { - sizeVec.WithLabelValues(c.path).Set(float64(c.size)) + sizeVec.WithLabelValues(c.path).Set(float64(c.size.Load())) removeVec.WithLabelValues(c.path).Inc() datafilesVec.WithLabelValues(c.path).Set(float64(len(c.dataFiles))) }() @@ -105,7 +105,7 @@ func (c *DiskCache) removeCurrentReadingFile() error { if fi, err := os.Stat(c.curReadfile); err == nil { // file exist if fi.Size() > dataHeaderLen { - c.size -= (fi.Size() - dataHeaderLen) // EOF bytes do not counted in size + c.size.Add(-(fi.Size() - dataHeaderLen)) // EOF bytes do not counted in size } if err := os.Remove(c.curReadfile); err != nil { diff --git a/vendor/github.com/GuanceCloud/cliutils/diskcache/switch.go b/vendor/github.com/GuanceCloud/cliutils/diskcache/switch.go index 24e01fbda3..9c1754782b 100644 --- a/vendor/github.com/GuanceCloud/cliutils/diskcache/switch.go +++ b/vendor/github.com/GuanceCloud/cliutils/diskcache/switch.go @@ -76,6 +76,12 @@ func (c *DiskCache) doSwitchNextFile() error { c.rfd = fd + if fi, err := c.rfd.Stat(); err != nil { + return fmt.Errorf("on rfd.Stat(): %w", err) + } else { + c.curReadSize = fi.Size() + } + if !c.noPos { c.pos.Name = []byte(c.curReadfile) c.pos.Seek = 0 diff --git a/vendor/modules.txt b/vendor/modules.txt index f438405624..fec937802a 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -37,7 +37,7 @@ github.com/DataDog/sketches-go/ddsketch/mapping github.com/DataDog/sketches-go/ddsketch/pb/sketchpb github.com/DataDog/sketches-go/ddsketch/stat github.com/DataDog/sketches-go/ddsketch/store -# github.com/GuanceCloud/cliutils
v1.1.22-0.20240930074036-255c78c086fd +# github.com/GuanceCloud/cliutils v1.1.22-0.20241018104846-17e816f0e123 ## explicit; go 1.19 github.com/GuanceCloud/cliutils github.com/GuanceCloud/cliutils/dialtesting
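The options.go and put.go hunks above introduce a new write path: WithFILODrop() switches the cache from the default FIFO eviction (old data is dropped so Put() keeps succeeding) to failing fast with ErrCacheFull, and StreamPut() copies an io.Reader straight into the cache through the fixed buffer configured by WithStreamSize(). A minimal usage sketch follows; diskcache.Open(), Close() and WithCapacity() are assumed from the package's existing API (they are not touched by this diff), and the path/URL values are placeholders.

package main

import (
	"errors"
	"log"
	"net/http"

	"github.com/GuanceCloud/cliutils/diskcache"
)

func main() {
	// Open a cache that rejects new data when full (FILO drop) and streams
	// large payloads through a 64KB buffer. WithCapacity is assumed to exist
	// in the package; the cache path is a placeholder.
	c, err := diskcache.Open(
		diskcache.WithPath("/tmp/dk-cache"),
		diskcache.WithCapacity(1<<30), // assumed option: cap the cache at 1GB
		diskcache.WithFILODrop(true),  // Put()/StreamPut() return ErrCacheFull once the cap is hit
		diskcache.WithStreamSize(64*1024),
	)
	if err != nil {
		log.Fatal(err)
	}
	defer c.Close() // Close() assumed from the existing API

	resp, err := http.Get("http://localhost:9529/metrics") // placeholder endpoint
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// StreamPut needs the payload size up front (it writes a length header
	// before the body), so a known Content-Length is assumed here.
	if err := c.StreamPut(resp.Body, int(resp.ContentLength)); err != nil {
		if errors.Is(err, diskcache.ErrCacheFull) {
			log.Println("cache full, payload rejected (FILO drop)")
		} else {
			log.Printf("StreamPut: %s", err)
		}
	}
}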
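The metric.go hunk replaces the long per-collector registration lists (and the test-only register() helper) with a single metrics.MustRegister(Metrics()...), and re-exports put/get byte counts and drops as summaries. Below is a sketch of registering the exported collectors on an isolated registry, e.g. in a test; it assumes the package has already initialized the collectors (setupMetrics) before Metrics() is called.

package diskcache_test

import (
	"testing"

	"github.com/GuanceCloud/cliutils/diskcache"
	"github.com/prometheus/client_golang/prometheus"
)

func TestRegisterDiskcacheMetrics(t *testing.T) {
	// Register the exported collectors on a private registry instead of the
	// removed package-private register() helper.
	reg := prometheus.NewRegistry()
	reg.MustRegister(diskcache.Metrics()...)

	mfs, err := reg.Gather()
	if err != nil {
		t.Fatal(err)
	}
	// With no cache activity yet, the labeled vectors expose no series.
	t.Logf("gathered %d metric families", len(mfs))
}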