From 247728d35108a84132a107b6dc024684b6cdaf12 Mon Sep 17 00:00:00 2001
From: Matt Durham
Date: Thu, 28 Dec 2023 11:51:57 -0500
Subject: [PATCH] initial benchmark draft

---
 .gitignore                 |   6 +-
 cmd/benchmark/README.md    |  47 +++++++++++++
 cmd/benchmark/benchmark.sh |  16 +++++
 cmd/benchmark/main.go      | 133 +++++++++++++++++++++++++++++++++++++
 cmd/benchmark/normal.river |  78 ++++++++++++++++++++++
 cmd/benchmark/test.river   |  33 +++++++++
 go.mod                     |   2 +-
 7 files changed, 312 insertions(+), 3 deletions(-)
 create mode 100644 cmd/benchmark/README.md
 create mode 100755 cmd/benchmark/benchmark.sh
 create mode 100644 cmd/benchmark/main.go
 create mode 100644 cmd/benchmark/normal.river
 create mode 100644 cmd/benchmark/test.river

diff --git a/.gitignore b/.gitignore
index 4f5cb31e3c33..54a98fed0465 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,7 +7,9 @@
 /.eventcache
 vendor
 data-agent
-
+/cmd/benchmark/data/
+/cmd/benchmark/main
+/cmd/benchmark/grafana-agent-flow
 /cmd/agent/agent
 /cmd/agentctl/agentctl
 /cmd/agent-operator/agent-operator
@@ -24,4 +26,4 @@
 cover*.out
 .uptodate
 node_modules
-/docs/variables.mk.local
\ No newline at end of file
+/docs/variables.mk.local
diff --git a/cmd/benchmark/README.md b/cmd/benchmark/README.md
new file mode 100644
index 000000000000..564dce3c2194
--- /dev/null
+++ b/cmd/benchmark/README.md
@@ -0,0 +1,47 @@
+# Benchmark notes
+
+These are synthetic benchmarks meant to represent common workloads. They are not meant to be exhaustive or fine-grained;
+they give a coarse idea of how the agent behaves in common situations.
+
+## Running the benchmarks
+
+Running `PROM_USERNAME="" PROM_PASSWORD="" ./benchmark.sh` will start the benchmark and run for 8 hours. The duration and type of tests
+can be adjusted by editing the `benchmark.sh` file. This will start two agents and the benchmark runner. Relevant CPU and memory metrics
+will be sent to the endpoint described in `normal.river`.
+
+TODO: Add mixin for graph I am using
+
+## Adjusting the benchmark
+
+Each benchmark can be adjusted within `test.river`. These settings allow fine-tuning for a specific scenario. Each `prometheus.test.metrics` component
+exposes a service discovery URL that is used to collect the targets.
+
+## Benchmark categories
+
+### prometheus.test.metrics "single"
+
+This roughly represents a single node_exporter and is the simplest use case. Every `10m`, 5% of the metrics are replaced, driven by `churn_percent`.
+
+### prometheus.test.metrics "many"
+
+This roughly represents scraping many node_exporter instances, for example in a Kubernetes environment.
+
+### prometheus.test.metrics "large"
+
+This represents scraping 2 very large instances with 1,000,000 series.
+
+### prometheus.test.metrics "churn"
+
+This represents a worst-case scenario: 2 large instances with an extremely high churn rate.
+
+## Adjusting the tests
+
+`prometheus.relabel` is often a CPU bottleneck, so adding additional rules allows you to test the impact of that.
+
+## Rules
+
+Existing relabel rules ensure that only the specific metrics that matter are sent to the Prometheus remote write. These series are tagged with the `runtype` and the benchmark name, for instance `normal-large`.
+
+The benchmark starts an endpoint to consume the metrics from `prometheus.test.metrics`; in half of the tests it returns HTTP status 200, and in the other half it returns 500.
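+
+For reference, here is a trimmed excerpt of the relabel rules in `normal.river` that tag each self-monitoring series with the run type and keep only the CPU and memory metrics that matter:
+
+```river
+prometheus.relabel "mutator" {
+  rule {
+    source_labels = ["__name__"]
+    regex         = "(.+)"
+    replacement   = "normal"
+    target_label  = "runtype"
+  }
+
+  // Only the CPU and memory metrics used for the benchmark comparison are kept.
+  rule {
+    source_labels = ["__name__"]
+    action        = "keep"
+    regex         = "(agent_wal_storage_active_series|agent_resources_process_cpu_seconds_total|go_memstats_alloc_bytes|go_gc_duration_seconds_sum|go_gc_duration_seconds_count)"
+  }
+
+  forward_to = [prometheus.remote_write.agent_stats.receiver]
+}
+```
+
+The generated test series themselves are only written to the local sink endpoint, never to the remote Prometheus.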
+
+TODO add optional pyroscope profiles
diff --git a/cmd/benchmark/benchmark.sh b/cmd/benchmark/benchmark.sh
new file mode 100755
index 000000000000..5459b6287b20
--- /dev/null
+++ b/cmd/benchmark/benchmark.sh
@@ -0,0 +1,16 @@
+go build ./main.go
+
+# Each test is run with the first argument being the name, the second whether the endpoint accepts metrics,
+# the third the duration, and the last the discovery endpoint. See test.river for details on each endpoint.
+./main churn true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.churn/discovery"
+./main churn false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.churn/discovery"
+
+./main single true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.single/discovery"
+./main single false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.single/discovery"
+
+./main many true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.many/discovery"
+./main many false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.many/discovery"
+
+./main large true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.large/discovery"
+./main large false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.large/discovery"
+
diff --git a/cmd/benchmark/main.go b/cmd/benchmark/main.go
new file mode 100644
index 000000000000..c7ffb055caf9
--- /dev/null
+++ b/cmd/benchmark/main.go
@@ -0,0 +1,133 @@
+package main
+
+import (
+    "fmt"
+    "net/http"
+    "os"
+    "os/exec"
+    "strconv"
+    "syscall"
+    "time"
+
+    "github.com/gorilla/mux"
+)
+
+// main handles creating the benchmark.
+func main() {
+    username := os.Getenv("PROM_USERNAME")
+    if username == "" {
+        panic("PROM_USERNAME env must be set")
+    }
+    password := os.Getenv("PROM_PASSWORD")
+    if password == "" {
+        panic("PROM_PASSWORD env must be set")
+    }
+
+    // Start the HTTP server that can swallow or reject incoming metric writes.
+    go httpServer()
+    // Build the agent under test.
+    buildAgent()
+
+    if len(os.Args) != 5 {
+        panic("usage: main <name> <allow_wal> <duration> <discovery_url>")
+    }
+    name := os.Args[1]
+    allowWal := os.Args[2]
+    duration := os.Args[3]
+    discovery := os.Args[4]
+    allowWalBool, err := strconv.ParseBool(allowWal)
+    if err != nil {
+        panic(err.Error())
+    }
+    parsedDuration, err := time.ParseDuration(duration)
+    if err != nil {
+        panic(err.Error())
+    }
+    fmt.Println(name, allowWalBool, parsedDuration, discovery)
+    startRun(name, allowWalBool, parsedDuration, discovery)
+}
+
+func startRun(name string, allowWAL bool, run time.Duration, discovery string) {
+    // Remove any data left over from a previous run.
+    os.RemoveAll("./data/normal-data")
+    os.RemoveAll("./data/test-data")
+
+    allow = allowWAL
+    _ = os.Setenv("NAME", name)
+    _ = os.Setenv("ALLOW_WAL", strconv.FormatBool(allowWAL))
+    _ = os.Setenv("DISCOVERY", discovery)
+
+    // The deferred calls tear each agent down and remove its data directory when the run ends.
+    metric := startMetricsAgent()
+    fmt.Println("starting metric agent")
+    defer metric.Process.Kill()
+    defer metric.Process.Release()
+    defer metric.Wait()
+    defer syscall.Kill(-metric.Process.Pid, syscall.SIGKILL)
+    defer os.RemoveAll("./data/test-data")
+
+    old := startNormalAgent()
+    fmt.Println("starting normal agent")
+    defer old.Process.Kill()
+    defer old.Process.Release()
+    defer old.Wait()
+    defer syscall.Kill(-old.Process.Pid, syscall.SIGKILL)
+    defer os.RemoveAll("./data/normal-data")
+
+    time.Sleep(run)
+}
+
+// buildAgent builds the grafana-agent-flow binary that both runs use.
+func buildAgent() {
+    cmd := exec.Command("go", "build", "../grafana-agent-flow")
+    cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+    cmd.Stdout = os.Stdout
+    cmd.Stderr = os.Stderr
+    err := cmd.Run()
+    if err != nil {
+        panic(err.Error())
+    }
+}
+
+// startNormalAgent starts the agent under test with normal.river; it listens on 127.0.0.1:12346.
+func startNormalAgent() *exec.Cmd {
+    cmd := exec.Command("./grafana-agent-flow", "run", "./normal.river", "--storage.path=./data/normal-data", "--server.http.listen-addr=127.0.0.1:12346")
+    cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+    //cmd.Stdout = os.Stdout
+    //cmd.Stderr = os.Stderr
+    err := cmd.Start()
+    if err != nil {
+        panic(err.Error())
+    }
+    return cmd
+}
+
+// startMetricsAgent starts the agent that generates the test metrics from test.river; it listens on 127.0.0.1:9001.
+func startMetricsAgent() *exec.Cmd {
+    cmd := exec.Command("./grafana-agent-flow", "run", "./test.river", "--storage.path=./data/test-data", "--server.http.listen-addr=127.0.0.1:9001")
+    cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+    err := cmd.Start()
+    if err != nil {
+        panic(err.Error())
+    }
+    return cmd
+}
+
+// allow controls whether the /post endpoint accepts (200) or rejects (500) incoming writes.
+var allow = false
+
+// httpServer exposes the metric sink on :8888 along with /allow and /block toggles.
+func httpServer() {
+    r := mux.NewRouter()
+    r.HandleFunc("/post", handlePost)
+    r.HandleFunc("/allow", func(w http.ResponseWriter, r *http.Request) {
+        println("allowing")
+        allow = true
+    })
+    r.HandleFunc("/block", func(w http.ResponseWriter, r *http.Request) {
+        println("blocking")
+        allow = false
+    })
+    http.Handle("/", r)
+    println("Starting server")
+    err := http.ListenAndServe(":8888", nil)
+    if err != nil {
+        println(err.Error())
+    }
+}
+
+// handlePost acts as the remote_write sink: it returns 200 while allow is true and 500 otherwise.
+func handlePost(w http.ResponseWriter, r *http.Request) {
+    if allow {
+        return
+    }
+    println("returning 500")
+    w.WriteHeader(500)
+}
diff --git a/cmd/benchmark/normal.river b/cmd/benchmark/normal.river
new file mode 100644
index 000000000000..58b5d429462e
--- /dev/null
+++ b/cmd/benchmark/normal.river
@@ -0,0 +1,78 @@
+logging {
+  level = "debug"
+}
+
+discovery.http "disco" {
+  url = env("DISCOVERY")
+}
+
+// Scrape the normal agent's own metrics.
+prometheus.scrape "scraper" {
+  targets         = concat([{"__address__" = "localhost:12346"}])
+  forward_to      = [prometheus.relabel.mutator.receiver]
+  scrape_interval = "60s"
+}
+
+prometheus.relabel "mutator" {
+  rule {
+    source_labels = ["__name__"]
+    regex         = "(.+)"
+    replacement   = "normal"
+    target_label  = "runtype"
+  }
+  rule {
+    source_labels = ["__name__"]
+    regex         = "(.+)"
+    replacement   = env("NAME")
+    target_label  = "test_name"
+  }
"test_name" + } + rule { + source_labels = ["__name__"] + regex = "(.+)" + replacement = env("ALLOW_WAL") + target_label = "remote_write_enable" + } + rule { + source_labels = ["__name__"] + regex = "(.+)" + replacement = env("DISCOVERY") + target_label = "discovery" + } + + +rule { + source_labels = ["__name__"] + action = "keep" + regex = "(agent_wal_storage_active_series|agent_resources_process_cpu_seconds_total|go_memstats_alloc_bytes|go_gc_duration_seconds_sum|go_gc_duration_seconds_count)" + } + + + forward_to = [prometheus.remote_write.agent_stats.receiver] +} + +prometheus.remote_write "agent_stats" { + endpoint { + url = "https://prometheus-us-central1.grafana.net/api/prom/push" + basic_auth { + username = env("PROM_USERNAME") + password = env("PROM_PASSWORD") + } + } +} + +prometheus.scrape "data" { + targets = discovery.http.disco.targets + forward_to = [prometheus.remote_write.empty.receiver] + scrape_interval = "60s" +} + +prometheus.remote_write "empty" { + endpoint { + url = "http://localhost:8888/post" + } +} + diff --git a/cmd/benchmark/test.river b/cmd/benchmark/test.river new file mode 100644 index 000000000000..ba146d60bdb0 --- /dev/null +++ b/cmd/benchmark/test.river @@ -0,0 +1,33 @@ +// This is meant to mimic handling a single node_exporter instance. +prometheus.test.metrics "single" { + number_of_instances = 1 + number_of_metrics = 2000 + number_of_labels = 5 + metrics_refresh = "10m" + churn_percent = 0.05 +} + +// This is meant to mimic handling many node_exporter instances. +prometheus.test.metrics "many" { + number_of_instances = 1000 + number_of_metrics = 2000 + number_of_labels = 5 + metrics_refresh = "10m" + churn_percent = 0.05 +} + +prometheus.test.metrics "large" { + number_of_instances = 2 + number_of_metrics = 1000000 + number_of_labels = 9 + metrics_refresh = "10m" + churn_percent = 0.05 +} + +prometheus.test.metrics "churn" { + number_of_instances = 2 + number_of_metrics = 200000 + number_of_labels = 12 + metrics_refresh = "10m" + churn_percent = 0.50 +} diff --git a/go.mod b/go.mod index 6c82f970e4b7..6fe72e9a1946 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/grafana/agent -go 1.21.0 +go 1.21 require ( cloud.google.com/go/pubsub v1.33.0