Skip to content

Commit

Permalink
initial benchmark draft
Browse files Browse the repository at this point in the history
  • Loading branch information
mattdurham committed Dec 28, 2023
1 parent 29a4b7c commit 247728d
Show file tree
Hide file tree
Showing 7 changed files with 312 additions and 3 deletions.
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
/.eventcache
vendor
data-agent

/cmd/benchmark/data/
/cmd/benchmark/main
/cmd/benchmark/grafana-agent-flow
/cmd/agent/agent
/cmd/agentctl/agentctl
/cmd/agent-operator/agent-operator
Expand All @@ -24,4 +26,4 @@ cover*.out
.uptodate
node_modules

/docs/variables.mk.local
/docs/variables.mk.local
47 changes: 47 additions & 0 deletions cmd/benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Benchmark notes

These are synthetic benchmarks meant to represent common workloads. These are not meant to be exhaustive or fine grained.
These will give a coarse idea of how the agent behaves in various situations.

## Running the benchmarks

Running `PROM_USERNAME="" PROM_PASSWORD="" ./benchmark.sh` will start the benchmark and run for 8 hours. The duration and type of tests
can be adjusted by editing the `benchmark.sh` file. This will start two Agents and the benchmark runner. Relevant CPU and memory metrics
will be sent to the endpoint described in `normal.river`.

TODO: Add mixin for graph I am using

## Adjusting the benchmark

Each benchmark can be adjusted within `test.river`. These settings allow fine tuning to a specific scenario. Each `prometheus.test.metric` component
exposes a service discovery URL that is used to collect the targets.

## Benchmark categories

### prometheus.test.metrics "single"

This roughly represents a single node exporter and is the simplest use case. Every `10m` 5% of the metrics are replaced driven by `churn_percent`.

### prometheus.test.metrics "many"

This roughly represents scraping many node_exporter instances in say a Kubernetes environment.

### prometheus.test.metrics "large"

This represents scraping 2 very large instances with 1,000,000 series.

### prometheus.test.metrics "churn"

This represents a worst case scenario, 2 large instances with an extremely high churn rate.

## Adjusting the tests

`prometheus.relabel` is often a CPU bottleneck so adding additional rules allows you to test the impact of that.

## Rules

There are existing rules to only send to the prometheus remote write the specific metrics that matter. These are tagged with the `runtype` and the benchmark. For instance `normal-large`.

The benchmark starts an endpoint to consume the metrics from `prometheus.test.metrics`, in half the tests it will return HTTP Status 200 and in the other half will return 500.

TODO add optional pyroscope profiles
16 changes: 16 additions & 0 deletions cmd/benchmark/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Runs the full benchmark suite. Requires PROM_USERNAME / PROM_PASSWORD to be
# exported (consumed by normal.river for remote write).
set -euo pipefail

# Build the benchmark runner binary (produces ./main).
go build ./main.go

# Each test is run with the first argument being the name, the second whether
# the endpoint accepts metrics, the third the duration, and the last the
# discovery endpoint. See test.river for details on each endpoint.
./main churn true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.churn/discovery"
./main churn false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.churn/discovery"

./main single true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.single/discovery"
./main single false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.single/discovery"

./main many true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.many/discovery"
./main many false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.many/discovery"

./main large true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.large/discovery"
./main large false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.large/discovery"
133 changes: 133 additions & 0 deletions cmd/benchmark/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
package main

import (
"fmt"
"net/http"
"os"
"os/exec"
"strconv"
"syscall"
"time"

"github.com/gorilla/mux"
)

// main handles creating the benchmark.
//
// Usage: main <name> <allow-wal> <duration> <discovery-url>
// PROM_USERNAME and PROM_PASSWORD must be set in the environment; they are
// consumed by normal.river's remote_write block.
func main() {
	username := os.Getenv("PROM_USERNAME")
	if username == "" {
		panic("PROM_USERNAME env must be set")
	}
	password := os.Getenv("PROM_PASSWORD")
	if password == "" {
		panic("PROM_PASSWORD env must be set")
	}

	// Validate argument count up front so a bad invocation fails with a
	// usage message instead of an index-out-of-range panic.
	if len(os.Args) < 5 {
		panic("usage: main <name> <allow-wal> <duration> <discovery-url>")
	}

	// Start the HTTP sink server that can swallow remote-write requests.
	go httpServer()
	// Build the agent binary used for both the test and normal agents.
	buildAgent()

	name := os.Args[1]
	// Parse errors were previously ignored, silently running with
	// allow=false and duration=0; fail loudly instead.
	allowWalBool, err := strconv.ParseBool(os.Args[2])
	if err != nil {
		panic("invalid allow-wal argument: " + err.Error())
	}
	parsedDuration, err := time.ParseDuration(os.Args[3])
	if err != nil {
		panic("invalid duration argument: " + err.Error())
	}
	discovery := os.Args[4]
	fmt.Println(name, allowWalBool, parsedDuration, discovery)
	startRun(name, allowWalBool, parsedDuration, discovery)
}

// startRun executes a single benchmark run: it wipes any stale data
// directories, exports the run parameters as environment variables (read by
// the river configs), starts the metrics-producing agent and the agent under
// test, then sleeps for the requested duration before tearing both down.
func startRun(name string, allowWAL bool, run time.Duration, discovery string) {
	os.RemoveAll("./data/normal-data")
	os.RemoveAll("./data/test-data")

	// Controls whether the /post sink accepts or rejects remote writes.
	allow = allowWAL
	_ = os.Setenv("NAME", name)
	_ = os.Setenv("ALLOW_WAL", strconv.FormatBool(allowWAL))
	_ = os.Setenv("DISCOVERY", discovery)

	metric := startMetricsAgent()
	fmt.Println("starting metric agent")
	defer stopAgent(metric, "./data/test-data")

	old := startNormalAgent()
	fmt.Println("starting normal agent")
	defer stopAgent(old, "./data/normal-data")

	time.Sleep(run)
}

// stopAgent tears one agent down: it kills the agent's whole process group
// (the agents are started with Setpgid, so -pid addresses the group), reaps
// the process, and removes its storage directory.
//
// The original code deferred Kill and Release individually; because defers
// run LIFO they executed after Wait had already reaped the process, making
// them no-ops at best. Consolidating into one helper makes the order explicit.
func stopAgent(cmd *exec.Cmd, dataDir string) {
	// Best-effort cleanup: errors are ignored because the process may have
	// already exited by the time the run ends.
	_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
	_ = cmd.Wait()
	_ = os.RemoveAll(dataDir)
}

// buildAgent compiles the grafana-agent-flow binary into the current
// directory, streaming compiler output to this process's stdout/stderr.
// It panics if the build fails, since nothing else can run without it.
func buildAgent() {
	build := exec.Command("go", "build", "../grafana-agent-flow")
	// Own process group, matching how the agents themselves are launched.
	build.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	build.Stdout = os.Stdout
	build.Stderr = os.Stderr
	if err := build.Run(); err != nil {
		panic(err.Error())
	}
}

// startNormalAgent launches the agent under test with normal.river, storing
// its WAL under ./data/normal-data and serving HTTP on 127.0.0.1:12346.
// The returned Cmd has already been started; callers are responsible for
// killing and reaping it. Panics if the process cannot be started.
func startNormalAgent() *exec.Cmd {
	agent := exec.Command(
		"./grafana-agent-flow", "run", "./normal.river",
		"--storage.path=./data/normal-data",
		"--server.http.listen-addr=127.0.0.1:12346",
	)
	// Run in its own process group so the whole tree can be killed at once.
	agent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	if err := agent.Start(); err != nil {
		panic(err.Error())
	}
	return agent
}

// startMetricsAgent launches the synthetic-metrics agent with test.river,
// storing state under ./data/test-data and serving HTTP on 127.0.0.1:9001
// (the address the discovery URLs in benchmark.sh point at). The returned
// Cmd has already been started; callers must kill and reap it. Panics if the
// process cannot be started.
func startMetricsAgent() *exec.Cmd {
	agent := exec.Command(
		"./grafana-agent-flow", "run", "./test.river",
		"--storage.path=./data/test-data",
		"--server.http.listen-addr=127.0.0.1:9001",
	)
	// Run in its own process group so the whole tree can be killed at once.
	agent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	if err := agent.Start(); err != nil {
		panic(err.Error())
	}
	return agent
}

// allow controls whether the /post sink accepts (HTTP 200) or rejects
// (HTTP 500) incoming remote-write payloads. It is set by startRun and
// toggled at runtime via the /allow and /block endpoints.
// NOTE(review): allow is read and written from multiple goroutines without
// synchronization; if precise toggling matters, switch to atomic.Bool.
var allow = false

// httpServer serves the benchmark's metric sink on :8888. /post swallows or
// rejects remote-write payloads depending on allow; /allow and /block flip
// the switch. Blocks forever unless ListenAndServe returns an error.
func httpServer() {
	r := mux.NewRouter()
	r.HandleFunc("/post", handlePost)
	r.HandleFunc("/allow", func(w http.ResponseWriter, r *http.Request) {
		println("allowing")
		allow = true
	})
	r.HandleFunc("/block", func(w http.ResponseWriter, r *http.Request) {
		println("blocking")
		allow = false
	})
	http.Handle("/", r)
	println("Starting server")
	if err := http.ListenAndServe(":8888", nil); err != nil {
		// println(err) would print the interface's internal pointer pair;
		// fmt renders the error message properly.
		fmt.Println("http server exited:", err)
	}
}

// handlePost is the remote-write sink handler: it returns 200 (implicitly,
// by writing nothing) when writes are currently allowed, and 500 otherwise
// so the sending agent must keep buffering samples in its WAL.
func handlePost(w http.ResponseWriter, r *http.Request) {
	if allow {
		return
	}
	println("returning 500")
	w.WriteHeader(http.StatusInternalServerError)
}
78 changes: 78 additions & 0 deletions cmd/benchmark/normal.river
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@


// Emit debug-level logs from the agent under test.
logging {
level = "debug"
}


// Discover scrape targets from the metrics agent's test component; the URL
// is injected by the benchmark runner via the DISCOVERY env var.
discovery.http "disco" {
url = env("DISCOVERY")
}


// Self-scrape: collect this agent's own metrics from its HTTP server.
prometheus.scrape "scraper" {
targets = concat([{"__address__" = "localhost:12346"}])
forward_to = [prometheus.relabel.mutator.receiver]
scrape_interval = "60s"
}


// Tag every self-scraped series with the benchmark run parameters, then keep
// only the handful of CPU/memory/WAL metrics the benchmark dashboards use.
prometheus.relabel "mutator" {
rule {
source_labels = ["__name__"]
regex = "(.+)"
replacement = "normal"
target_label = "runtype"
}
rule {
source_labels = ["__name__"]
regex = "(.+)"
replacement = env("NAME")
target_label = "test_name"
}
rule {
source_labels = ["__name__"]
regex = "(.+)"
replacement = env("ALLOW_WAL")
target_label = "remote_write_enable"
}
rule {
source_labels = ["__name__"]
regex = "(.+)"
replacement = env("DISCOVERY")
target_label = "discovery"
}


// Drop everything except the metrics that matter for the benchmark graphs.
rule {
source_labels = ["__name__"]
action = "keep"
regex = "(agent_wal_storage_active_series|agent_resources_process_cpu_seconds_total|go_memstats_alloc_bytes|go_gc_duration_seconds_sum|go_gc_duration_seconds_count)"
}


forward_to = [prometheus.remote_write.agent_stats.receiver]
}

// Ship the benchmark's own resource metrics to the real Prometheus endpoint;
// credentials come from the PROM_USERNAME/PROM_PASSWORD env vars.
prometheus.remote_write "agent_stats" {
endpoint {
url = "https://prometheus-us-central1.grafana.net/api/prom/push"
basic_auth {
username = env("PROM_USERNAME")
password = env("PROM_PASSWORD")
}
}
}

// The benchmark workload itself: scrape the synthetic targets discovered
// above and forward them to the local sink.
prometheus.scrape "data" {
targets = discovery.http.disco.targets
forward_to = [prometheus.remote_write.empty.receiver]
scrape_interval = "60s"
}

// Sink endpoint served by the benchmark runner (main.go); it returns 200 or
// 500 depending on the run's allow-wal setting.
prometheus.remote_write "empty" {
endpoint {
url = "http://localhost:8888/post"
}
}

33 changes: 33 additions & 0 deletions cmd/benchmark/test.river
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// This is meant to mimic handling a single node_exporter instance.
prometheus.test.metrics "single" {
number_of_instances = 1
number_of_metrics = 2000
number_of_labels = 5
metrics_refresh = "10m"
churn_percent = 0.05
}

// This is meant to mimic handling many node_exporter instances.
prometheus.test.metrics "many" {
number_of_instances = 1000
number_of_metrics = 2000
number_of_labels = 5
metrics_refresh = "10m"
churn_percent = 0.05
}

// This is meant to mimic scraping a small number of very large instances
// (1,000,000 series each).
prometheus.test.metrics "large" {
number_of_instances = 2
number_of_metrics = 1000000
number_of_labels = 9
metrics_refresh = "10m"
churn_percent = 0.05
}

// Worst-case scenario: two large instances with an extremely high churn
// rate (50% of series replaced every refresh).
prometheus.test.metrics "churn" {
number_of_instances = 2
number_of_metrics = 200000
number_of_labels = 12
metrics_refresh = "10m"
churn_percent = 0.50
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/grafana/agent

go 1.21.0
go 1.21

require (
cloud.google.com/go/pubsub v1.33.0
Expand Down

0 comments on commit 247728d

Please sign in to comment.