Merge branch 'test_component' of github.com:grafana/agent into test_component
grafana · Dec 30, 2023 · 266bc3f · 266bc3f
2 parents bdf2fb3 + dab4190
commit 266bc3f
Showing 15 changed files with 790 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,7 +7,9 @@
 /.eventcache
 vendor
 data-agent
-
+/cmd/benchmark/data/
+/cmd/benchmark/main
+/cmd/benchmark/grafana-agent-flow
 /cmd/agent/agent
 /cmd/agentctl/agentctl
 /cmd/agent-operator/agent-operator
@@ -24,4 +26,4 @@ cover*.out
 .uptodate
 node_modules
 
-/docs/variables.mk.local
+/docs/variables.mk.local
diff --git a/cmd/benchmark/README.md b/cmd/benchmark/README.md
@@ -0,0 +1,47 @@
+# Benchmark notes
+
+These are synthetic benchmarks meant to represent common workloads. These are not meant to be exhaustive or fine grained.
+These will give a coarse idea of how the agent behaves in these situations.
+
+## Running the benchmarks
+
+Running `PROM_USERNAME="" PROM_PASSWORD="" ./benchmark.sh` will start the benchmark and run for 8 hours. The duration and type of tests
+can be adjusted by editing the `benchmark.sh` file. This will start two Agents and the benchmark runner. Relevant CPU and memory metrics
+will be sent to the endpoint described in `normal.river`.
+
+TODO: Add mixin for graph I am using
+
+## Adjusting the benchmark
+
+Each benchmark can be adjusted within `test.river`. These settings allow fine tuning to a specific scenario. Each `prometheus.test.metrics` component
+exposes a service discovery URL that is used to collect the targets.
+
+## Benchmark categories
+
+### prometheus.test.metrics "single"
+
+This roughly represents a single node exporter and is the simplest use case. Every `10m`, 5% of the metrics are replaced, driven by `churn_percent`.
+
+### prometheus.test.metrics "many"
+
+This roughly represents scraping many node_exporter instances in say a Kubernetes environment.
+
+### prometheus.test.metrics "large"
+
+This represents scraping 2 very large instances with 1,000,000 series.
+
+### prometheus.test.metrics "churn"
+
+This represents a worst case scenario, 2 large instances with an extremely high churn rate.
+
+## Adjusting the tests
+
+`prometheus.relabel` is often a CPU bottleneck so adding additional rules allows you to test the impact of that.
+
+## Rules
+
+There are existing rules to only send to the prometheus remote write the specific metrics that matter. These are tagged with the `runtype` and the benchmark. For instance `normal-large`.
+
+The benchmark starts an endpoint to consume the metrics from `prometheus.test.metrics`, in half the tests it will return HTTP Status 200 and in the other half will return 500.
+
+TODO add optional pyroscope profiles
diff --git a/cmd/benchmark/benchmark.sh b/cmd/benchmark/benchmark.sh
@@ -0,0 +1,16 @@
+go build -o main
+
+# each test is ran with the first argument being the name , the second whether the endpoint accepts metrics, the third for the duration and the last being the discovery
+# endpont. See test.river for details on each endpoint.
+./main metrics churn true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.churn/discovery"
+./main metrics churn false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.churn/discovery"
+
+./main metrics single true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.single/discovery" 
+./main metrics single false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.single/discovery"
+
+./main metrics many true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.many/discovery" 
+./main metrics many false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.many/discovery"
+
+./main metrics large true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.large/discovery" 
+./main metrics large false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.large/discovery"
+
diff --git a/cmd/benchmark/logs.go b/cmd/benchmark/logs.go
@@ -0,0 +1,58 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"syscall"
+	"time"
+)
+
+// startLogsRun runs the logs benchmark for the given duration.
+//
+// It forces the local HTTP sink to accept requests, resets ./data/, exports
+// NAME=logs for the River configs, starts the log-generating agent and the
+// agent under test, sleeps for run, and finally kills both agents and removes
+// ./data/ again. It panics if either agent cannot be started.
+func startLogsRun(run time.Duration) {
+	allow = true
+
+	// Start from an empty data directory: clear leftovers first, then recreate
+	// it. Both calls are best-effort, as is the final cleanup.
+	_ = os.RemoveAll("./data/")
+	_ = os.MkdirAll("./data/", 0777)
+	defer func() { _ = os.RemoveAll("./data/") }()
+
+	_ = os.Setenv("NAME", "logs")
+
+	// stop kills the agent's whole process group and reaps it. The agents are
+	// started with Setpgid, so the group id equals the agent's pid. The pid
+	// must be read before Wait: Wait (like Release) invalidates the Process,
+	// after which Pid is -1 and a group kill would target pid 1 instead.
+	stop := func(cmd *exec.Cmd) {
+		_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		// The error is expected ("signal: killed"); Wait is only for reaping.
+		_ = cmd.Wait()
+	}
+
+	gen := startLogsGenAgent()
+	defer stop(gen)
+	old := startLogsAgent()
+	defer stop(old)
+	fmt.Println("starting logs agent")
+
+	time.Sleep(run)
+}
+
+// startLogsAgent launches the agent under test with ./logs.river, storing its
+// data in ./data/logs and serving HTTP on 127.0.0.1:12346. The child's output
+// is forwarded to this process's stdout/stderr. It returns the started command
+// and panics if the process cannot be started.
+func startLogsAgent() *exec.Cmd {
+	agent := exec.Command(
+		"./grafana-agent-flow",
+		"run",
+		"./logs.river",
+		"--storage.path=./data/logs",
+		"--server.http.listen-addr=127.0.0.1:12346",
+	)
+	// Put the agent in its own process group so the group can be signalled.
+	agent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	agent.Stdout, agent.Stderr = os.Stdout, os.Stderr
+
+	if err := agent.Start(); err != nil {
+		panic(err.Error())
+	}
+	return agent
+}
+
+// startLogsGenAgent launches the log-generating agent with ./logsgen.river,
+// storing its data in ./data/logs-gen and serving HTTP on 127.0.0.1:12349.
+// The child's output is forwarded to this process's stdout/stderr. It returns
+// the started command and panics if the process cannot be started.
+func startLogsGenAgent() *exec.Cmd {
+	generator := exec.Command(
+		"./grafana-agent-flow",
+		"run",
+		"./logsgen.river",
+		"--storage.path=./data/logs-gen",
+		"--server.http.listen-addr=127.0.0.1:12349",
+	)
+	// Put the agent in its own process group so the group can be signalled.
+	generator.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	generator.Stdout, generator.Stderr = os.Stdout, os.Stderr
+
+	if err := generator.Start(); err != nil {
+		panic(err.Error())
+	}
+	return generator
+}
diff --git a/cmd/benchmark/logs.river b/cmd/benchmark/logs.river
@@ -0,0 +1,62 @@
+// Configuration for the agent under test in the logs benchmark. It tails the
+// generated log files and ships them to the benchmark's local HTTP endpoint,
+// while scraping its own metrics and remote-writing a filtered subset.
+logging {
+	level = "debug"
+}
+
+// Scrape localhost:12346 once a minute; this is the address the benchmark
+// passes to this agent as --server.http.listen-addr.
+prometheus.scrape "scraper" {
+	targets         = concat([{"__address__" = "localhost:12346"}])
+	forward_to      = [prometheus.relabel.mutator.receiver]
+	scrape_interval = "60s"
+}
+
+prometheus.relabel "mutator" {
+	// Tag every series with runtype="normal".
+	rule {
+		source_labels = ["__name__"]
+		regex         = "(.+)"
+		replacement   = "normal"
+		target_label  = "runtype"
+	}
+
+	// Tag every series with the test name taken from the NAME environment
+	// variable, which the benchmark runner sets before starting the agent.
+	rule {
+		source_labels = ["__name__"]
+		regex         = "(.+)"
+		replacement   = env("NAME")
+		target_label  = "test_name"
+	}
+
+	// Keep only the resource and Loki throughput metrics listed here.
+	rule {
+		source_labels = ["__name__"]
+		action        = "keep"
+		regex         = "(agent_wal_storage_active_series|agent_resources_process_cpu_seconds_total|go_memstats_alloc_bytes|go_gc_duration_seconds_sum|go_gc_duration_seconds_count|loki_source_file_files_active_total|loki_write_encoded_bytes_total|loki_write_sent_bytes_total|loki_source_file_file_bytes_total)"
+	}
+
+	forward_to = [prometheus.remote_write.agent_stats.receiver]
+}
+
+// Remote-write the kept metrics, authenticating with the PROM_USERNAME and
+// PROM_PASSWORD environment variables.
+prometheus.remote_write "agent_stats" {
+	endpoint {
+		url = "https://prometheus-us-central1.grafana.net/api/prom/push"
+
+		basic_auth {
+			username = env("PROM_USERNAME")
+			password = env("PROM_PASSWORD")
+		}
+	}
+}
+
+
+// Match the *.log files under the log generator agent's storage path
+// (./data/logs-gen).
+local.file_match "logs" {
+	path_targets = [
+		{__path__ = "./data/logs-gen/loki.test.logs.logs/*.log"},
+	]
+}
+
+loki.source.file "tmpfiles" {
+	targets    = local.file_match.logs.targets
+	forward_to = [loki.write.local.receiver]
+}
+
+// Push log entries to the /post endpoint on port 8888, which the benchmark
+// runner serves.
+loki.write "local" {
+	endpoint {
+		url = "http://localhost:8888/post"
+	}
+}
diff --git a/cmd/benchmark/logs.sh b/cmd/benchmark/logs.sh
@@ -0,0 +1,5 @@
+go build -o main  
+
+# each test is ran with the first argument being the name , the second whether the endpoint accepts metrics, the third for the duration and the last being the discovery
+# endpont. See test.river for details on each endpoint.
+./main logs 1h
diff --git a/cmd/benchmark/logsgen.river b/cmd/benchmark/logsgen.river
@@ -0,0 +1,6 @@
+// Configuration for the log-generating agent in the logs benchmark.
+// NOTE(review): logs.river tails ./data/logs-gen/loki.test.logs.logs/*.log,
+// which is presumably where this component writes its files; confirm against
+// the loki.test.logs component. The exact meaning of each setting below is
+// defined by that component and is not visible here.
+loki.test.logs "logs" {
+    number_of_files = 100
+    file_churn_percent = .25
+    file_refresh = "1m"
+    writes_per_cadence = 1000
+}
diff --git a/cmd/benchmark/main.go b/cmd/benchmark/main.go
@@ -0,0 +1,145 @@
+package main
+
+import (
+	"fmt"
+	"net/http"
+	"os"
+	"os/exec"
+	"strconv"
+	"syscall"
+	"time"
+
+	"github.com/gorilla/mux"
+)
+
+// main handles creating the benchmark.
+func main() {
+	username := os.Getenv("PROM_USERNAME")
+	if username == "" {
+		panic("PROM_USERNAME env must be set")
+	}
+	password := os.Getenv("PROM_PASSWORD")
+	if password == "" {
+		panic("PROM_PASSWORD env must be set")
+	}
+
+	// Start the HTTP server, that can swallow requests.
+	go httpServer()
+	// Build the agent
+	buildAgent()
+
+	benchType := os.Args[1]
+	if benchType == "metrics" {
+		name := os.Args[2]
+		allowWal := os.Args[3]
+		duration := os.Args[4]
+		discovery := os.Args[5]
+		allowWalBool, _ := strconv.ParseBool(allowWal)
+		parsedDuration, _ := time.ParseDuration(duration)
+		fmt.Println(name, allowWalBool, parsedDuration, discovery)
+
+		startMetricsRun(name, allowWalBool, parsedDuration, discovery)
+	} else if benchType == "logs" {
+		duration := os.Args[2]
+		parsedDuration, _ := time.ParseDuration(duration)
+		startLogsRun(parsedDuration)
+	} else {
+		panic("unknown benchmark type")
+	}
+}
+
+// startMetricsRun runs one metrics benchmark for the given duration.
+//
+// name, allowWAL and discovery are exported to the agents through the NAME,
+// ALLOW_WAL and DISCOVERY environment variables. allowWAL also decides whether
+// the local /post endpoint accepts requests (true) or answers 500 (false).
+// The function clears both agents' data directories, starts the metrics
+// agent and the normal agent, sleeps for run, then kills both agents and
+// removes their data directories again. It panics if an agent cannot start.
+func startMetricsRun(name string, allowWAL bool, run time.Duration, discovery string) {
+	_ = os.RemoveAll("./data/normal-data")
+	_ = os.RemoveAll("./data/test-data")
+
+	allow = allowWAL
+	_ = os.Setenv("NAME", name)
+	_ = os.Setenv("ALLOW_WAL", strconv.FormatBool(allowWAL))
+	_ = os.Setenv("DISCOVERY", discovery)
+
+	// stop kills the agent's whole process group, reaps it and removes its
+	// data directory. The agents are started with Setpgid, so the group id
+	// equals the agent's pid. The pid must be read before Wait: Wait (like
+	// Release) invalidates the Process, after which Pid is -1 and a group
+	// kill would target pid 1 instead.
+	stop := func(cmd *exec.Cmd, dataDir string) {
+		_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
+		// The error is expected ("signal: killed"); Wait is only for reaping.
+		_ = cmd.Wait()
+		_ = os.RemoveAll(dataDir)
+	}
+
+	metric := startMetricsAgent()
+	fmt.Println("starting metric agent")
+	defer stop(metric, "./data/test-data")
+
+	old := startNormalAgent()
+	fmt.Println("starting normal agent")
+	defer stop(old, "./data/normal-data")
+
+	time.Sleep(run)
+}
+
+// buildAgent runs `go build ../grafana-agent-flow` and waits for it to finish,
+// forwarding the compiler's output to this process's stdout/stderr. It panics
+// if the build fails.
+func buildAgent() {
+	build := exec.Command("go", "build", "../grafana-agent-flow")
+	build.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	build.Stdout, build.Stderr = os.Stdout, os.Stderr
+
+	if err := build.Run(); err != nil {
+		panic(err.Error())
+	}
+}
+
+// startNormalAgent launches an agent with ./normal.river, storing its data in
+// ./data/normal-data and serving HTTP on 127.0.0.1:12346. The child's output
+// is not forwarded. It returns the started command and panics if the process
+// cannot be started.
+func startNormalAgent() *exec.Cmd {
+	agent := exec.Command(
+		"./grafana-agent-flow",
+		"run",
+		"./normal.river",
+		"--storage.path=./data/normal-data",
+		"--server.http.listen-addr=127.0.0.1:12346",
+	)
+	// Put the agent in its own process group so the group can be signalled.
+	agent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+
+	if err := agent.Start(); err != nil {
+		panic(err.Error())
+	}
+	return agent
+}
+
+// startMetricsAgent launches an agent with ./test.river, storing its data in
+// ./data/test-data and serving HTTP on 127.0.0.1:9001. The child's output is
+// not forwarded. It returns the started command and panics if the process
+// cannot be started.
+func startMetricsAgent() *exec.Cmd {
+	agent := exec.Command(
+		"./grafana-agent-flow",
+		"run",
+		"./test.river",
+		"--storage.path=./data/test-data",
+		"--server.http.listen-addr=127.0.0.1:9001",
+	)
+	// Put the agent in its own process group so the group can be signalled.
+	agent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+
+	if err := agent.Start(); err != nil {
+		panic(err.Error())
+	}
+	return agent
+}
+
+// allow controls how the /post endpoint answers: when true the request is
+// accepted, when false it is answered with HTTP 500. It is set by the
+// benchmark runs and by the /allow and /block endpoints.
+//
+// NOTE(review): the flag is written and read from different goroutines
+// without synchronisation.
+var allow = false
+
+// httpServer serves the benchmark's local HTTP sink on :8888 and blocks until
+// the server stops.
+//
+// Routes:
+//   - /post  answers according to the allow flag (see handlePost)
+//   - /allow sets allow to true
+//   - /block sets allow to false
+//
+// If the server fails (for example because the port is in use) the error
+// message is printed and the function returns.
+func httpServer() {
+	r := mux.NewRouter()
+	r.HandleFunc("/post", handlePost)
+	r.HandleFunc("/allow", func(http.ResponseWriter, *http.Request) {
+		println("allowing")
+		allow = true
+	})
+	r.HandleFunc("/block", func(http.ResponseWriter, *http.Request) {
+		println("blocking")
+		allow = false
+	})
+	http.Handle("/", r)
+	println("Starting server")
+	if err := http.ListenAndServe(":8888", nil); err != nil {
+		// The builtin println prints an interface value as raw pointer words,
+		// so print the message text explicitly.
+		println(err.Error())
+	}
+}
+
+// handlePost answers a /post request according to the allow flag: when allow
+// is set nothing is written, which leaves the default 200 response; otherwise
+// it logs and replies with HTTP 500. The request itself is ignored.
+func handlePost(w http.ResponseWriter, _ *http.Request) {
+	if !allow {
+		println("returning 500")
+		w.WriteHeader(http.StatusInternalServerError)
+	}
+}