Skip to content

Commit

Permalink
initial benchmark draft
Browse files Browse the repository at this point in the history
  • Loading branch information
mattdurham committed Dec 28, 2023
1 parent 29a4b7c commit 247728d
Show file tree
Hide file tree
Showing 7 changed files with 312 additions and 3 deletions.
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
/.eventcache
vendor
data-agent

/cmd/benchmark/data/
/cmd/benchmark/main
/cmd/benchmark/grafana-agent-flow
/cmd/agent/agent
/cmd/agentctl/agentctl
/cmd/agent-operator/agent-operator
Expand All @@ -24,4 +26,4 @@ cover*.out
.uptodate
node_modules

/docs/variables.mk.local
/docs/variables.mk.local
47 changes: 47 additions & 0 deletions cmd/benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Benchmark notes

These are synthetic benchmarks meant to represent common workloads. These are not meant to be exhaustive or fine grained.
These will give a coarse idea of how the agent behaves in various situations.

## Running the benchmarks

Running `PROM_USERNAME="" PROM_PASSWORD="" ./benchmark.sh` will start the benchmark and run for 8 hours. The duration and type of tests
can be adjusted by editing the `benchmark.sh` file. This will start two Agents and the benchmark runner. Relevant CPU and memory metrics
will be sent to the endpoint described in `normal.river`.

TODO: Add mixin for graph I am using

## Adjusting the benchmark

Each benchmark can be adjusted within `test.river`. These settings allow fine tuning to a specific scenario. Each `prometheus.test.metric` component
exposes a service discovery URL that is used to collect the targets.

## Benchmark categories

### prometheus.test.metrics "single"

This roughly represents a single node exporter and is the simplest use case. Every `10m` 5% of the metrics are replaced driven by `churn_percent`.

### prometheus.test.metrics "many"

This roughly represents scraping many node_exporter instances in say a Kubernetes environment.

### prometheus.test.metrics "large"

This represents scraping 2 very large instances with 1,000,000 series.

### prometheus.test.metrics "churn"

This represents a worst case scenario, 2 large instances with an extremely high churn rate.

## Adjusting the tests

`prometheus.relabel` is often a CPU bottleneck so adding additional rules allows you to test the impact of that.

## Rules

There are existing rules to only send to the prometheus remote write the specific metrics that matter. These are tagged with the `runtype` and the benchmark. For instance `normal-large`.

The benchmark starts an endpoint to consume the metrics from `prometheus.test.metrics`, in half the tests it will return HTTP Status 200 and in the other half will return 500.

TODO add optional pyroscope profiles
16 changes: 16 additions & 0 deletions cmd/benchmark/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Runs the full benchmark suite. Requires PROM_USERNAME / PROM_PASSWORD to be
# exported (consumed by normal.river for remote write).
set -euo pipefail

# Build the benchmark runner binary (produces ./main).
go build ./main.go

# Each test is run with the first argument being the name, the second whether
# the endpoint accepts metrics, the third the duration, and the last the
# discovery endpoint. See test.river for details on each endpoint.
./main churn true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.churn/discovery"
./main churn false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.churn/discovery"

./main single true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.single/discovery"
./main single false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.single/discovery"

./main many true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.many/discovery"
./main many false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.many/discovery"

./main large true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.large/discovery"
./main large false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.large/discovery"
133 changes: 133 additions & 0 deletions cmd/benchmark/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
package main

import (
"fmt"
"net/http"
"os"
"os/exec"
"strconv"
"syscall"
"time"

"github.com/gorilla/mux"
)

// main handles creating the benchmark.
//
// Usage: main <name> <allow-wal> <duration> <discovery-url>
// PROM_USERNAME and PROM_PASSWORD must be set in the environment; they are
// consumed by normal.river's remote_write block.
func main() {
	username := os.Getenv("PROM_USERNAME")
	if username == "" {
		panic("PROM_USERNAME env must be set")
	}
	password := os.Getenv("PROM_PASSWORD")
	if password == "" {
		panic("PROM_PASSWORD env must be set")
	}

	// Validate argument count up front so a bad invocation fails with a
	// usage message instead of an index-out-of-range panic.
	if len(os.Args) < 5 {
		panic("usage: main <name> <allow-wal> <duration> <discovery-url>")
	}

	// Start the HTTP sink server that can swallow remote-write requests.
	go httpServer()
	// Build the agent binary used for both the test and normal agents.
	buildAgent()

	name := os.Args[1]
	// Parse errors were previously ignored, silently running with
	// allow=false and duration=0; fail loudly instead.
	allowWalBool, err := strconv.ParseBool(os.Args[2])
	if err != nil {
		panic("invalid allow-wal argument: " + err.Error())
	}
	parsedDuration, err := time.ParseDuration(os.Args[3])
	if err != nil {
		panic("invalid duration argument: " + err.Error())
	}
	discovery := os.Args[4]
	fmt.Println(name, allowWalBool, parsedDuration, discovery)
	startRun(name, allowWalBool, parsedDuration, discovery)
}

// startRun executes a single benchmark run: it wipes any stale data
// directories, exports the run parameters as environment variables (read by
// the river configs), starts the metrics-producing agent and the agent under
// test, then sleeps for the requested duration before tearing both down.
func startRun(name string, allowWAL bool, run time.Duration, discovery string) {
	os.RemoveAll("./data/normal-data")
	os.RemoveAll("./data/test-data")

	// Controls whether the /post sink accepts or rejects remote writes.
	allow = allowWAL
	_ = os.Setenv("NAME", name)
	_ = os.Setenv("ALLOW_WAL", strconv.FormatBool(allowWAL))
	_ = os.Setenv("DISCOVERY", discovery)

	metric := startMetricsAgent()
	fmt.Println("starting metric agent")
	defer stopAgent(metric, "./data/test-data")

	old := startNormalAgent()
	fmt.Println("starting normal agent")
	defer stopAgent(old, "./data/normal-data")

	time.Sleep(run)
}

// stopAgent tears one agent down: it kills the agent's whole process group
// (the agents are started with Setpgid, so -pid addresses the group), reaps
// the process, and removes its storage directory.
//
// The original code deferred Kill and Release individually; because defers
// run LIFO they executed after Wait had already reaped the process, making
// them no-ops at best. Consolidating into one helper makes the order explicit.
func stopAgent(cmd *exec.Cmd, dataDir string) {
	// Best-effort cleanup: errors are ignored because the process may have
	// already exited by the time the run ends.
	_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
	_ = cmd.Wait()
	_ = os.RemoveAll(dataDir)
}

// buildAgent compiles the grafana-agent-flow binary into the current
// directory, streaming compiler output to this process's stdout/stderr.
// It panics if the build fails, since nothing else can run without it.
func buildAgent() {
	build := exec.Command("go", "build", "../grafana-agent-flow")
	// Own process group, matching how the agents themselves are launched.
	build.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	build.Stdout = os.Stdout
	build.Stderr = os.Stderr
	if err := build.Run(); err != nil {
		panic(err.Error())
	}
}

// startNormalAgent launches the agent under test with normal.river, storing
// its WAL under ./data/normal-data and serving HTTP on 127.0.0.1:12346.
// The returned Cmd has already been started; callers are responsible for
// killing and reaping it. Panics if the process cannot be started.
func startNormalAgent() *exec.Cmd {
	agent := exec.Command(
		"./grafana-agent-flow", "run", "./normal.river",
		"--storage.path=./data/normal-data",
		"--server.http.listen-addr=127.0.0.1:12346",
	)
	// Run in its own process group so the whole tree can be killed at once.
	agent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	if err := agent.Start(); err != nil {
		panic(err.Error())
	}
	return agent
}

// startMetricsAgent launches the synthetic-metrics agent with test.river,
// storing state under ./data/test-data and serving HTTP on 127.0.0.1:9001
// (the address the discovery URLs in benchmark.sh point at). The returned
// Cmd has already been started; callers must kill and reap it. Panics if the
// process cannot be started.
func startMetricsAgent() *exec.Cmd {
	agent := exec.Command(
		"./grafana-agent-flow", "run", "./test.river",
		"--storage.path=./data/test-data",
		"--server.http.listen-addr=127.0.0.1:9001",
	)
	// Run in its own process group so the whole tree can be killed at once.
	agent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	if err := agent.Start(); err != nil {
		panic(err.Error())
	}
	return agent
}

// allow controls whether the /post sink accepts (HTTP 200) or rejects
// (HTTP 500) incoming remote-write payloads. It is set by startRun and
// toggled at runtime via the /allow and /block endpoints.
// NOTE(review): allow is read and written from multiple goroutines without
// synchronization; if precise toggling matters, switch to atomic.Bool.
var allow = false

// httpServer serves the benchmark's metric sink on :8888. /post swallows or
// rejects remote-write payloads depending on allow; /allow and /block flip
// the switch. Blocks forever unless ListenAndServe returns an error.
func httpServer() {
	r := mux.NewRouter()
	r.HandleFunc("/post", handlePost)
	r.HandleFunc("/allow", func(w http.ResponseWriter, r *http.Request) {
		println("allowing")
		allow = true
	})
	r.HandleFunc("/block", func(w http.ResponseWriter, r *http.Request) {
		println("blocking")
		allow = false
	})
	http.Handle("/", r)
	println("Starting server")
	if err := http.ListenAndServe(":8888", nil); err != nil {
		// println(err) would print the interface's internal pointer pair;
		// fmt renders the error message properly.
		fmt.Println("http server exited:", err)
	}
}

// handlePost is the remote-write sink handler: it returns 200 (implicitly,
// by writing nothing) when writes are currently allowed, and 500 otherwise
// so the sending agent must keep buffering samples in its WAL.
func handlePost(w http.ResponseWriter, r *http.Request) {
	if allow {
		return
	}
	println("returning 500")
	w.WriteHeader(http.StatusInternalServerError)
}
78 changes: 78 additions & 0 deletions cmd/benchmark/normal.river
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@


// Emit debug-level logs from the agent under test.
logging {
level = "debug"
}


// Discover scrape targets from the metrics agent's test component; the URL
// is injected by the benchmark runner via the DISCOVERY env var.
discovery.http "disco" {
url = env("DISCOVERY")
}


// Self-scrape: collect this agent's own metrics from its HTTP server.
prometheus.scrape "scraper" {
targets = concat([{"__address__" = "localhost:12346"}])
forward_to = [prometheus.relabel.mutator.receiver]
scrape_interval = "60s"
}


// Tag every self-scraped series with the benchmark run parameters, then keep
// only the handful of CPU/memory/WAL metrics the benchmark dashboards use.
prometheus.relabel "mutator" {
rule {
source_labels = ["__name__"]
regex = "(.+)"
replacement = "normal"
target_label = "runtype"
}
rule {
source_labels = ["__name__"]
regex = "(.+)"
replacement = env("NAME")
target_label = "test_name"
}
rule {
source_labels = ["__name__"]
regex = "(.+)"
replacement = env("ALLOW_WAL")
target_label = "remote_write_enable"
}
rule {
source_labels = ["__name__"]
regex = "(.+)"
replacement = env("DISCOVERY")
target_label = "discovery"
}


// Drop everything except the metrics that matter for the benchmark graphs.
rule {
source_labels = ["__name__"]
action = "keep"
regex = "(agent_wal_storage_active_series|agent_resources_process_cpu_seconds_total|go_memstats_alloc_bytes|go_gc_duration_seconds_sum|go_gc_duration_seconds_count)"
}


forward_to = [prometheus.remote_write.agent_stats.receiver]
}

// Ship the benchmark's own resource metrics to the real Prometheus endpoint;
// credentials come from the PROM_USERNAME/PROM_PASSWORD env vars.
prometheus.remote_write "agent_stats" {
endpoint {
url = "https://prometheus-us-central1.grafana.net/api/prom/push"
basic_auth {
username = env("PROM_USERNAME")
password = env("PROM_PASSWORD")
}
}
}

// The benchmark workload itself: scrape the synthetic targets discovered
// above and forward them to the local sink.
prometheus.scrape "data" {
targets = discovery.http.disco.targets
forward_to = [prometheus.remote_write.empty.receiver]
scrape_interval = "60s"
}

// Sink endpoint served by the benchmark runner (main.go); it returns 200 or
// 500 depending on the run's allow-wal setting.
prometheus.remote_write "empty" {
endpoint {
url = "http://localhost:8888/post"
}
}

33 changes: 33 additions & 0 deletions cmd/benchmark/test.river
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// This is meant to mimic handling a single node_exporter instance.
prometheus.test.metrics "single" {
number_of_instances = 1
number_of_metrics = 2000
number_of_labels = 5
metrics_refresh = "10m"
churn_percent = 0.05
}

// This is meant to mimic handling many node_exporter instances.
prometheus.test.metrics "many" {
number_of_instances = 1000
number_of_metrics = 2000
number_of_labels = 5
metrics_refresh = "10m"
churn_percent = 0.05
}

// This is meant to mimic scraping a small number of very large instances
// (1,000,000 series each).
prometheus.test.metrics "large" {
number_of_instances = 2
number_of_metrics = 1000000
number_of_labels = 9
metrics_refresh = "10m"
churn_percent = 0.05
}

// Worst-case scenario: two large instances with an extremely high churn
// rate (50% of series replaced every refresh).
prometheus.test.metrics "churn" {
number_of_instances = 2
number_of_metrics = 200000
number_of_labels = 12
metrics_refresh = "10m"
churn_percent = 0.50
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/grafana/agent

go 1.21.0
go 1.21

require (
cloud.google.com/go/pubsub v1.33.0
Expand Down

0 comments on commit 247728d

Please sign in to comment.