Skip to content

Commit

Permalink
OMerge branch 'test_component' of github.com:grafana/agent into test_…
Browse files Browse the repository at this point in the history
…component
mattdurham committed Dec 30, 2023
2 parents bdf2fb3 + dab4190 commit 266bc3f
Showing 15 changed files with 790 additions and 3 deletions.
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -7,7 +7,9 @@
/.eventcache
vendor
data-agent

/cmd/benchmark/data/
/cmd/benchmark/main
/cmd/benchmark/grafana-agent-flow
/cmd/agent/agent
/cmd/agentctl/agentctl
/cmd/agent-operator/agent-operator
@@ -24,4 +26,4 @@ cover*.out
.uptodate
node_modules

/docs/variables.mk.local
/docs/variables.mk.local
47 changes: 47 additions & 0 deletions cmd/benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Benchmark notes

These are synthetic benchmarks meant to represent common workloads. These are not meant to be exhaustive or fine grained.
These will give a coarse idea of how the agent behaves in these situations.

## Running the benchmarks

Running `PROM_USERNAME="" PROM_PASSWORD="" ./benchmark.sh` will start the benchmark and run for 8 hours. The duration and type of tests
can be adjusted by editing the `benchmark.sh` file. This will start two Agents and the benchmark runner. Relevant CPU and memory metrics
will be sent to the endpoint described in `normal.river`.

TODO: Add mixin for graph I am using

## Adjusting the benchmark

Each benchmark can be adjusted within `test.river`. These settings allow fine tuning to a specific scenario. Each `prometheus.test.metrics` component
exposes a service discovery URL that is used to collect the targets.

## Benchmark categories

### prometheus.test.metrics "single"

This roughly represents a single node exporter and is the simplest use case. Every `10m`, 5% of the metrics are replaced, as driven by `churn_percent`.

### prometheus.test.metrics "many"

This roughly represents scraping many node_exporter instances in say a Kubernetes environment.

### prometheus.test.metrics "large"

This represents scraping 2 very large instances with 1,000,000 series.

### prometheus.test.metrics "churn"

This represents a worst case scenario, 2 large instances with an extremely high churn rate.

## Adjusting the tests

`prometheus.relabel` is often a CPU bottleneck so adding additional rules allows you to test the impact of that.

## Rules

There are existing rules to only send to the prometheus remote write the specific metrics that matter. These are tagged with the `runtype` and the benchmark. For instance `normal-large`.

The benchmark starts an endpoint to consume the metrics from `prometheus.test.metrics`, in half the tests it will return HTTP Status 200 and in the other half will return 500.

TODO add optional pyroscope profiles
16 changes: 16 additions & 0 deletions cmd/benchmark/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/sh
# Builds the benchmark runner and then executes each metrics benchmark twice:
# once with the stub endpoint accepting writes and once with it rejecting them.

# Stop here if the build fails so a stale ./main binary is never benchmarked.
go build -o main || exit 1

# Arguments passed to ./main for a metrics run, in order:
#   1. the benchmark type ("metrics")
#   2. the benchmark name (exported to the agents as NAME)
#   3. whether the stub endpoint accepts metrics (true) or returns HTTP 500 (false)
#   4. the duration of the run
#   5. the discovery endpoint of the component under test
# See test.river for details on each endpoint.
./main metrics churn true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.churn/discovery"
./main metrics churn false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.churn/discovery"

./main metrics single true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.single/discovery"
./main metrics single false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.single/discovery"

./main metrics many true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.many/discovery"
./main metrics many false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.many/discovery"

./main metrics large true 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.large/discovery"
./main metrics large false 1h "http://127.0.0.1:9001/api/v0/component/prometheus.test.metrics.large/discovery"

58 changes: 58 additions & 0 deletions cmd/benchmark/logs.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package main

import (
"fmt"
"os"
"os/exec"
"syscall"
"time"
)

// startLogsRun executes the logs benchmark for the given duration.
//
// It resets ./data/, starts the log-generating agent and the agent under
// test, sleeps for run, and finally kills both agents and removes ./data/.
// The stub HTTP endpoint is set to accept writes for the whole run.
func startLogsRun(run time.Duration) {
	allow = true

	// Start from a clean data directory: remove leftovers first, then
	// recreate it. Errors are ignored on purpose; the agents create their own
	// storage paths and a real problem surfaces when they start.
	_ = os.RemoveAll("./data/")
	_ = os.MkdirAll("./data/", 0777)
	_ = os.Setenv("NAME", "logs")

	// stop kills an agent together with its process group and reaps it.
	// The PID is read while the handle is still valid, and Release is never
	// called before Wait: releasing first invalidates the handle, which makes
	// Wait fail and turns the group signal into a signal for the wrong PID.
	stop := func(cmd *exec.Cmd) {
		pid := cmd.Process.Pid
		// The agents run with Setpgid, so -pid addresses their whole group.
		_ = syscall.Kill(-pid, syscall.SIGKILL)
		// Best-effort fallback in case the group signal could not be sent.
		_ = cmd.Process.Kill()
		// Wait reaps the child and releases its resources.
		_ = cmd.Wait()
	}

	gen := startLogsGenAgent()
	old := startLogsAgent()
	fmt.Println("starting logs agent")
	defer func() {
		stop(old)
		stop(gen)
		_ = os.RemoveAll("./data/")
	}()

	time.Sleep(run)
}

// startLogsAgent launches the agent under test with logs.river, storing its
// data in ./data/logs and serving HTTP on 127.0.0.1:12346. The child gets its
// own process group and shares this process's stdout and stderr. It panics if
// the process cannot be started.
func startLogsAgent() *exec.Cmd {
	args := []string{
		"run",
		"./logs.river",
		"--storage.path=./data/logs",
		"--server.http.listen-addr=127.0.0.1:12346",
	}
	agent := exec.Command("./grafana-agent-flow", args...)
	agent.Stdout, agent.Stderr = os.Stdout, os.Stderr
	agent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

	if startErr := agent.Start(); startErr != nil {
		panic(startErr.Error())
	}
	return agent
}

// startLogsGenAgent launches the log-generating agent with logsgen.river,
// storing its data in ./data/logs-gen and serving HTTP on 127.0.0.1:12349.
// The child gets its own process group and shares this process's stdout and
// stderr. It panics if the process cannot be started.
func startLogsGenAgent() *exec.Cmd {
	args := []string{
		"run",
		"./logsgen.river",
		"--storage.path=./data/logs-gen",
		"--server.http.listen-addr=127.0.0.1:12349",
	}
	generator := exec.Command("./grafana-agent-flow", args...)
	generator.Stdout, generator.Stderr = os.Stdout, os.Stderr
	generator.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

	if startErr := generator.Start(); startErr != nil {
		panic(startErr.Error())
	}
	return generator
}
62 changes: 62 additions & 0 deletions cmd/benchmark/logs.river
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Sets the log level of the agent that loads this file to "debug".
logging {
level = "debug"
}

// Scrapes localhost:12346 every 60s and forwards the samples to the "mutator"
// relabel component. logs.go starts the agent that loads this file with
// --server.http.listen-addr=127.0.0.1:12346, so this is presumably the agent
// scraping its own metrics.
prometheus.scrape "scraper" {
targets = concat([{"__address__" = "localhost:12346"}])
forward_to = [prometheus.relabel.mutator.receiver]
scrape_interval = "60s"
}

// Tags the scraped series with the run they belong to, drops everything except
// the metrics listed in the keep rule, and forwards the rest to the
// "agent_stats" remote write.
prometheus.relabel "mutator" {
// The regex matches any non-empty metric name, so (with the default rule
// action, presumably "replace") every series gets runtype="normal".
rule {
source_labels = ["__name__"]
regex = "(.+)"
replacement = "normal"
target_label = "runtype"
}

// Same match as above; sets test_name to the NAME environment variable, which
// the benchmark runner exports before starting the agents.
rule {
source_labels = ["__name__"]
regex = "(.+)"
replacement = env("NAME")
target_label = "test_name"
}

// Keep only the series whose metric name is one of the listed alternatives.
rule {
source_labels = ["__name__"]
action = "keep"
regex = "(agent_wal_storage_active_series|agent_resources_process_cpu_seconds_total|go_memstats_alloc_bytes|go_gc_duration_seconds_sum|go_gc_duration_seconds_count|loki_source_file_files_active_total|loki_write_encoded_bytes_total|loki_write_sent_bytes_total|loki_source_file_file_bytes_total)"
}

forward_to = [prometheus.remote_write.agent_stats.receiver]
}

// Pushes the retained agent metrics to the configured Prometheus push URL,
// authenticating with basic auth taken from the PROM_USERNAME and
// PROM_PASSWORD environment variables (the benchmark runner refuses to start
// when either one is empty).
prometheus.remote_write "agent_stats" {
endpoint {
url = "https://prometheus-us-central1.grafana.net/api/prom/push"

basic_auth {
username = env("PROM_USERNAME")
password = env("PROM_PASSWORD")
}
}
}


// Matches every *.log file under ./data/logs-gen/loki.test.logs.logs/.
// ./data/logs-gen is the storage path of the generator agent started by
// logs.go; presumably its loki.test.logs "logs" component writes its files
// into this subdirectory (TODO confirm).
local.file_match "logs" {
path_targets = [
{__path__ = "./data/logs-gen/loki.test.logs.logs/*.log"},
]
}

// Reads the files matched by local.file_match.logs and forwards their entries
// to the "local" loki.write component.
loki.source.file "tmpfiles" {
targets = local.file_match.logs.targets
forward_to = [loki.write.local.receiver]
}

// Sends the collected log entries to http://localhost:8888/post, which is the
// stub endpoint served by the benchmark runner (see httpServer in main.go).
loki.write "local" {
endpoint {
url = "http://localhost:8888/post"
}
}
5 changes: 5 additions & 0 deletions cmd/benchmark/logs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/sh
# Builds the benchmark runner and executes the logs benchmark.

# Stop here if the build fails so a stale ./main binary is never benchmarked.
go build -o main || exit 1

# Arguments passed to ./main for a logs run: the benchmark type ("logs")
# followed by the duration of the run. See logs.river and logsgen.river for
# the agent configuration used by this benchmark.
./main logs 1h
6 changes: 6 additions & 0 deletions cmd/benchmark/logsgen.river
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Log generator used as the input of the logs benchmark; logs.river tails
// *.log files under this component's directory in the generator agent's
// storage path.
// NOTE(review): the meaning of each setting is defined by the loki.test.logs
// component, which is not visible here. Presumably number_of_files is how many
// files are kept, file_churn_percent the share replaced on each file_refresh
// interval, and writes_per_cadence the number of writes per write interval;
// confirm against the component's arguments.
loki.test.logs "logs" {
number_of_files = 100
file_churn_percent = .25
file_refresh = "1m"
writes_per_cadence = 1000
}
145 changes: 145 additions & 0 deletions cmd/benchmark/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
package main

import (
"fmt"
"net/http"
"os"
"os/exec"
"strconv"
"syscall"
"time"

"github.com/gorilla/mux"
)

// main handles creating the benchmark.
func main() {
username := os.Getenv("PROM_USERNAME")
if username == "" {
panic("PROM_USERNAME env must be set")
}
password := os.Getenv("PROM_PASSWORD")
if password == "" {
panic("PROM_PASSWORD env must be set")
}

// Start the HTTP server, that can swallow requests.
go httpServer()
// Build the agent
buildAgent()

benchType := os.Args[1]
if benchType == "metrics" {
name := os.Args[2]
allowWal := os.Args[3]
duration := os.Args[4]
discovery := os.Args[5]
allowWalBool, _ := strconv.ParseBool(allowWal)
parsedDuration, _ := time.ParseDuration(duration)
fmt.Println(name, allowWalBool, parsedDuration, discovery)

startMetricsRun(name, allowWalBool, parsedDuration, discovery)
} else if benchType == "logs" {
duration := os.Args[2]
parsedDuration, _ := time.ParseDuration(duration)
startLogsRun(parsedDuration)
} else {
panic("unknown benchmark type")
}
}

// startMetricsRun executes one metrics benchmark.
//
// name, allowWAL and discovery are exported to the agents as the NAME,
// ALLOW_WAL and DISCOVERY environment variables. allowWAL also decides whether
// the stub endpoint accepts writes for this run. Both agents are started, the
// function sleeps for run, and then the agents are killed and their data
// directories removed.
func startMetricsRun(name string, allowWAL bool, run time.Duration, discovery string) {
	// Best-effort cleanup of leftovers from a previous run.
	_ = os.RemoveAll("./data/normal-data")
	_ = os.RemoveAll("./data/test-data")

	allow = allowWAL
	_ = os.Setenv("NAME", name)
	_ = os.Setenv("ALLOW_WAL", strconv.FormatBool(allowWAL))
	_ = os.Setenv("DISCOVERY", discovery)

	// stop kills an agent together with its process group and reaps it.
	// The PID is read while the handle is still valid, and Release is never
	// called before Wait: releasing first invalidates the handle, which makes
	// Wait fail and turns the group signal into a signal for the wrong PID.
	stop := func(cmd *exec.Cmd) {
		pid := cmd.Process.Pid
		// The agents run with Setpgid, so -pid addresses their whole group.
		_ = syscall.Kill(-pid, syscall.SIGKILL)
		// Best-effort fallback in case the group signal could not be sent.
		_ = cmd.Process.Kill()
		// Wait reaps the child and releases its resources.
		_ = cmd.Wait()
	}

	metric := startMetricsAgent()
	fmt.Println("starting metric agent")
	defer func() {
		stop(metric)
		_ = os.RemoveAll("./data/test-data")
	}()

	old := startNormalAgent()
	fmt.Println("starting normal agent")
	defer func() {
		stop(old)
		_ = os.RemoveAll("./data/normal-data")
	}()

	time.Sleep(run)
}

// buildAgent compiles ../grafana-agent-flow with "go build", streaming the
// compiler output to this process's stdout and stderr. It panics if the build
// command fails.
func buildAgent() {
	build := exec.Command("go", "build", "../grafana-agent-flow")
	build.Stdout, build.Stderr = os.Stdout, os.Stderr
	build.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

	if buildErr := build.Run(); buildErr != nil {
		panic(buildErr.Error())
	}
}

// startNormalAgent launches the agent that runs normal.river, storing its data
// in ./data/normal-data and serving HTTP on 127.0.0.1:12346. The child gets
// its own process group; its stdout and stderr are left unset, so its output
// is not shown. It panics if the process cannot be started.
func startNormalAgent() *exec.Cmd {
	args := []string{
		"run",
		"./normal.river",
		"--storage.path=./data/normal-data",
		"--server.http.listen-addr=127.0.0.1:12346",
	}
	agent := exec.Command("./grafana-agent-flow", args...)
	agent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

	if startErr := agent.Start(); startErr != nil {
		panic(startErr.Error())
	}
	return agent
}

// startMetricsAgent launches the agent that runs test.river, storing its data
// in ./data/test-data and serving HTTP on 127.0.0.1:9001. The child gets its
// own process group. It panics if the process cannot be started.
func startMetricsAgent() *exec.Cmd {
	args := []string{
		"run",
		"./test.river",
		"--storage.path=./data/test-data",
		"--server.http.listen-addr=127.0.0.1:9001",
	}
	agent := exec.Command("./grafana-agent-flow", args...)
	agent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

	if startErr := agent.Start(); startErr != nil {
		panic(startErr.Error())
	}
	return agent
}

// allow controls how the stub /post endpoint answers: when true handlePost
// returns without writing an error status, when false it responds with
// HTTP 500. It is set by startMetricsRun and startLogsRun and can be flipped
// at runtime through the /allow and /block endpoints.
// NOTE(review): it is read and written from HTTP handler goroutines and the
// main goroutine without any synchronization.
var allow = false

// httpServer serves the stub endpoint the agents under test write to.
// It listens on :8888 and blocks until the server stops.
//
// Routes:
//   - /post  answers according to the allow flag (see handlePost)
//   - /allow makes /post accept subsequent requests
//   - /block makes /post reject subsequent requests with HTTP 500
//
// NOTE(review): the handlers update allow without synchronization.
func httpServer() {
	r := mux.NewRouter()
	r.HandleFunc("/post", handlePost)
	r.HandleFunc("/allow", func(http.ResponseWriter, *http.Request) {
		println("allowing")
		allow = true
	})
	r.HandleFunc("/block", func(http.ResponseWriter, *http.Request) {
		println("blocking")
		allow = false
	})

	println("Starting server")
	// The router is passed directly instead of being registered on the
	// process-wide default mux; it handles every path either way.
	if err := http.ListenAndServe(":8888", r); err != nil {
		// println on an error interface prints raw pointer words, so print
		// the message explicitly.
		println(err.Error())
	}
}

// handlePost answers a write from an agent under test. While allow is set the
// handler returns without writing anything; otherwise it logs the rejection
// and responds with HTTP 500.
func handlePost(w http.ResponseWriter, _ *http.Request) {
	if !allow {
		println("returning 500")
		w.WriteHeader(http.StatusInternalServerError)
	}
}
Loading

0 comments on commit 266bc3f

Please sign in to comment.