mattermost · agarciamontoro · Nov 11, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024
diff --git a/config/comparison.sample.toml b/config/comparison.sample.toml
@@ -0,0 +1,32 @@
+[BaseBuild]
+Label = 'master'
+URL = 'file://master.tar.gz'
+
+[NewBuild]
+Label = 'release'
+URL = 'file://release.tar.gz'
+
+[[LoadTests]]
+Type = 'unbounded'
+DBEngine = 'mysql'
+
+[[LoadTests]]
+Type = 'bounded'
+DBEngine = 'mysql'
+NumUsers = 1000
+Duration = '1h'
+
+[[LoadTests]]
+Type = 'unbounded'
+DBEngine = 'postgresql'
+
+[[LoadTests]]
+Type = 'bounded'
+DBEngine = 'postgresql'
+NumUsers = 1000
+Duration = '1h'
+
+[Output]
+UploadDashboard = true
+GenerateGraphs = false
+GenerateReport = true
diff --git a/config/coordinator.sample.toml b/config/coordinator.sample.toml
@@ -0,0 +1,96 @@
+NumUsersInc = 8
+NumUsersDec = 8
+RestTimeSec = 2
+
+[ClusterConfig]
+MaxActiveUsers = 2000
+
+    [[ClusterConfig.Agents]]
+    Id = 'lt0'
+    ApiURL = 'http://localhost:4000'
+
+[MonitorConfig]
+PrometheusURL = 'http://localhost:9090'
+UpdateIntervalMs = 2000
+
+    [[MonitorConfig.Queries]]
+    Description = 'Percentage of HTTP 5xx server errors'
+    Legend = 'Percent'
+    Query = '(sum(rate(mattermost_api_time_count{status_code=~"5.."}[1m]))/sum(rate(mattermost_api_time_count[1m])))*100' # 5xx request rate over total request rate (1m window), scaled to percent
+    Threshold = 0.025 # NOTE(review): the query already multiplies by 100, so this reads as 0.025%, not 2.5% - confirm intended
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Average client request duration'
+    Legend = 'Avg duration (s)'
+    Query = 'sum(rate(loadtest_http_request_time_sum[1m]))/sum(rate(loadtest_http_request_time_count[1m]))'
+    Threshold = 0.1
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = '99th percentile of client request duration'
+    Legend = 'P99 duration (s)'
+    Query = 'histogram_quantile(0.99, sum(rate(loadtest_http_request_time_bucket[1m])) by (le))'
+    Threshold = 2
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Percentage of HTTP 5xx client errors'
+    Legend = 'Percent'
+    Query = '(sum(rate(loadtest_http_errors_total{status_code=~"5.."}[1m]))/sum(rate(loadtest_http_request_time_count[1m])))*100'
+    Threshold = 0.025
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Percentage of client timeouts'
+    Legend = 'Percent'
+    Query = '(sum(rate(loadtest_http_timeouts_total[1m]))/sum(rate(loadtest_http_request_time_count[1m]))) * 100'
+    Threshold = 0.025
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'CPU utilization - Average of app nodes'
+    Legend = 'Percent'
+    Query = '100 - 100 * (avg(irate(node_cpu_seconds_total{instance=~"app.*",mode="idle"}[5m])))'
+    Threshold = 85
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Memory utilization - Average of app nodes'
+    Legend = 'Percent'
+    Query = '100 - 100 * avg(node_memory_MemAvailable_bytes{instance=~"app.*"} / node_memory_MemTotal_bytes{instance=~"app.*"})'
+    Threshold = 85
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Percentage of TCP retransmissions in the app nodes'
+    Legend = 'Percent'
+    Query = '(avg(rate(node_netstat_Tcp_RetransSegs{instance=~"app.*"}[1m])) / avg(rate(node_netstat_Tcp_OutSegs{instance=~"app.*"}[1m]))) * 100'
+    Threshold = 0.5
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Percentage of TCP retransmissions in the proxy node'
+    Legend = 'Percent'
+    Query = '(avg(rate(node_netstat_Tcp_RetransSegs{instance=~"proxy:9100"}[1m])) / avg(rate(node_netstat_Tcp_OutSegs{instance=~"proxy:9100"}[1m]))) * 100'
+    Threshold = 0.5
+    MinIntervalSec = 60
+    Alert = true
+
+[LogSettings]
+EnableConsole = true
+ConsoleLevel = 'INFO'
+ConsoleJson = false
+EnableFile = true
+FileLevel = 'INFO'
+FileJson = true
+FileLocation = 'ltcoordinator.log'
+EnableColor = false
diff --git a/docs/readme.md b/docs/readme.md
@@ -47,3 +47,7 @@ Once you have familiarized yourself with the tool, and after you have successful
 - [Running an automated load-test comparison](comparison.md): a workflow specifically designed for when you need to compare two different versions of Mattermost while maintaining the rest of the variables fixed. This is what the Server team at Mattermost uses for the monthly release performance comparisons.
 - [Generating data](generating-data.md): for larger load-tests, you'll need larger datasets. This guide describes how you can use the gencontroller to create an arbitrary number of teams, channels, posts, reactions... to use as the starting point for future tests.
 
+
+## Configuration samples
+
+We know that the configuration of the load-test tool can be overwhelming, especially for newcomers. We have some sets of config templates that we actively use and keep up to date in the [`examples/config` directory](../examples/config). Take a look at the files there to learn from real-world config files.
diff --git a/examples/config/README.md b/examples/config/README.md
@@ -0,0 +1,5 @@
+# Configuration samples
+
+This directory contains sets of configuration templates that we use in different scenarios. Some fields are hard-coded to the values we use in our day-to-day processes (e.g. the path to the SSH keys), and others are marked as `#TBD` because they may change from run to run (e.g. the URLs to download Mattermost from). In any case, these sets can serve as starter packs for other, different workflows. For now, we have:
+- [Release testing](./release): configuration used when testing a new release of the load-test tool.
+- [Performance comparison](./perfcomp): configuration used for regression testing of new Mattermost releases. The results of these runs can be found in the [`performance-reports` repository](https://github.com/mattermost/performance-reports/tree/main/performance-comparisons).
diff --git a/examples/config/perfcomp/comparison.toml b/examples/config/perfcomp/comparison.toml
@@ -0,0 +1,36 @@
+[BaseBuild]
+Label = 'release-X.Y.Z' #TBD
+URL = 'https://releases.mattermost.com/X.Y.Z/mattermost-enterprise-X.Y.Z-linux-amd64.tar.gz' #TBD
+
+[NewBuild]
+Label = 'release-A.B.C-rcN' #TBD
+URL = 'https://releases.mattermost.com/A.B.C-rcN/mattermost-enterprise-A.B.C-rcN-linux-amd64.tar.gz' #TBD
+
+[[LoadTests]]
+Type = 'unbounded'
+DBEngine = 'postgresql'
+DBDumpURL = 'https://lt-public-data.s3.amazonaws.com/12M_610_psql.sql.gz'
+
+[[LoadTests]]
+Type = 'bounded'
+DBEngine = 'postgresql'
+DBDumpURL = 'https://lt-public-data.s3.amazonaws.com/12M_610_psql.sql.gz'
+NumUsers = 7500
+Duration = '90m'
+
+[[LoadTests]]
+Type = 'unbounded'
+DBEngine = 'mysql'
+DBDumpURL = 'https://lt-public-data.s3.amazonaws.com/12M_610_mysql.sql.gz'
+
+[[LoadTests]]
+Type = 'bounded'
+DBEngine = 'mysql'
+DBDumpURL = 'https://lt-public-data.s3.amazonaws.com/12M_610_mysql.sql.gz'
+NumUsers = 5000
+Duration = '90m'
+
+[Output]
+UploadDashboard = true
+GenerateGraphs = true
+GenerateReport = true
diff --git a/examples/config/perfcomp/config.toml b/examples/config/perfcomp/config.toml
@@ -0,0 +1,59 @@
+[ConnectionConfiguration]
+ServerURL = 'http://localhost:8065'
+WebSocketURL = 'ws://localhost:8065' # same host and port as ServerURL, using the ws scheme
+AdminEmail = 'sysadmin@sample.mattermost.com'
+AdminPassword = 'Sys@dmin-sample1' # NOTE(review): sample credentials - presumably must match the target server's admin account; confirm
+
+[UserControllerConfiguration]
+Type = 'simulative'
+
+[[UserControllerConfiguration.RatesDistribution]]
+Rate = 1.0
+Percentage = 0.05
+
+[[UserControllerConfiguration.RatesDistribution]]
+Rate = 2.0
+Percentage = 0.1
+
+[[UserControllerConfiguration.RatesDistribution]]
+Rate = 3.0
+Percentage = 0.15
+
+[[UserControllerConfiguration.RatesDistribution]]
+Rate = 6.0
+Percentage = 0.4
+
+[[UserControllerConfiguration.RatesDistribution]]
+Rate = 30.0
+Percentage = 0.3
+
+[InstanceConfiguration]
+NumTeams = 2
+NumChannels = 0
+NumPosts = 0
+NumReactions = 0
+NumAdmins = 0
+PercentReplies = 0.5
+PercentRepliesInLongThreads = 0.05
+PercentPublicChannels = 1
+PercentPrivateChannels = 0
+PercentDirectChannels = 0
+PercentGroupChannels = 0
+PercentUrgentPosts = 0.001
+
+[UsersConfiguration]
+InitialActiveUsers = 0
+UsersFilePath = ''
+MaxActiveUsers = 2000
+AvgSessionsPerUser = 1
+PercentOfUsersAreAdmin = 0.0005
+
+[LogSettings]
+EnableConsole = true
+ConsoleLevel = 'DEBUG'
+ConsoleJson = false
+EnableFile = true
+FileLevel = 'DEBUG'
+FileJson = true
+FileLocation = 'ltagent.log'
+EnableColor = true
diff --git a/examples/config/perfcomp/coordinator.toml b/examples/config/perfcomp/coordinator.toml
@@ -0,0 +1,96 @@
+NumUsersInc = 8
+NumUsersDec = 8
+RestTimeSec = 2
+
+[ClusterConfig]
+MaxActiveUsers = 20000
+
+    [[ClusterConfig.Agents]]
+    Id = 'lt0'
+    ApiURL = 'http://localhost:4000'
+
+[MonitorConfig]
+PrometheusURL = 'http://localhost:9090'
+UpdateIntervalMs = 2000
+
+    [[MonitorConfig.Queries]]
+    Description = 'Percentage of HTTP 5xx server errors'
+    Legend = 'Percent'
+    Query = '(sum(rate(mattermost_api_time_count{status_code=~"5.."}[1m]))/sum(rate(mattermost_api_time_count[1m])))*100' # 5xx request rate over total request rate (1m window), scaled to percent
+    Threshold = 0.025 # NOTE(review): the query already multiplies by 100, so this reads as 0.025%, not 2.5% - confirm intended
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Average client request duration'
+    Legend = 'Avg duration (s)'
+    Query = 'sum(rate(loadtest_http_request_time_sum[1m]))/sum(rate(loadtest_http_request_time_count[1m]))'
+    Threshold = 0.1
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = '99th percentile of client request duration'
+    Legend = 'P99 duration (s)'
+    Query = 'histogram_quantile(0.99, sum(rate(loadtest_http_request_time_bucket[1m])) by (le))'
+    Threshold = 2
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Percentage of HTTP 5xx client errors'
+    Legend = 'Percent'
+    Query = '(sum(rate(loadtest_http_errors_total{status_code=~"5.."}[1m]))/sum(rate(loadtest_http_request_time_count[1m])))*100'
+    Threshold = 0.025
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Percentage of client timeouts'
+    Legend = 'Percent'
+    Query = '(sum(rate(loadtest_http_timeouts_total[1m]))/sum(rate(loadtest_http_request_time_count[1m]))) * 100'
+    Threshold = 0.025
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'CPU utilization - Average of app nodes'
+    Legend = 'Percent'
+    Query = '100 - 100 * (avg(irate(node_cpu_seconds_total{instance=~"app.*",mode="idle"}[5m])))'
+    Threshold = 85
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Memory utilization - Average of app nodes'
+    Legend = 'Percent'
+    Query = '100 - 100 * avg(node_memory_MemAvailable_bytes{instance=~"app.*"} / node_memory_MemTotal_bytes{instance=~"app.*"})'
+    Threshold = 85
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Percentage of TCP retransmissions in the app nodes'
+    Legend = 'Percent'
+    Query = '(avg(rate(node_netstat_Tcp_RetransSegs{instance=~"app.*"}[1m])) / avg(rate(node_netstat_Tcp_OutSegs{instance=~"app.*"}[1m]))) * 100'
+    Threshold = 0.5
+    MinIntervalSec = 60
+    Alert = true
+
+    [[MonitorConfig.Queries]]
+    Description = 'Percentage of TCP retransmissions in the proxy node'
+    Legend = 'Percent'
+    Query = '(avg(rate(node_netstat_Tcp_RetransSegs{instance=~"proxy:9100"}[1m])) / avg(rate(node_netstat_Tcp_OutSegs{instance=~"proxy:9100"}[1m]))) * 100'
+    Threshold = 0.5
+    MinIntervalSec = 60
+    Alert = true
+
+[LogSettings]
+EnableConsole = true
+ConsoleLevel = 'INFO'
+ConsoleJson = false
+EnableFile = true
+FileLevel = 'INFO'
+FileJson = true
+FileLocation = 'ltcoordinator.log'
+EnableColor = false