From 9fac68bee0530fc3ad31165977467ed08961567e Mon Sep 17 00:00:00 2001 From: Bilal Akhtar Date: Fri, 14 Jun 2024 15:23:00 -0400 Subject: [PATCH] cli: check cluster setting before crashing in log stall Previously, we'd unconditionally crash the process if the logger stalled, even if the cluster setting for not crashing the cockroach process on disk stalls was set. This change has the file logger respect that cluster setting, storage.max_sync_duration.fatal.enabled, as well. Fixes #124474. Epic: none Release note: None --- pkg/ccl/testccl/authccl/auth_test.go | 2 +- pkg/cli/log_flags.go | 5 ++++- pkg/cmd/roachtest/run.go | 2 +- pkg/sql/admin_audit_log_test.go | 2 +- pkg/sql/event_log_test.go | 2 +- pkg/sql/pgwire/auth_test.go | 2 +- pkg/util/log/clog_test.go | 4 ++-- pkg/util/log/file.go | 20 +++++++++++++------- pkg/util/log/file_log_gc_test.go | 2 +- pkg/util/log/flags.go | 5 +++-- pkg/util/log/flags_test.go | 2 +- pkg/util/log/fluent_client_test.go | 2 +- pkg/util/log/formats_test.go | 2 +- pkg/util/log/http_sink_test.go | 2 +- pkg/util/log/log_flush.go | 19 +++++++++++++++---- pkg/util/log/logtestutils/log_test_utils.go | 2 +- pkg/util/log/redact_test.go | 2 +- pkg/util/log/registry_test.go | 2 +- pkg/util/log/secondary_log_test.go | 2 +- pkg/util/log/test_log_scope.go | 4 ++-- pkg/util/log/testshout/shout_test.go | 2 +- pkg/workload/cli/run.go | 2 +- 22 files changed, 55 insertions(+), 34 deletions(-) diff --git a/pkg/ccl/testccl/authccl/auth_test.go b/pkg/ccl/testccl/authccl/auth_test.go index 2b9ad5dc2641..a2e682e392e1 100644 --- a/pkg/ccl/testccl/authccl/auth_test.go +++ b/pkg/ccl/testccl/authccl/auth_test.go @@ -173,7 +173,7 @@ func jwtRunTest(t *testing.T, insecure bool) { if err := cfg.Validate(&dir); err != nil { t.Fatal(err) } - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/cli/log_flags.go b/pkg/cli/log_flags.go index 5d7426c385f0..5f7fc92eb769 100644 --- a/pkg/cli/log_flags.go +++ b/pkg/cli/log_flags.go @@ -191,7 +191,10 @@ func setupLogging(ctx context.Context, cmd *cobra.Command, isServerCmd, applyCon logBytesWritten := serverCfg.DiskWriteStatsCollector.CreateStat(fs.CRDBLogWriteCategory) // Configuration ready and directories exist; apply it. - logShutdownFn, err := log.ApplyConfig(h.Config, log.FileSinkMetrics{LogBytesWritten: logBytesWritten}) + fatalOnLogStall := func() bool { + return fs.MaxSyncDurationFatalOnExceeded.Get(&serverCfg.Settings.SV) + } + logShutdownFn, err := log.ApplyConfig(h.Config, log.FileSinkMetrics{LogBytesWritten: logBytesWritten}, fatalOnLogStall) if err != nil { return err } diff --git a/pkg/cmd/roachtest/run.go b/pkg/cmd/roachtest/run.go index da96a8f23016..3860f591ea58 100644 --- a/pkg/cmd/roachtest/run.go +++ b/pkg/cmd/roachtest/run.go @@ -218,7 +218,7 @@ func setLogConfig(baseDir string) { if err := logConf.Validate(&baseDir); err != nil { panic(err) } - if _, err := log.ApplyConfig(logConf, log.FileSinkMetrics{}); err != nil { + if _, err := log.ApplyConfig(logConf, log.FileSinkMetrics{}, nil /* fatalOnLogStall */); err != nil { panic(err) } } diff --git a/pkg/sql/admin_audit_log_test.go b/pkg/sql/admin_audit_log_test.go index 3490694a0b4b..4a929b1a91ba 100644 --- a/pkg/sql/admin_audit_log_test.go +++ b/pkg/sql/admin_audit_log_test.go @@ -44,7 +44,7 @@ func installSensitiveAccessLogFileSink(sc *log.TestLogScope, t *testing.T) func( if err := cfg.Validate(&dir); err != nil { t.Fatal(err) } - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/sql/event_log_test.go b/pkg/sql/event_log_test.go index 3ccde5af3100..84a210c8f15a 100644 --- a/pkg/sql/event_log_test.go +++ b/pkg/sql/event_log_test.go @@ -686,7 +686,7 @@ func TestPerfLogging(t *testing.T) { if err := cfg.Validate(&dir); err != nil { t.Fatal(err) } - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/sql/pgwire/auth_test.go b/pkg/sql/pgwire/auth_test.go index ffbddadcf21f..d7c92faf05a0 100644 --- a/pkg/sql/pgwire/auth_test.go +++ b/pkg/sql/pgwire/auth_test.go @@ -210,7 +210,7 @@ func hbaRunTest(t *testing.T, insecure bool) { if err := cfg.Validate(&dir); err != nil { t.Fatal(err) } - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/clog_test.go b/pkg/util/log/clog_test.go index df678c48960c..92e37637f3b2 100644 --- a/pkg/util/log/clog_test.go +++ b/pkg/util/log/clog_test.go @@ -410,7 +410,7 @@ func TestGetLogReader(t *testing.T) { // Validate and apply the config. require.NoError(t, config.Validate(&sc.logDir)) TestingResetActive() - cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}) + cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanupFn() @@ -622,7 +622,7 @@ func TestFd2Capture(t *testing.T) { t.Fatal(err) } TestingResetActive() - cleanupFn, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanupFn, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/file.go b/pkg/util/log/file.go index 76c4107e818f..6d3540ea0076 100644 --- a/pkg/util/log/file.go +++ b/pkg/util/log/file.go @@ -90,6 +90,8 @@ type fileSink struct { // include at the start of a log file. getStartLines func(time.Time) []*buffer + fatalOnLogStall func() bool + filePermissions fs.FileMode logBytesWritten *atomic.Uint64 @@ -259,14 +261,18 @@ func (l *fileSink) flushAndMaybeSyncLocked(doSync bool) { // recursive back-and-forth between the copy of FATAL events to // OPS and disk slowness detection here. (See the implementation // of logfDepth for details.) + sev := severity.ERROR + // We default to assuming a fatal on log stall. + if l.fatalOnLogStall == nil || l.fatalOnLogStall() { + sev = severity.FATAL + // The write stall may prevent the process from exiting. If the process + // won't exit, we can at least terminate all our RPC connections first. + // + // See pkg/cli.runStart for where this function is hooked up. + MakeProcessUnavailable() + } - // The write stall may prevent the process from exiting. If the process - // won't exit, we can at least terminate all our RPC connections first. - // - // See pkg/cli.runStart for where this function is hooked up. - MakeProcessUnavailable() - - Ops.Shoutf(context.Background(), severity.FATAL, + Ops.Shoutf(context.Background(), sev, "disk stall detected: unable to sync log files within %s", maxSyncDuration, ) }) diff --git a/pkg/util/log/file_log_gc_test.go b/pkg/util/log/file_log_gc_test.go index 7aabb5af5e70..d0f609afded3 100644 --- a/pkg/util/log/file_log_gc_test.go +++ b/pkg/util/log/file_log_gc_test.go @@ -66,7 +66,7 @@ func TestSecondaryGC(t *testing.T) { // Validate and apply the config. require.NoError(t, config.Validate(&s.logDir)) TestingResetActive() - cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}) + cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanupFn() diff --git a/pkg/util/log/flags.go b/pkg/util/log/flags.go index 4ee2c3b59078..37ef26e21072 100644 --- a/pkg/util/log/flags.go +++ b/pkg/util/log/flags.go @@ -72,7 +72,7 @@ func init() { // using TestLogScope. cfg := getTestConfig(nil /* output to files disabled */, true /* mostly inline */) - if _, err := ApplyConfig(cfg, FileSinkMetrics{}); err != nil { + if _, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */); err != nil { panic(err) } @@ -97,7 +97,7 @@ func IsActive() (active bool, firstUse string) { // // The returned logShutdownFn can be used to gracefully shut down logging facilities. func ApplyConfig( - config logconfig.Config, metrics FileSinkMetrics, + config logconfig.Config, metrics FileSinkMetrics, fatalOnLogStall func() bool, ) (logShutdownFn func(), err error) { // Sanity check. if active, firstUse := IsActive(); active { @@ -319,6 +319,7 @@ func ApplyConfig( if err != nil { return nil, err } + fileSink.fatalOnLogStall = fatalOnLogStall attachBufferWrapper(fileSinkInfo, fc.CommonSinkConfig.Buffering, closer) attachSinkInfo(fileSinkInfo, &fc.Channels) diff --git a/pkg/util/log/flags_test.go b/pkg/util/log/flags_test.go index 402a9f1d7b78..0365d0ec861a 100644 --- a/pkg/util/log/flags_test.go +++ b/pkg/util/log/flags_test.go @@ -68,7 +68,7 @@ func TestAppliedConfig(t *testing.T) { } TestingResetActive() - cleanup, err := ApplyConfig(h.Config, FileSinkMetrics{}) + cleanup, err := ApplyConfig(h.Config, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/fluent_client_test.go b/pkg/util/log/fluent_client_test.go index 645912794a6e..0de1bfb13c0a 100644 --- a/pkg/util/log/fluent_client_test.go +++ b/pkg/util/log/fluent_client_test.go @@ -69,7 +69,7 @@ func TestFluentClient(t *testing.T) { // Apply the configuration. TestingResetActive() - cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanup() diff --git a/pkg/util/log/formats_test.go b/pkg/util/log/formats_test.go index 6d86cb5b8f66..7913200b3c1f 100644 --- a/pkg/util/log/formats_test.go +++ b/pkg/util/log/formats_test.go @@ -66,7 +66,7 @@ func TestFormatRedaction(t *testing.T) { // Validate and apply the config. require.NoError(t, config.Validate(&sc.logDir)) TestingResetActive() - cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}) + cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanupFn() diff --git a/pkg/util/log/http_sink_test.go b/pkg/util/log/http_sink_test.go index 244ca4258d53..e642b0deca44 100644 --- a/pkg/util/log/http_sink_test.go +++ b/pkg/util/log/http_sink_test.go @@ -148,7 +148,7 @@ func testBase( // Apply the configuration. TestingResetActive() - cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanup() diff --git a/pkg/util/log/log_flush.go b/pkg/util/log/log_flush.go index 083be1752292..260036b8d3a8 100644 --- a/pkg/util/log/log_flush.go +++ b/pkg/util/log/log_flush.go @@ -90,10 +90,21 @@ const flushInterval = time.Second // syncInterval is the multiple of flushInterval where the log is also synced to disk. const syncInterval = 30 -// maxSyncDuration is set to a conservative value since this is a new mechanism. -// In practice, even a fraction of that would indicate a problem. This metric's -// default should ideally match its sister metric in the storage engine, set by -// COCKROACH_ENGINE_MAX_SYNC_DURATION. +// maxSyncDuration is the maximum duration the file sink is allowed to take to +// write a log entry before we fatal the process for a disk stall. Note that +// this fataling behaviour can be disabled by the cluster setting +// storage.max_sync_duration.fatal.enabled for most uses of the logger. +// +// This setting may sound similar to `ExitTimeoutForFatalLog`, however that +// parameter configures how long we wait to write a *fatal* log entry to _any_ +// log sink before we exit the process. This one configures how long we wait +// to write any log entry to a _file_ sink before we write a fatal log entry +// instead (that could then take up to `ExitTimeoutForFatalLog` before crashing +// the process). +// +// This metric's default should ideally match its sister metrics: one in the +// storage engine, set by COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT and the +// storage.max_sync_duration cluster setting, and another in `ExitTimeoutForFatalLog`. var maxSyncDuration = envutil.EnvOrDefaultDuration("COCKROACH_LOG_MAX_SYNC_DURATION", 20*time.Second) // syncWarnDuration is the threshold after which a slow disk warning is written diff --git a/pkg/util/log/logtestutils/log_test_utils.go b/pkg/util/log/logtestutils/log_test_utils.go index 7a4b75de74e2..28d3a0894952 100644 --- a/pkg/util/log/logtestutils/log_test_utils.go +++ b/pkg/util/log/logtestutils/log_test_utils.go @@ -40,7 +40,7 @@ func InstallLogFileSink(sc *log.TestLogScope, t *testing.T, channel logpb.Channe if err := cfg.Validate(&dir); err != nil { t.Fatal(err) } - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/redact_test.go b/pkg/util/log/redact_test.go index 1cafef95afbe..a3fb95f7737a 100644 --- a/pkg/util/log/redact_test.go +++ b/pkg/util/log/redact_test.go @@ -136,7 +136,7 @@ func TestSafeManaged(t *testing.T) { if err := cfg.Validate(&s.logDir); err != nil { t.Fatal(err) } - cleanupFn, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanupFn, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/registry_test.go b/pkg/util/log/registry_test.go index c3642535ba5f..5709fb33c300 100644 --- a/pkg/util/log/registry_test.go +++ b/pkg/util/log/registry_test.go @@ -66,7 +66,7 @@ func TestIterHTTPSinks(t *testing.T) { // Apply the configuration TestingResetActive() - cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanup() diff --git a/pkg/util/log/secondary_log_test.go b/pkg/util/log/secondary_log_test.go index 0be1b5aa0dbd..0a8e4a496cca 100644 --- a/pkg/util/log/secondary_log_test.go +++ b/pkg/util/log/secondary_log_test.go @@ -48,7 +48,7 @@ func installSessionsFileSink(sc *TestLogScope, t *testing.T) func() { // Apply the configuration. TestingResetActive() - cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/test_log_scope.go b/pkg/util/log/test_log_scope.go index 825a9119c80e..0dace6266e67 100644 --- a/pkg/util/log/test_log_scope.go +++ b/pkg/util/log/test_log_scope.go @@ -162,7 +162,7 @@ func newLogScope(t tShim, mostlyInline bool) (sc *TestLogScope) { // Switch to the new configuration. TestingResetActive() - sc.cleanupFn, err = ApplyConfig(cfg, FileSinkMetrics{}) + sc.cleanupFn, err = ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { return err } @@ -355,7 +355,7 @@ func (l *TestLogScope) SetupSingleFileLogging() (cleanup func()) { // Apply the configuration. TestingResetActive() - cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { panic(errors.NewAssertionErrorWithWrappedErrf(err, "unexpected error in predefined log config")) } diff --git a/pkg/util/log/testshout/shout_test.go b/pkg/util/log/testshout/shout_test.go index 6f29ef29c98a..14e792120fcf 100644 --- a/pkg/util/log/testshout/shout_test.go +++ b/pkg/util/log/testshout/shout_test.go @@ -33,7 +33,7 @@ func Example_shout_before_log() { panic(err) } cfg.Sinks.Stderr.Filter = severity.WARNING - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { panic(err) } diff --git a/pkg/workload/cli/run.go b/pkg/workload/cli/run.go index ee0c27c6eb49..89637f6c164c 100644 --- a/pkg/workload/cli/run.go +++ b/pkg/workload/cli/run.go @@ -181,7 +181,7 @@ func CmdHelper( if err := cfg.Validate(nil /* no default log directory */); err != nil { return err } - if _, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}); err != nil { + if _, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */); err != nil { return err } }