diff --git a/pkg/ccl/testccl/authccl/auth_test.go b/pkg/ccl/testccl/authccl/auth_test.go index 2b9ad5dc2641..a2e682e392e1 100644 --- a/pkg/ccl/testccl/authccl/auth_test.go +++ b/pkg/ccl/testccl/authccl/auth_test.go @@ -173,7 +173,7 @@ func jwtRunTest(t *testing.T, insecure bool) { if err := cfg.Validate(&dir); err != nil { t.Fatal(err) } - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/cli/log_flags.go b/pkg/cli/log_flags.go index 5d7426c385f0..5f7fc92eb769 100644 --- a/pkg/cli/log_flags.go +++ b/pkg/cli/log_flags.go @@ -191,7 +191,10 @@ func setupLogging(ctx context.Context, cmd *cobra.Command, isServerCmd, applyCon logBytesWritten := serverCfg.DiskWriteStatsCollector.CreateStat(fs.CRDBLogWriteCategory) // Configuration ready and directories exist; apply it. - logShutdownFn, err := log.ApplyConfig(h.Config, log.FileSinkMetrics{LogBytesWritten: logBytesWritten}) + fatalOnLogStall := func() bool { + return fs.MaxSyncDurationFatalOnExceeded.Get(&serverCfg.Settings.SV) + } + logShutdownFn, err := log.ApplyConfig(h.Config, log.FileSinkMetrics{LogBytesWritten: logBytesWritten}, fatalOnLogStall) if err != nil { return err } diff --git a/pkg/cmd/roachtest/run.go b/pkg/cmd/roachtest/run.go index da96a8f23016..3860f591ea58 100644 --- a/pkg/cmd/roachtest/run.go +++ b/pkg/cmd/roachtest/run.go @@ -218,7 +218,7 @@ func setLogConfig(baseDir string) { if err := logConf.Validate(&baseDir); err != nil { panic(err) } - if _, err := log.ApplyConfig(logConf, log.FileSinkMetrics{}); err != nil { + if _, err := log.ApplyConfig(logConf, log.FileSinkMetrics{}, nil /* fatalOnLogStall */); err != nil { panic(err) } } diff --git a/pkg/sql/admin_audit_log_test.go b/pkg/sql/admin_audit_log_test.go index 3490694a0b4b..4a929b1a91ba 100644 --- a/pkg/sql/admin_audit_log_test.go +++ b/pkg/sql/admin_audit_log_test.go @@ -44,7 +44,7 @@ func installSensitiveAccessLogFileSink(sc *log.TestLogScope, t *testing.T) func( if err := cfg.Validate(&dir); err != nil { t.Fatal(err) } - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/sql/event_log_test.go b/pkg/sql/event_log_test.go index 3ccde5af3100..84a210c8f15a 100644 --- a/pkg/sql/event_log_test.go +++ b/pkg/sql/event_log_test.go @@ -686,7 +686,7 @@ func TestPerfLogging(t *testing.T) { if err := cfg.Validate(&dir); err != nil { t.Fatal(err) } - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/sql/pgwire/auth_test.go b/pkg/sql/pgwire/auth_test.go index ffbddadcf21f..d7c92faf05a0 100644 --- a/pkg/sql/pgwire/auth_test.go +++ b/pkg/sql/pgwire/auth_test.go @@ -210,7 +210,7 @@ func hbaRunTest(t *testing.T, insecure bool) { if err := cfg.Validate(&dir); err != nil { t.Fatal(err) } - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/clog_test.go b/pkg/util/log/clog_test.go index df678c48960c..92e37637f3b2 100644 --- a/pkg/util/log/clog_test.go +++ b/pkg/util/log/clog_test.go @@ -410,7 +410,7 @@ func TestGetLogReader(t *testing.T) { // Validate and apply the config. require.NoError(t, config.Validate(&sc.logDir)) TestingResetActive() - cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}) + cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanupFn() @@ -622,7 +622,7 @@ func TestFd2Capture(t *testing.T) { t.Fatal(err) } TestingResetActive() - cleanupFn, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanupFn, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/file.go b/pkg/util/log/file.go index 76c4107e818f..6d3540ea0076 100644 --- a/pkg/util/log/file.go +++ b/pkg/util/log/file.go @@ -90,6 +90,8 @@ type fileSink struct { // include at the start of a log file. getStartLines func(time.Time) []*buffer + fatalOnLogStall func() bool + filePermissions fs.FileMode logBytesWritten *atomic.Uint64 @@ -259,14 +261,18 @@ func (l *fileSink) flushAndMaybeSyncLocked(doSync bool) { // recursive back-and-forth between the copy of FATAL events to // OPS and disk slowness detection here. (See the implementation // of logfDepth for details.) + sev := severity.ERROR + // We default to assuming a fatal on log stall. + if l.fatalOnLogStall == nil || l.fatalOnLogStall() { + sev = severity.FATAL + // The write stall may prevent the process from exiting. If the process + // won't exit, we can at least terminate all our RPC connections first. + // + // See pkg/cli.runStart for where this function is hooked up. + MakeProcessUnavailable() + } - // The write stall may prevent the process from exiting. If the process - // won't exit, we can at least terminate all our RPC connections first. - // - // See pkg/cli.runStart for where this function is hooked up. - MakeProcessUnavailable() - - Ops.Shoutf(context.Background(), severity.FATAL, + Ops.Shoutf(context.Background(), sev, "disk stall detected: unable to sync log files within %s", maxSyncDuration, ) }) diff --git a/pkg/util/log/file_log_gc_test.go b/pkg/util/log/file_log_gc_test.go index 7aabb5af5e70..d0f609afded3 100644 --- a/pkg/util/log/file_log_gc_test.go +++ b/pkg/util/log/file_log_gc_test.go @@ -66,7 +66,7 @@ func TestSecondaryGC(t *testing.T) { // Validate and apply the config. require.NoError(t, config.Validate(&s.logDir)) TestingResetActive() - cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}) + cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanupFn() diff --git a/pkg/util/log/flags.go b/pkg/util/log/flags.go index 4ee2c3b59078..37ef26e21072 100644 --- a/pkg/util/log/flags.go +++ b/pkg/util/log/flags.go @@ -72,7 +72,7 @@ func init() { // using TestLogScope. cfg := getTestConfig(nil /* output to files disabled */, true /* mostly inline */) - if _, err := ApplyConfig(cfg, FileSinkMetrics{}); err != nil { + if _, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */); err != nil { panic(err) } @@ -97,7 +97,7 @@ func IsActive() (active bool, firstUse string) { // // The returned logShutdownFn can be used to gracefully shut down logging facilities. func ApplyConfig( - config logconfig.Config, metrics FileSinkMetrics, + config logconfig.Config, metrics FileSinkMetrics, fatalOnLogStall func() bool, ) (logShutdownFn func(), err error) { // Sanity check. if active, firstUse := IsActive(); active { @@ -319,6 +319,7 @@ func ApplyConfig( if err != nil { return nil, err } + fileSink.fatalOnLogStall = fatalOnLogStall attachBufferWrapper(fileSinkInfo, fc.CommonSinkConfig.Buffering, closer) attachSinkInfo(fileSinkInfo, &fc.Channels) diff --git a/pkg/util/log/flags_test.go b/pkg/util/log/flags_test.go index 402a9f1d7b78..0365d0ec861a 100644 --- a/pkg/util/log/flags_test.go +++ b/pkg/util/log/flags_test.go @@ -68,7 +68,7 @@ func TestAppliedConfig(t *testing.T) { } TestingResetActive() - cleanup, err := ApplyConfig(h.Config, FileSinkMetrics{}) + cleanup, err := ApplyConfig(h.Config, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/fluent_client_test.go b/pkg/util/log/fluent_client_test.go index 645912794a6e..0de1bfb13c0a 100644 --- a/pkg/util/log/fluent_client_test.go +++ b/pkg/util/log/fluent_client_test.go @@ -69,7 +69,7 @@ func TestFluentClient(t *testing.T) { // Apply the configuration. TestingResetActive() - cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanup() diff --git a/pkg/util/log/formats_test.go b/pkg/util/log/formats_test.go index 6d86cb5b8f66..7913200b3c1f 100644 --- a/pkg/util/log/formats_test.go +++ b/pkg/util/log/formats_test.go @@ -66,7 +66,7 @@ func TestFormatRedaction(t *testing.T) { // Validate and apply the config. require.NoError(t, config.Validate(&sc.logDir)) TestingResetActive() - cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}) + cleanupFn, err := ApplyConfig(config, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanupFn() diff --git a/pkg/util/log/http_sink_test.go b/pkg/util/log/http_sink_test.go index 244ca4258d53..e642b0deca44 100644 --- a/pkg/util/log/http_sink_test.go +++ b/pkg/util/log/http_sink_test.go @@ -148,7 +148,7 @@ func testBase( // Apply the configuration. TestingResetActive() - cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanup() diff --git a/pkg/util/log/log_flush.go b/pkg/util/log/log_flush.go index 083be1752292..260036b8d3a8 100644 --- a/pkg/util/log/log_flush.go +++ b/pkg/util/log/log_flush.go @@ -90,10 +90,21 @@ const flushInterval = time.Second // syncInterval is the multiple of flushInterval where the log is also synced to disk. const syncInterval = 30 -// maxSyncDuration is set to a conservative value since this is a new mechanism. -// In practice, even a fraction of that would indicate a problem. This metric's -// default should ideally match its sister metric in the storage engine, set by -// COCKROACH_ENGINE_MAX_SYNC_DURATION. +// maxSyncDuration is the maximum duration the file sink is allowed to take to +// write a log entry before we fatal the process for a disk stall. Note that +// this fataling behaviour can be disabled by the cluster setting +// storage.max_sync_duration.fatal.enabled for most uses of the logger. +// +// This setting may sound similar to `ExitTimeoutForFatalLog`, however that +// parameter configures how long we wait to write a *fatal* log entry to _any_ +// log sink before we exit the process. This one configures how long we wait +// to write any log entry to a _file_ sink before we write a fatal log entry +// instead (that could then take up to `ExitTimeoutForFatalLog` before crashing +// the process). +// +// This metric's default should ideally match its sister metrics: one in the +// storage engine, set by COCKROACH_ENGINE_MAX_SYNC_DURATION_DEFAULT and the +// storage.max_sync_duration cluster setting, and another in `ExitTimeoutForFatalLog`. var maxSyncDuration = envutil.EnvOrDefaultDuration("COCKROACH_LOG_MAX_SYNC_DURATION", 20*time.Second) // syncWarnDuration is the threshold after which a slow disk warning is written diff --git a/pkg/util/log/logtestutils/log_test_utils.go b/pkg/util/log/logtestutils/log_test_utils.go index 7a4b75de74e2..28d3a0894952 100644 --- a/pkg/util/log/logtestutils/log_test_utils.go +++ b/pkg/util/log/logtestutils/log_test_utils.go @@ -40,7 +40,7 @@ func InstallLogFileSink(sc *log.TestLogScope, t *testing.T, channel logpb.Channe if err := cfg.Validate(&dir); err != nil { t.Fatal(err) } - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/redact_test.go b/pkg/util/log/redact_test.go index 1cafef95afbe..a3fb95f7737a 100644 --- a/pkg/util/log/redact_test.go +++ b/pkg/util/log/redact_test.go @@ -136,7 +136,7 @@ func TestSafeManaged(t *testing.T) { if err := cfg.Validate(&s.logDir); err != nil { t.Fatal(err) } - cleanupFn, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanupFn, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/registry_test.go b/pkg/util/log/registry_test.go index c3642535ba5f..5709fb33c300 100644 --- a/pkg/util/log/registry_test.go +++ b/pkg/util/log/registry_test.go @@ -66,7 +66,7 @@ func TestIterHTTPSinks(t *testing.T) { // Apply the configuration TestingResetActive() - cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) require.NoError(t, err) defer cleanup() diff --git a/pkg/util/log/secondary_log_test.go b/pkg/util/log/secondary_log_test.go index 0be1b5aa0dbd..0a8e4a496cca 100644 --- a/pkg/util/log/secondary_log_test.go +++ b/pkg/util/log/secondary_log_test.go @@ -48,7 +48,7 @@ func installSessionsFileSink(sc *TestLogScope, t *testing.T) func() { // Apply the configuration. TestingResetActive() - cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { t.Fatal(err) } diff --git a/pkg/util/log/test_log_scope.go b/pkg/util/log/test_log_scope.go index 825a9119c80e..0dace6266e67 100644 --- a/pkg/util/log/test_log_scope.go +++ b/pkg/util/log/test_log_scope.go @@ -162,7 +162,7 @@ func newLogScope(t tShim, mostlyInline bool) (sc *TestLogScope) { // Switch to the new configuration. TestingResetActive() - sc.cleanupFn, err = ApplyConfig(cfg, FileSinkMetrics{}) + sc.cleanupFn, err = ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { return err } @@ -355,7 +355,7 @@ func (l *TestLogScope) SetupSingleFileLogging() (cleanup func()) { // Apply the configuration. TestingResetActive() - cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}) + cleanup, err := ApplyConfig(cfg, FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { panic(errors.NewAssertionErrorWithWrappedErrf(err, "unexpected error in predefined log config")) } diff --git a/pkg/util/log/testshout/shout_test.go b/pkg/util/log/testshout/shout_test.go index 6f29ef29c98a..14e792120fcf 100644 --- a/pkg/util/log/testshout/shout_test.go +++ b/pkg/util/log/testshout/shout_test.go @@ -33,7 +33,7 @@ func Example_shout_before_log() { panic(err) } cfg.Sinks.Stderr.Filter = severity.WARNING - cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}) + cleanup, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */) if err != nil { panic(err) } diff --git a/pkg/workload/cli/run.go b/pkg/workload/cli/run.go index ee0c27c6eb49..89637f6c164c 100644 --- a/pkg/workload/cli/run.go +++ b/pkg/workload/cli/run.go @@ -181,7 +181,7 @@ func CmdHelper( if err := cfg.Validate(nil /* no default log directory */); err != nil { return err } - if _, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}); err != nil { + if _, err := log.ApplyConfig(cfg, log.FileSinkMetrics{}, nil /* fatalOnLogStall */); err != nil { return err } }