Skip to content

Commit

Permalink
Retry sending signals to pid_file_name before logging errors
Browse files Browse the repository at this point in the history
There is a race where the process pointed to by pid_file_name could be
restarted and read the old certificates but not have its new pid written
into pid_file_name yet. In this case, we would try to signal the old
process and fail, then give up, so the new process would never get its
certs refreshed.

On initial startup, unavoidable errors may also be logged when the
process to be launched has not yet been started (because its certs don't
exist yet) so pid_file_name is missing or invalid. These shouldn't be
logged as errors, but we don't really want to suppress all errors for
all pid file signalling since that would hide misconfigurations.

Handle both issues by adding a retry-backoff loop for signalling
pid_file_name. On failure, messages are logged at Info level until
the retry limit is exceeded, then an Error is logged.

During startup the controlling process is expected to be watching for
the creation of the certificates, and to launch the controlled process
and create the pid file in a timely manner once the certs appear. So it
becomes possible to start up without "normal" errors appearing.

For the restart-race case, we will initially fail to signal the stale
pid in the pid file, so we will retry until the new pid is written, so
the new process that loaded stale certs will be properly signalled to
reload.

To enable better testing for this, the retry count is exposed in the
test channel for process signalling.

Fixes spiffe#250, spiffe#251

Signed-off-by: Craig Ringer <[email protected]>
  • Loading branch information
ringerc committed Feb 5, 2025
1 parent 0af5cbf commit f3c7cec
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 59 deletions.
119 changes: 90 additions & 29 deletions pkg/sidecar/sidecar.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ import (
// Whenever an attempt is made to signal pid_file_name, the outcome is sent
// in messages on a channel with this type. Mainly for test purposes.
type pidFileSignalledResult struct {
pid int
err error
pid int
err error
retry int
maxRetries int
}

// Sidecar is the component that consumes the Workload API and renews certs
Expand Down Expand Up @@ -71,6 +73,10 @@ const (
writeStatusUnwritten = "unwritten"
writeStatusFailed = "failed"
writeStatusWritten = "written"

pidFileMaxRetries = 7
pidFileRetryBackoffInitial = 100 * time.Millisecond
pidFileRetryBackoffExponent = 2
)

// New creates a new SPIFFE sidecar
Expand Down Expand Up @@ -233,15 +239,13 @@ func (s *Sidecar) updateCertificates(svidResponse *workloadapi.X509Context) {
s.config.Log.Info("X.509 certificates updated")

if s.config.Cmd != "" {
if err := s.signalProcess(); err != nil {
if err := s.startOrSignalProcess(); err != nil {
s.config.Log.WithError(err).Error("Unable to signal process")
}
}

if s.config.PIDFileName != "" {
if err := s.signalPID(); err != nil {
s.config.Log.WithError(err).Error("Unable to signal PID file")
}
s.retrySignalPIDFile()
}

// TODO: is ReloadExternalProcess still used?
Expand All @@ -257,8 +261,8 @@ func (s *Sidecar) updateCertificates(svidResponse *workloadapi.X509Context) {
}
}

// signalProcessCMD sends the renew signal to the process or starts it if its first time
func (s *Sidecar) signalProcess() error {
// startOrSignalProcess sends the renew signal to the process or starts it if its first time
func (s *Sidecar) startOrSignalProcess() error {
if atomic.LoadInt32(&s.processRunning) == 0 {
cmdArgs, err := getCmdArgs(s.config.CmdArgs)
if err != nil {
Expand All @@ -283,31 +287,88 @@ func (s *Sidecar) signalProcess() error {
return nil
}

// signalPID sends the renew signal to the PID file
func (s *Sidecar) signalPID() error {
pid, err := func() (int, error) {
fileBytes, err := os.ReadFile(s.config.PIDFileName)
if err != nil {
return 0, fmt.Errorf("failed to read pid file \"%s\": %w", s.config.PIDFileName, err)
}
// signalPIDFile sends the renew signal to the PID file
func (s *Sidecar) signalPIDFile() (int, error) {
fileBytes, err := os.ReadFile(s.config.PIDFileName)
if err != nil {
return 0, fmt.Errorf("failed to read pid file \"%s\": %w", s.config.PIDFileName, err)
}

pid, err := strconv.Atoi(string(bytes.TrimSpace(fileBytes)))
if err != nil {
return 0, fmt.Errorf("failed to parse pid file \"%s\": %w", s.config.PIDFileName, err)
}
pid, err := strconv.Atoi(string(bytes.TrimSpace(fileBytes)))
if err != nil {
return 0, fmt.Errorf("failed to parse pid file \"%s\": %w", s.config.PIDFileName, err)
}

pidProcess, err := os.FindProcess(pid)
if err != nil {
return pid, fmt.Errorf("failed to find process id %d: %w", pid, err)
}
pidProcess, err := os.FindProcess(pid)
if err != nil {
return pid, fmt.Errorf("failed to find process id %d: %w", pid, err)
}

return pid, SignalProcess(pidProcess, s.config.RenewSignal)
}()
// Allow tests to observe the outcome of signalling the pid file
if s.pidFileSignalledChan != nil {
s.pidFileSignalledChan <- pidFileSignalledResult{pid: pid, err: err}
err = SignalProcess(pidProcess, s.config.RenewSignal)
if err != nil {
return pidProcess.Pid, err
}
return err
return pidProcess.Pid, nil
}

// retrySignalPIDFile retries signalPIDFile in a short backoff loop to defend
// against races between certificate renewal and pid file creation or update.
//
// It is possible that the pid file may not yet exist if spiffe-helper has been
// started to initially acquire the certs required to start the process that
// the pid file will point to. In this case we want to retry until the pid file
// exists. Though signalling the process should really be unnecessary if we
// just wrote the certificates anyway, this confirms that the pid file is valid
// so misconfiguration can be detected and logged early, rather than waiting
// until the certificate expiry and renewal interval.
//
// It is also possible that the pid pointed to by the pid file might exit and a
// new process be started just as we write a new certificate. In this case it's
// possible that the new process read the old certificates before we overwrote
// them, but the pid file has not been updated before we try to signal it. We
// must retry if signalling the old process fails, until the pid file is
// updated to point to the new process.
//
// Retrying happens in the background to avoid blocking the main loop. This
// could cause multiple signal delivery in the unlikely case of repeated
// renewals within a short period, but that should be harmless.
//
// This should probably be an indefinte retry-until-success loop that starts
// logging errors after a certain number of retries, with configurable initial
// and max retry delays. But it's expected that the pid file will be updated
// quickly, so this is a simple solution for now.
func (s *Sidecar) retrySignalPIDFile() {
go func() {
retryBackoff := pidFileRetryBackoffInitial
i := 0
for {
pid, err := s.signalPIDFile()
// for test observer
if s.pidFileSignalledChan != nil {
s.pidFileSignalledChan <- pidFileSignalledResult{pid: pid, err: err, retry: i, maxRetries: pidFileMaxRetries}
}
msg := s.config.Log.
WithField("pid_file_name", s.config.PIDFileName).
WithField("renew_signal", s.config.RenewSignal).
WithField("pid", pid).
WithField("retry", i).
WithField("max_retries", pidFileMaxRetries)
if err == nil {
msg.Info("Signaled PID file")
return
}
msg = msg.WithError(err)
if i < pidFileMaxRetries {
msg.Info("Error signaling PID file, retrying...")
} else {
msg.Error("Error signaling PID file after multiple retries")
return
}
time.Sleep(retryBackoff)
retryBackoff *= pidFileRetryBackoffExponent
i++
}
}()
}

// Goroutine to watch a running process until it exits and report its exit status
Expand Down
74 changes: 44 additions & 30 deletions pkg/sidecar/sidecar_posix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,14 @@ func TestSidecar_TestPidFileNameSignalling(t *testing.T) {
{
// pid_file_name signalling - we expect the sidecar to send
// a signal to the pid pointed to in the pid file.
name: "pid_file_name signalling",
name: "pid file valid",
signal: syscall.SIGUSR1,
},
{
// Repeat the test with the pid file missing the first time around
name: "pid_file_name signalling",
// Repeat the test with the pid file missing the first time around,
// then create it after confirming the first signal attempt(s)
// failed.
name: "pid file missing then created",
signal: syscall.SIGUSR1,
missingPidfile: true,
},
Expand Down Expand Up @@ -179,39 +181,51 @@ func TestSidecar_TestPidFileNameSignalling(t *testing.T) {
}

if tc.missingPidfile {
// The pid file was missing so signalling will fail.
// The pid file was missing the first time around, so we should
// have received an error with no pid specified.
require.Equal(t, 0, pidFileResult.pid)
require.Error(t, pidFileResult.err)
// Creating the pid file won't help now, so we're done
// until the next renew.
require.Equal(t, 0, pidFileResult.retry)
require.Equal(t, pidFileMaxRetries, pidFileResult.maxRetries)
// Create the pid file. Retry back-off means the sidecar will
// still be waiting and retrying.
writePidFile()
select {
case <-ctx.Done():
// overall context has expired; this will fail the test.
require.NoError(t, ctx.Err())
return
case pidFileResult = <-sidecar.pidFileSignalledChan:
t.Logf("sidecar reports it sent a signal: %+v", pidFileResult)
require.Fail(t, "should not have signalled, since we don't retry signals")
case <-time.After(200 * time.Millisecond):
// A signal retry would've arried by now if there was going to be one
t.Logf("no signal re-delivery after pid file creation")
}
// A cert renewal will trigger another attempt to signal, which should succeed
s.MockUpdateX509Certificate(ctx, t, svid)
select {
case <-ctx.Done():
require.NoError(t, ctx.Err())
return
case pidFileResult = <-sidecar.pidFileSignalledChan:
t.Logf("sidecar reports it sent a signal: %+v", pidFileResult)
require.NoError(t, pidFileResult.err)
// Consume signal attempt notifications until we get a
// successful one or a final maxretries failure. We aren't
// checking if we actually got the forwarded signal yet as
// there could be races there. Only check what the sidecar
// manager thinks it has done.
SIGNALLED:
for {
select {
case <-ctx.Done():
// overall context has expired; this will fail the test.
require.NoError(t, ctx.Err())
return
case pidFileResult = <-sidecar.pidFileSignalledChan:
t.Logf("sidecar reports it attempted to send a signal: %+v", pidFileResult)
if pidFileResult.err == nil {
// Sidecar thinks it sent a signal successfully on retry
break SIGNALLED
}
if pidFileResult.retry == pidFileMaxRetries {
// Sidecar ran out of retries and gave up
require.Fail(t, "pid file signalling failed after max retries")
}
}
}
// Must not be the first retry
require.Greater(t, pidFileResult.retry, 0)
} else {
// Since a valid pid file was supplied, signalling must succeed
// on the first attempt
require.Equal(t, 0, pidFileResult.retry)
}
// Since a valid pid file was supplied, signalling must succeed
// on the first attempt
require.Equal(t, pid, pidFileResult.pid)

// The pid file was present, so we should have received a signal
require.NoError(t, pidFileResult.err)
require.Equal(t, pid, pidFileResult.pid)
require.Equal(t, pidFileMaxRetries, pidFileResult.maxRetries)

// Did we actually receive the signal?
select {
Expand Down

0 comments on commit f3c7cec

Please sign in to comment.