From fc966d576e640a3b64788035dd56cf3f90e5210e Mon Sep 17 00:00:00 2001 From: Lennart Altenhof Date: Wed, 8 Nov 2023 09:45:13 +0100 Subject: [PATCH] fix(reaper): fix race condition when reusing reapers (#1904) * fix(reaper): fix race condition when reusing reapers Reaper reuse/creation logic has been adjusted to facilitate the reuse of already running reapers. This includes using a specific naming convention based on the session id, and handling failed container creation attempts due to name conflicts by retrieving the already running reaper container. This fixes a race condition when tests are run in parallel in multiple packages which renders global locks ineffective. * docs(reaper): make reaper recovery log entries the same The log message related to the reaper container in the reaper.go file has been updated for better clarity. The redundant phrase "Canceling creation -" has been removed as it does not provide additional relevant information, aiming to improve log readability. * docs(reaper): improve documentation for issues during reaper recovery Further comments where added to error handling in the reaper.go file, to better account for possible race conditions. This includes conditions where a container creation fails due to name conflict but no containers are visible in list-requests. Additionally, a possible scenario where the container may have died between requests has been covered. * style(reaper): change error message style in reaper recovery --- reaper.go | 88 ++++++++++++++++++++++++++++++++++++++++++-------- reaper_test.go | 46 ++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 13 deletions(-) diff --git a/reaper.go b/reaper.go index f5d571f2c0..dda10c281e 100644 --- a/reaper.go +++ b/reaper.go @@ -6,6 +6,7 @@ import ( "fmt" "math/rand" "net" + "regexp" "strings" "sync" "time" @@ -50,6 +51,14 @@ func NewReaper(ctx context.Context, sessionID string, provider ReaperProvider, r return reuseOrCreateReaper(ctx, sessionID, provider, WithImageName(reaperImageName)) } +// reaperContainerNameFromSessionID returns the container name that uniquely +// identifies the container based on the session id. +func reaperContainerNameFromSessionID(sessionID string) string { + // The session id is 64 characters, so we will not hit the limit of 128 + // characters for container names. + return fmt.Sprintf("reaper_%s", sessionID) +} + // lookUpReaperContainer returns a DockerContainer type with the reaper container in the case // it's found in the running state, and including the labels for sessionID, reaper, and ryuk. // It will perform a retry with exponential backoff to allow for the container to be started and @@ -67,7 +76,7 @@ func lookUpReaperContainer(ctx context.Context, sessionID string) (*DockerContai // we want random intervals between 100ms and 500ms for concurrent executions // to not be synchronized: it could be the case that multiple executions of this - // function happen at the same time (specially when called from a different test + // function happen at the same time (specifically when called from a different test // process execution), and we want to avoid that they all try to find the reaper // container at the same time. exp.InitialInterval = time.Duration(rand.Intn(5)*100) * time.Millisecond @@ -82,6 +91,7 @@ func lookUpReaperContainer(ctx context.Context, sessionID string) (*DockerContai filters.Arg("label", fmt.Sprintf("%s=%s", testcontainersdocker.LabelSessionID, sessionID)), filters.Arg("label", fmt.Sprintf("%s=%t", testcontainersdocker.LabelReaper, true)), filters.Arg("label", fmt.Sprintf("%s=%t", testcontainersdocker.LabelRyuk, true)), + filters.Arg("name", reaperContainerNameFromSessionID(sessionID)), } resp, err := dockerClient.ContainerList(ctx, types.ContainerListOptions{ @@ -146,19 +156,11 @@ func reuseOrCreateReaper(ctx context.Context, sessionID string, provider ReaperP reaperContainer, err := lookUpReaperContainer(context.Background(), sessionID) if err == nil && reaperContainer != nil { // The reaper container exists as a Docker container: re-use it - endpoint, err := reaperContainer.PortEndpoint(ctx, "8080", "") + Logger.Printf("🔥 Reaper obtained from Docker for this test session %s", reaperContainer.ID) + reaperInstance, err = reuseReaperContainer(ctx, sessionID, provider, reaperContainer) if err != nil { return nil, err } - - Logger.Printf("🔥 Reaper obtained from Docker for this test session %s", reaperContainer.ID) - reaperInstance = &Reaper{ - Provider: provider, - SessionID: sessionID, - Endpoint: endpoint, - container: reaperContainer, - } - return reaperInstance, nil } @@ -182,8 +184,25 @@ func reuseOrCreateReaper(ctx context.Context, sessionID string, provider ReaperP return reaperInstance, nil } -// newReaper creates a Reaper with a sessionID to identify containers and a provider to use -// Do not call this directly, use reuseOrCreateReaper instead +var createContainerFailDueToNameConflictRegex = regexp.MustCompile("Conflict. The container name .* is already in use by container .*") + +// reuseReaperContainer constructs a Reaper from an already running reaper +// DockerContainer. +func reuseReaperContainer(ctx context.Context, sessionID string, provider ReaperProvider, reaperContainer *DockerContainer) (*Reaper, error) { + endpoint, err := reaperContainer.PortEndpoint(ctx, "8080", "") + if err != nil { + return nil, err + } + return &Reaper{ + Provider: provider, + SessionID: sessionID, + Endpoint: endpoint, + container: reaperContainer, + }, nil +} + +// newReaper creates a Reaper with a sessionID to identify containers and a +// provider to use. Do not call this directly, use reuseOrCreateReaper instead. func newReaper(ctx context.Context, sessionID string, provider ReaperProvider, opts ...ContainerOption) (*Reaper, error) { dockerHostMount := testcontainersdocker.ExtractDockerSocket(ctx) @@ -209,6 +228,7 @@ func newReaper(ctx context.Context, sessionID string, provider ReaperProvider, o Mounts: Mounts(BindMount(dockerHostMount, "/var/run/docker.sock")), Privileged: tcConfig.RyukPrivileged, WaitingFor: wait.ForListeningPort(listeningPort), + Name: reaperContainerNameFromSessionID(sessionID), ReaperOptions: opts, HostConfigModifier: func(hc *container.HostConfig) { hc.AutoRemove = true @@ -237,6 +257,48 @@ func newReaper(ctx context.Context, sessionID string, provider ReaperProvider, o c, err := provider.RunContainer(ctx, req) if err != nil { + // We need to check whether the error is caused by a container with the same name + // already existing due to race conditions. We manually match the error message + // as we do not have any error types to check against. + if createContainerFailDueToNameConflictRegex.MatchString(err.Error()) { + // Manually retrieve the already running reaper container. However, we need to + // use retries here as there are two possible race conditions that might lead to + // errors: In most cases, there is a small delay between container creation and + // actually being visible in list-requests. This means that creation might fail + // due to name conflicts, but when we list containers with this name, we do not + // get any results. In another case, the container might have simply died in the + // meantime and therefore cannot be found. + const timeout = 5 * time.Second + const cooldown = 100 * time.Millisecond + start := time.Now() + var reaperContainer *DockerContainer + for time.Since(start) < timeout { + reaperContainer, err = lookUpReaperContainer(ctx, sessionID) + if err == nil && reaperContainer != nil { + break + } + select { + case <-ctx.Done(): + case <-time.After(cooldown): + } + } + if err != nil { + return nil, fmt.Errorf("look up reaper container due to name conflict failed: %w", err) + } + // If the reaper container was not found, it is most likely to have died in + // between as we can exclude any client errors because of the previous error + // check. Because the reaper should only die if it performed clean-ups, we can + // fail here as the reaper timeout needs to be increased, anyway. + if reaperContainer == nil { + return nil, fmt.Errorf("look up reaper container returned nil although creation failed due to name conflict") + } + Logger.Printf("🔥 Reaper obtained from Docker for this test session %s", reaperContainer.ID) + reaper, err := reuseReaperContainer(ctx, sessionID, provider, reaperContainer) + if err != nil { + return nil, err + } + return reaper, nil + } return nil, err } reaper.container = c diff --git a/reaper_test.go b/reaper_test.go index 1aa361b179..22572b375a 100644 --- a/reaper_test.go +++ b/reaper_test.go @@ -511,3 +511,49 @@ func TestReaper_reuseItFromOtherTestProgramUsingDocker(t *testing.T) { terminateContainerOnEnd(t, ctx, reaper.container) } } + +// TestReaper_ReuseRunning tests whether reusing the reaper if using +// testcontainers from concurrently multiple packages works as expected. In this +// case, global locks are without any effect as Go tests different packages +// isolated. Therefore, this test does not use the same logic with locks on +// purpose. We expect reaper creation to still succeed in case a reaper is +// already running for the same session id by returning its container instance +// instead. +func TestReaper_ReuseRunning(t *testing.T) { + const concurrency = 64 + + timeout, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + sessionID := SessionID() + + dockerProvider, err := NewDockerProvider() + require.NoError(t, err, "new docker provider should not fail") + + obtainedReaperContainerIDs := make([]string, concurrency) + var wg sync.WaitGroup + for i := 0; i < concurrency; i++ { + i := i + wg.Add(1) + go func() { + defer wg.Done() + reaperContainer, err := lookUpReaperContainer(timeout, sessionID) + if err == nil && reaperContainer != nil { + // Found. + obtainedReaperContainerIDs[i] = reaperContainer.GetContainerID() + return + } + // Not found -> create. + createdReaper, err := newReaper(timeout, sessionID, dockerProvider) + require.NoError(t, err, "new reaper should not fail") + obtainedReaperContainerIDs[i] = createdReaper.container.GetContainerID() + }() + } + wg.Wait() + + // Assure that all calls returned the same container. + firstContainerID := obtainedReaperContainerIDs[0] + for i, containerID := range obtainedReaperContainerIDs { + assert.Equal(t, firstContainerID, containerID, "call %d should have returned same container id", i) + } +}