This repository has been archived by the owner on Jul 16, 2020. It is now read-only.

ciao-launcher: Fix unit test deadlocks
The code in instance_test.go that waited for the instance loop to close
down was incorrect.  There was a possibility of deadlock if the instance
loop was sending some stats down the overseer channel at the same time
as the test was trying to shut down the instance.  There is similar code
in the overseer which actually shuts down the instance loop correctly.
This commit simply ports the good overseer code over to instance_test.go.

Partial fix for #434

Signed-off-by: Mark Ryan <[email protected]>
Mark Ryan committed Aug 17, 2016
1 parent 2abcb88 commit b764708
Showing 1 changed file with 49 additions and 43 deletions.
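
The deadlock and its fix come down to a standard Go shutdown pattern: if the test stops draining a channel too early, it can strand the instance loop mid-send while the test blocks in wg.Wait(). Below is a minimal, self-contained sketch of the pattern the commit ports from the overseer; the names (worker, statsCh, exited) are invented for illustration and are not the launcher's actual identifiers.

package main

import (
	"fmt"
	"sync"
	"time"
)

// worker mimics the instance loop: it periodically sends stats and exits
// once doneCh is closed.
func worker(doneCh chan struct{}, statsCh chan int, wg *sync.WaitGroup) {
	defer wg.Done()
	for {
		select {
		case <-doneCh:
			return
		case statsCh <- 42: // may be blocked here when shutdown begins
		}
	}
}

func main() {
	doneCh := make(chan struct{})
	statsCh := make(chan int) // unbuffered, like the overseer channel
	var wg sync.WaitGroup

	wg.Add(1)
	go worker(doneCh, statsCh, &wg)

	close(doneCh) // ask the worker to stop

	// Adapt wg.Wait() into a channel so it can appear in a select.
	exited := make(chan struct{})
	go func() {
		wg.Wait()
		close(exited)
	}()

	timeout := time.After(5 * time.Second)
DONE:
	for {
		select {
		case <-statsCh: // keep draining so a mid-send worker can finish
		case <-exited: // worker has exited; safe to stop draining
			break DONE
		case <-timeout:
			fmt.Println("timed out waiting for worker to exit")
			break DONE
		}
	}
	fmt.Println("shutdown complete")
}

Without the draining case, a bare wg.Wait() in the test can block forever while the worker blocks on statsCh <- 42, which is exactly the deadlock described in the commit message.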
ciao-launcher/instance_test.go
@@ -267,9 +267,10 @@ func (v *instanceTestState) ClusterConfiguration() (payloads.Configure, error) {
 	return payloads.Configure{}, nil
 }
 
-func cleanupShutdownFail(t *testing.T, instance string, doneCh chan struct{}, ovsCh chan interface{}) {
+func cleanupShutdownFail(t *testing.T, instance string, doneCh chan struct{}, ovsCh chan interface{}, wg *sync.WaitGroup) {
 	_ = os.RemoveAll(path.Join(instancesDir, instance))
-	shutdownInstanceLoop(doneCh, ovsCh)
+
+	shutdownInstanceLoop(doneCh, ovsCh, wg, t)
 	t.FailNow()
 }
 
@@ -383,16 +384,26 @@ func (v *instanceTestState) restartInstance(t *testing.T, ovsCh chan interface{}
 	}
 }
 
-func shutdownInstanceLoop(doneCh chan struct{}, ovsCh chan interface{}) {
+func shutdownInstanceLoop(doneCh chan struct{}, ovsCh chan interface{}, wg *sync.WaitGroup,
+	t *testing.T) {
 	close(doneCh)
 
+	timeout := time.After(time.Second * 5)
 DONE:
 	for {
 		select {
-		case _, ok := <-ovsCh:
-			if !ok {
-				break DONE
-			}
-		default:
+		case <-ovsCh:
+		case <-timeout:
+			t.Error("Timedout waiting for instance loop to exit")
+			break DONE
+		case <-func() chan struct{} {
+			ch := make(chan struct{})
+			go func() {
+				wg.Wait()
+				close(ch)
+			}()
+			return ch
+		}():
+			break DONE
 		}
 	}
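
(A note on the ported code above: sync.WaitGroup has no channel interface, so wg.Wait() cannot be a select case directly. The inline func() chan struct{} literal spawns a goroutine that closes a channel once wg.Wait() returns, letting a single select drain ovsCh, observe the loop's exit, and enforce the five-second timeout.)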
@@ -418,11 +429,10 @@ func TestStartInstanceLoop(t *testing.T) {
 	ac := &agentClient{conn: state, cmdCh: cmdWrapCh}
 	_ = startInstanceWithVM(state.instance, cfg, &wg, doneCh, ac, ovsCh, state, &storage.NoopDriver{})
 	ok := state.expectStatsUpdate(t, ovsCh)
-	shutdownInstanceLoop(doneCh, ovsCh)
+	shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 	if !ok {
 		t.FailNow()
 	}
-	wg.Wait()
 }
 
 // Checks an instance loop can be deleted before an instance is launched.
@@ -449,12 +459,12 @@ func TestDeleteInstanceLoop(t *testing.T) {
 
 	ok := state.expectStatsUpdate(t, ovsCh)
 	if !ok {
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.FailNow()
 	}
 
 	if !state.deleteInstance(t, ovsCh, cmdCh) {
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.FailNow()
 	}
 	wg.Wait()
@@ -483,22 +493,22 @@ func TestStopNotRunning(t *testing.T) {
 
 	ok := state.expectStatsUpdate(t, ovsCh)
 	if !ok {
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.FailNow()
 	}
 
 	select {
 	case cmdCh <- &insStopCmd{}:
 	case <-time.After(time.Second):
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.Fatal("Timed out sending Stop command")
 	}
 
 	select {
 	case <-state.errorCh:
 		state.errorCh = nil
 	case <-time.After(time.Second):
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.Fatal("Timed out waiting for error channel")
 	}
 
@@ -508,7 +518,7 @@ func TestStopNotRunning(t *testing.T) {
 	}
 
 	if !state.deleteInstance(t, ovsCh, cmdCh) {
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.FailNow()
 	}
 	wg.Wait()
@@ -527,12 +537,12 @@ func startVMWithCFG(t *testing.T, wg *sync.WaitGroup, cfg *vmConfig, connect boo
 	state.ac = &agentClient{conn: state, cmdCh: make(chan *cmdWrapper)}
 	cmdCh := startInstanceWithVM(state.instance, cfg, wg, doneCh, state.ac, ovsCh, state, &storage.NoopDriver{})
 	if !state.expectStatsUpdate(t, ovsCh) {
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, wg, t)
 		t.FailNow()
 	}
 
 	if !state.startInstance(t, ovsCh, cmdCh, cfg, errorOk) {
-		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh)
+		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh, wg)
 	}
 	return state, ovsCh, cmdCh, doneCh
 }
@@ -552,7 +562,7 @@ func TestStartNotRunning(t *testing.T) {
 	state, ovsCh, cmdCh, doneCh := startVMWithCFG(t, &wg, &cfg, true, false)
 
 	if !state.deleteInstance(t, ovsCh, cmdCh) {
-		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh)
+		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh, &wg)
 	}
 
 	wg.Wait()
@@ -593,14 +603,12 @@ func TestLoopShutdownWithRunningInstance(t *testing.T) {
 	cfg := standardCfg
 	_, ovsCh, _, doneCh := startVMWithCFG(t, &wg, &cfg, true, false)
 
-	shutdownInstanceLoop(doneCh, ovsCh)
+	shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 
 	// We need to remove the instance manually to have a clean setup for the
 	// subsequent tests.
 
 	_ = os.RemoveAll(path.Join(instancesDir, cfg.Instance))
-
-	wg.Wait()
 }
 
 // Check we can restart an instance
@@ -628,17 +636,16 @@ func TestRestart(t *testing.T) {
 	cmdCh := startInstanceWithVM(state.instance, &cfg, &wg, doneCh, ac, ovsCh, state, &storage.NoopDriver{})
 	ok := state.expectStatsUpdate(t, ovsCh)
 	if !ok {
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.FailNow()
 	}
 
 	if !state.restartInstance(t, ovsCh, cmdCh, false) {
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.FailNow()
 	}
 
-	shutdownInstanceLoop(doneCh, ovsCh)
-	wg.Wait()
+	shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 }
 
 // Check we can handle a restart error
@@ -666,7 +673,7 @@ func TestRestartFail(t *testing.T) {
 	cmdCh := startInstanceWithVM(state.instance, &cfg, &wg, doneCh, ac, ovsCh, state, &storage.NoopDriver{})
 	ok := state.expectStatsUpdate(t, ovsCh)
 	if !ok {
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.FailNow()
 	}
 
@@ -679,8 +686,7 @@ func TestRestartFail(t *testing.T) {
 			state.rf.Reason, payloads.RestartLaunchFailure)
 	}
 
-	shutdownInstanceLoop(doneCh, ovsCh)
-	wg.Wait()
+	shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 }
 
 // Check we get an error when starting an instance with an invalid image
@@ -710,11 +716,11 @@ func TestStartBadImage(t *testing.T) {
 		select {
 		case cmdCh <- acCmd.cmd:
 		case <-time.After(time.Second):
-			shutdownInstanceLoop(doneCh, ovsCh)
+			shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 			t.Fatal("Timed out sending suicide command")
 		}
 	case <-time.After(time.Second):
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.Fatal("Timedout waiting from suicide command")
 	}
 	wg.Wait()
@@ -742,30 +748,30 @@ func sendCommandDuringSuicide(t *testing.T, testCmd interface{}) *instanceTestSt
 	select {
 	case acCmd = <-state.ac.cmdCh:
 	case <-time.After(time.Second):
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.Fatal("Timedout waiting from suicide command")
 	}
 
 	state.errorCh = make(chan struct{})
 	select {
 	case cmdCh <- testCmd:
 	case <-time.After(time.Second):
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.Fatal("Timed out sending command during suicide")
 	}
 
 	select {
 	case <-state.errorCh:
 		state.errorCh = nil
 	case <-time.After(time.Second):
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.Fatal("Timed out waiting on error channel")
 	}
 
 	select {
 	case cmdCh <- acCmd.cmd:
 	case <-time.After(time.Second):
-		shutdownInstanceLoop(doneCh, ovsCh)
+		shutdownInstanceLoop(doneCh, ovsCh, &wg, t)
 		t.Fatal("Timed out sending suicide command")
 	}
 
@@ -855,15 +861,15 @@ func TestLostInstance(t *testing.T) {
 	close(state.monitorClosedCh)
 
 	if !waitForStateChange(t, ovsStopped, ovsCh) {
-		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh)
+		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh, &wg)
 	}
 
 	// This gets closed by the instanceLoop and so will become available
 	// in the deleteInstance select loop if we don't set it to nil.
 	state.monitorCh = nil
 
 	if !state.deleteInstance(t, ovsCh, cmdCh) {
-		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh)
+		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh, &wg)
 	}
 
 	wg.Wait()
@@ -884,7 +890,7 @@ func TestStartRunningInstance(t *testing.T) {
 	state, ovsCh, cmdCh, doneCh := startVMWithCFG(t, &wg, &cfg, true, false)
 
 	if !state.startInstance(t, ovsCh, cmdCh, &cfg, true) {
-		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh)
+		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh, &wg)
 	}
 
 	if state.stf.Reason != payloads.AlreadyRunning {
@@ -893,7 +899,7 @@ func TestStartRunningInstance(t *testing.T) {
 	}
 
 	if !state.deleteInstance(t, ovsCh, cmdCh) {
-		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh)
+		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh, &wg)
 	}
 
 	wg.Wait()
@@ -927,7 +933,7 @@ func TestAttachVolumeToInstance(t *testing.T) {
 	}
 
 	if !state.deleteInstance(t, ovsCh, cmdCh) {
-		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh)
+		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh, &wg)
 	}
 
 	wg.Wait()
@@ -979,7 +985,7 @@ func TestAttachExistingVolumeToInstance(t *testing.T) {
 	}
 
 	if !state.deleteInstance(t, ovsCh, cmdCh) {
-		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh)
+		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh, &wg)
 	}
 
 	wg.Wait()
@@ -1027,7 +1033,7 @@ func TestDetachVolumeFromInstance(t *testing.T) {
 	}
 
 	if !state.deleteInstance(t, ovsCh, cmdCh) {
-		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh)
+		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh, &wg)
 	}
 
 	wg.Wait()
@@ -1063,7 +1069,7 @@ func TestDetachNonexistingVolumeFromInstance(t *testing.T) {
 	}
 
 	if !state.deleteInstance(t, ovsCh, cmdCh) {
-		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh)
+		cleanupShutdownFail(t, cfg.Instance, doneCh, ovsCh, &wg)
 	}
 
 	wg.Wait()
