Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

testiso: improve error when QEMU is terminated #3192

Merged
merged 2 commits into from
Nov 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion mantle/cmd/kola/testiso.go
Original file line number Diff line number Diff line change
Expand Up @@ -537,8 +537,12 @@ func awaitCompletion(ctx context.Context, inst *platform.QemuInstance, outdir st
}
go func() {
err := inst.Wait()
// only one Wait() gets process data, so also manually check for signal
if err == nil && inst.Signaled() {
err = errors.New("process killed")
}
if err != nil {
errchan <- err
errchan <- errors.Wrapf(err, "QEMU unexpectedly exited while awaiting completion")
}
time.Sleep(1 * time.Minute)
errchan <- fmt.Errorf("QEMU exited; timed out waiting for completion")
Expand All @@ -549,6 +553,10 @@ func awaitCompletion(ctx context.Context, inst *platform.QemuInstance, outdir st
l, err := r.ReadString('\n')
if err != nil {
if err == io.EOF {
// this may be from QEMU getting killed or exiting; wait a bit
// to give a chance for .Wait() above to feed the channel with a
// better error
time.Sleep(1 * time.Second)
errchan <- fmt.Errorf("Got EOF from completion channel, %s expected", exp)
} else {
errchan <- errors.Wrapf(err, "reading from completion channel")
Expand Down
15 changes: 15 additions & 0 deletions mantle/platform/metal.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"os"
"path/filepath"
"strings"
"time"

coreosarch "github.com/coreos/stream-metadata-go/arch"
"github.com/pkg/errors"
Expand Down Expand Up @@ -435,11 +436,25 @@ func (t *installerRun) completePxeSetup(kargs []string) error {

func switchBootOrderSignal(qinst *QemuInstance, bootstartedchan *os.File, booterrchan *chan error) {
*booterrchan = make(chan error)
go func() {
err := qinst.Wait()
// only one Wait() gets process data, so also manually check for signal
if err == nil && qinst.Signaled() {
err = errors.New("process killed")
}
if err != nil {
*booterrchan <- errors.Wrapf(err, "QEMU unexpectedly exited while waiting for %s", bootStartedSignal)
}
}()
go func() {
r := bufio.NewReader(bootstartedchan)
l, err := r.ReadString('\n')
if err != nil {
if err == io.EOF {
// this may be from QEMU getting killed or exiting; wait a bit
// to give a chance for .Wait() above to feed the channel with a
// better error
time.Sleep(1 * time.Second)
*booterrchan <- fmt.Errorf("Got EOF from boot started channel, %s expected", bootStartedSignal)
} else {
*booterrchan <- errors.Wrapf(err, "reading from boot started channel")
Expand Down
5 changes: 5 additions & 0 deletions mantle/platform/qemu.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,11 @@ type QemuInstance struct {
qmpSocketPath string
}

// Signaled returns whether QEMU process was signaled.
func (inst *QemuInstance) Signaled() bool {
return inst.qemu.Signaled()
}

// Pid returns the PID of QEMU process.
func (inst *QemuInstance) Pid() int {
return inst.qemu.Pid()
Expand Down
11 changes: 11 additions & 0 deletions mantle/system/exec/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ type Cmd interface {

// Simplified wrapper for Process.Pid
Pid() int

// Simplified wrapper to know if a process was signaled
Signaled() bool
}

// Basic Cmd implementation based on exec.Cmd
Expand Down Expand Up @@ -93,6 +96,14 @@ func (cmd *ExecCmd) Kill() error {
return err
}

func (cmd *ExecCmd) Signaled() bool {
if cmd.ProcessState == nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect the Go race detector will flag this; I believe the separate goroutine that is blocking in Wait() will be the one that writes to this data. Probably hard to fix though without reworking things completely.

If we were to rework things...I think we want a single stream of events from the qemu process effectively, then we could centralize the sleep(1) thing there.

return false
}
status := cmd.ProcessState.Sys().(syscall.WaitStatus)
return status.Signaled()
}

func (cmd *ExecCmd) Pid() int {
return cmd.Process.Pid
}
Expand Down