From adf376d3ed3cb2bb20dd66c35e2885b886b05ed9 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Fri, 15 Sep 2023 13:10:25 +0200 Subject: [PATCH] web: informative and verbose error message when watchdog fails (#647) Right now we use panicf which leads a stack trace which is misleading at what is happening and fills up the space used by kubernetes error reporting. Additionally a few times we have had bug reports about the watchdog failing. This commit updates the message to be far more informative about next steps. Additionally we update the watchdog error to include the response body in case that contains useful information for debugging. Test Plan: Updated the serveHealthz handler to always return an error. Then ran the following $ ZOEKT_WATCHDOG_TICK=1s go run ./cmd/zoekt-webserver 2023/09/14 15:55:27 custom ZOEKT_WATCHDOG_TICK=1s 2023/09/14 15:55:27 loading 1 shard(s): github.com%2Fsourcegraph%2Fzoekt_v16.00000.zoekt 2023/09/14 15:55:28 watchdog: failed, will try 2 more times: watchdog: status=500 body="not ready: boom\n" 2023/09/14 15:55:29 watchdog: failed, will try 1 more times: watchdog: status=500 body="not ready: boom\n" 2023/09/14 15:55:30 watchdog health check has consecutively failed 3 times indicating is likely an unrecoverable error affecting zoekt. As such this process will exit with code 3. Final error: watchdog: status=500 body="not ready: boom\n" Possible Remediations: - If this rarely happens, ignore and let your process manager restart zoekt. - Possibly under provisioned. Try increasing CPU or disk IO. - A bug. Reach out with logs and screenshots of metrics when this occurs. exit status 3 --- cmd/zoekt-webserver/main.go | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/cmd/zoekt-webserver/main.go b/cmd/zoekt-webserver/main.go index c0865e010..22e2a0b68 100644 --- a/cmd/zoekt-webserver/main.go +++ b/cmd/zoekt-webserver/main.go @@ -24,6 +24,7 @@ import ( "flag" "fmt" "html/template" + "io" "log" "net" "net/http" @@ -438,9 +439,11 @@ func watchdogOnce(ctx context.Context, client *http.Client, addr string) error { if err != nil { return err } + body, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() if resp.StatusCode != http.StatusOK { - return fmt.Errorf("watchdog: status %v", resp.StatusCode) + return fmt.Errorf("watchdog: status=%v body=%q", resp.StatusCode, string(body)) } return nil } @@ -462,7 +465,15 @@ func watchdog(dt time.Duration, maxErrCount int, addr string) { metricWatchdogErrors.Set(float64(errCount)) metricWatchdogErrorsTotal.Inc() if errCount >= maxErrCount { - log.Panicf("watchdog: %v", err) + log.Printf(`watchdog health check has consecutively failed %d times indicating is likely an unrecoverable error affecting zoekt. As such this process will exit with code 3. + +Final error: %v + +Possible remediations: +- If this rarely happens, ignore and let your process manager restart zoekt. +- Possibly under provisioned. Try increasing CPU or disk IO. +- A bug. Reach out with logs and screenshots of metrics when this occurs.`, errCount, err) + os.Exit(3) } else { log.Printf("watchdog: failed, will try %d more times: %v", maxErrCount-errCount, err) }