From 02040089a6c3eb96f231974cdaa4158dc894b5eb Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Fri, 7 Oct 2022 10:29:49 +0200 Subject: health checks: make on-failure action retry aware Make sure that the on-failure actions only kick in once the health check has passed its retries. Also fix race conditions on reading/writing the log. Signed-off-by: Valentin Rothberg --- libpod/healthcheck.go | 43 ++++++++++++++++++++++------------------ test/system/220-healthcheck.bats | 9 +++++---- test/system/250-systemd.bats | 1 + 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index a589f2787..1a81360f4 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -32,18 +32,19 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { } hcStatus, err := checkHealthCheckCanBeRun(container) - if err == nil { - hcStatus, err := container.runHealthCheck() - if err := container.processHealthCheckStatus(hcStatus); err != nil { - return hcStatus, err - } + if err != nil { + return hcStatus, err + } + + hcStatus, logStatus, err := container.runHealthCheck() + if err := container.processHealthCheckStatus(logStatus); err != nil { return hcStatus, err } return hcStatus, err } // runHealthCheck runs the health check as defined by the container -func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) { +func (c *Container) runHealthCheck() (define.HealthCheckStatus, string, error) { var ( newCommand []string returnCode int @@ -51,11 +52,11 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) { ) hcCommand := c.HealthCheckConfig().Test if len(hcCommand) < 1 { - return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID()) + return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID()) } switch hcCommand[0] { case "", define.HealthConfigTestNone: - return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID()) + return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID()) case define.HealthConfigTestCmd: newCommand = hcCommand[1:] case define.HealthConfigTestCmdShell: @@ -66,11 +67,11 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) { newCommand = hcCommand } if len(newCommand) < 1 || newCommand[0] == "" { - return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID()) + return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID()) } rPipe, wPipe, err := os.Pipe() if err != nil { - return define.HealthCheckInternalError, fmt.Errorf("unable to create pipe for healthcheck session: %w", err) + return define.HealthCheckInternalError, "", fmt.Errorf("unable to create pipe for healthcheck session: %w", err) } defer wPipe.Close() defer rPipe.Close() @@ -135,15 +136,16 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) { } hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog) - if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil { - return hcResult, fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err) + logStatus, err := c.updateHealthCheckLog(hcl, inStartPeriod) + if err != nil { + return hcResult, "", fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err) } - return hcResult, hcErr + return hcResult, logStatus, hcErr } -func (c *Container) processHealthCheckStatus(status define.HealthCheckStatus) error { - if status == define.HealthCheckSuccess { +func (c *Container) processHealthCheckStatus(status string) error { + if status != define.HealthCheckUnhealthy { return nil } @@ -211,10 +213,13 @@ func (c *Container) updateHealthStatus(status string) error { } // UpdateHealthCheckLog parses the health check results and writes the log -func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) error { +func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) (string, error) { + c.lock.Lock() + defer c.lock.Unlock() + healthCheck, err := c.getHealthCheckLog() if err != nil { - return err + return "", err } if hcl.ExitCode == 0 { // set status to healthy, reset failing state to 0 @@ -239,9 +244,9 @@ func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPerio } newResults, err := json.Marshal(healthCheck) if err != nil { - return fmt.Errorf("unable to marshall healthchecks for writing: %w", err) + return "", fmt.Errorf("unable to marshall healthchecks for writing: %w", err) } - return os.WriteFile(c.healthCheckLogPath(), newResults, 0700) + return healthCheck.Status, os.WriteFile(c.healthCheckLogPath(), newResults, 0700) } // HealthCheckLogPath returns the path for where the health check log is diff --git a/test/system/220-healthcheck.bats b/test/system/220-healthcheck.bats index a1b24d293..8e14adcec 100644 --- a/test/system/220-healthcheck.bats +++ b/test/system/220-healthcheck.bats @@ -28,10 +28,11 @@ function _check_health { --health-cmd /healthcheck \ --health-interval 1s \ --health-retries 3 \ + --health-on-failure=kill \ healthcheck_i run_podman inspect healthcheck_c --format "{{.Config.HealthcheckOnFailureAction}}" - is "$output" "none" "default on-failure action is none" + is "$output" "kill" "on-failure action is set to kill" # We can't check for 'starting' because a 1-second interval is too # short; it could run healthcheck before we get to our first check. @@ -67,9 +68,8 @@ Log[-1].ExitCode | 1 Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\" " - # healthcheck should now fail, with exit status 1 and 'unhealthy' output - run_podman 1 healthcheck run healthcheck_c - is "$output" "unhealthy" "output from 'podman healthcheck run'" + # now the on-failure should kick in and kill the container + podman wait healthcheck_c # Clean up run_podman rm -t 0 -f healthcheck_c @@ -95,6 +95,7 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\" # Run that healthcheck image. run_podman run -d --name $ctr \ --health-cmd /healthcheck \ + --health-retries=1 \ --health-on-failure=$policy \ $img diff --git a/test/system/250-systemd.bats b/test/system/250-systemd.bats index ddec3a492..5526728b6 100644 --- a/test/system/250-systemd.bats +++ b/test/system/250-systemd.bats @@ -318,6 +318,7 @@ LISTEN_FDNAMES=listen_fdnames" | sort) run_podman create --name $cname \ --health-cmd /healthcheck \ --health-on-failure=kill \ + --health-retries=1 \ --restart=on-failure \ $img -- cgit v1.2.3-54-g00ecf