From aad29e759c78b415a3b0393d7aba2bddbbc0cd3e Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Wed, 7 Sep 2022 11:45:30 +0200 Subject: health check: add on-failure actions For systems that have extreme robustness requirements (edge devices, particularly those in difficult to access environments), it is important that applications continue running in all circumstances. When the application fails, Podman must restart it automatically to provide this robustness. Otherwise, these devices may require customer IT to physically gain access to restart, which can be prohibitively difficult. Add a new `--on-failure` flag that supports four actions: - **none**: Take no action. - **kill**: Kill the container. - **restart**: Restart the container. Do not combine the `restart` action with the `--restart` flag. When running inside of a systemd unit, consider using the `kill` or `stop` action instead to make use of systemd's restart policy. - **stop**: Stop the container. To remain backwards compatible, **none** is the default action. Signed-off-by: Valentin Rothberg --- libpod/container_config.go | 3 ++ libpod/container_inspect.go | 2 ++ libpod/container_validate.go | 4 +++ libpod/define/container_inspect.go | 2 ++ libpod/define/healthchecks.go | 74 ++++++++++++++++++++++++++++++++++++++ libpod/healthcheck.go | 41 ++++++++++++++++++++- libpod/options.go | 11 ++++++ 7 files changed, 136 insertions(+), 1 deletion(-) (limited to 'libpod') diff --git a/libpod/container_config.go b/libpod/container_config.go index bd9816651..f3585d22c 100644 --- a/libpod/container_config.go +++ b/libpod/container_config.go @@ -7,6 +7,7 @@ import ( "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/secrets" "github.com/containers/image/v5/manifest" + "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/pkg/namespaces" "github.com/containers/podman/v4/pkg/specgen" "github.com/containers/storage" @@ -392,6 +393,8 @@ type ContainerMiscConfig struct { Systemd *bool `json:"systemd,omitempty"` // HealthCheckConfig has the health check command and related timings HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"` + // HealthCheckOnFailureAction defines an action to take once the container turns unhealthy. + HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"healthcheck_on_failure_action"` // PreserveFDs is a number of additional file descriptors (in addition // to 0, 1, 2) that will be passed to the executed process. The total FDs // passed will be 3 + PreserveFDs. diff --git a/libpod/container_inspect.go b/libpod/container_inspect.go index 5e2ab2818..ad8bae286 100644 --- a/libpod/container_inspect.go +++ b/libpod/container_inspect.go @@ -390,6 +390,8 @@ func (c *Container) generateInspectContainerConfig(spec *spec.Spec) *define.Insp // leak. ctrConfig.Healthcheck = c.config.HealthCheckConfig + ctrConfig.HealthcheckOnFailureAction = c.config.HealthCheckOnFailureAction.String() + ctrConfig.CreateCommand = c.config.CreateCommand ctrConfig.Timezone = c.config.Timezone diff --git a/libpod/container_validate.go b/libpod/container_validate.go index da33f6db7..f4611ecce 100644 --- a/libpod/container_validate.go +++ b/libpod/container_validate.go @@ -137,5 +137,9 @@ func (c *Container) validate() error { if c.config.SdNotifyMode == define.SdNotifyModeIgnore && len(c.config.SdNotifySocket) > 0 { return fmt.Errorf("cannot set sd-notify socket %q with sd-notify mode %q", c.config.SdNotifySocket, c.config.SdNotifyMode) } + + if c.config.HealthCheckOnFailureAction != define.HealthCheckOnFailureActionNone && c.config.HealthCheckConfig == nil { + return fmt.Errorf("cannot set on-failure action to %s without a health check", c.config.HealthCheckOnFailureAction.String()) + } return nil } diff --git a/libpod/define/container_inspect.go b/libpod/define/container_inspect.go index 5982d684c..da5c58f27 100644 --- a/libpod/define/container_inspect.go +++ b/libpod/define/container_inspect.go @@ -55,6 +55,8 @@ type InspectContainerConfig struct { StopSignal uint `json:"StopSignal"` // Configured healthcheck for the container Healthcheck *manifest.Schema2HealthConfig `json:"Healthcheck,omitempty"` + // HealthcheckOnFailureAction defines an action to take once the container turns unhealthy. + HealthcheckOnFailureAction string `json:"HealthcheckOnFailureAction,omitempty"` // CreateCommand is the full command plus arguments of the process the // container has been created with. CreateCommand []string `json:"CreateCommand,omitempty"` diff --git a/libpod/define/healthchecks.go b/libpod/define/healthchecks.go index f71274350..274e02561 100644 --- a/libpod/define/healthchecks.go +++ b/libpod/define/healthchecks.go @@ -1,5 +1,10 @@ package define +import ( + "fmt" + "strings" +) + const ( // HealthCheckHealthy describes a healthy container HealthCheckHealthy string = "healthy" @@ -57,3 +62,72 @@ const ( // HealthConfigTestCmdShell runs commands with the system's default shell HealthConfigTestCmdShell = "CMD-SHELL" ) + +// HealthCheckOnFailureAction defines how Podman reacts when a container's health +// status turns unhealthy. +type HealthCheckOnFailureAction int + +// Healthcheck on-failure actions. +const ( + // HealthCheckOnFailureActionNonce instructs Podman to not react on an unhealthy status. + HealthCheckOnFailureActionNone = iota // Must be first iota for backwards compatibility + // HealthCheckOnFailureActionInvalid denotes an invalid on-failure policy. + HealthCheckOnFailureActionInvalid = iota + // HealthCheckOnFailureActionNonce instructs Podman to kill the container on an unhealthy status. + HealthCheckOnFailureActionKill = iota + // HealthCheckOnFailureActionNonce instructs Podman to restart the container on an unhealthy status. + HealthCheckOnFailureActionRestart = iota + // HealthCheckOnFailureActionNonce instructs Podman to stop the container on an unhealthy status. + HealthCheckOnFailureActionStop = iota +) + +// String representations for on-failure actions. +const ( + strHealthCheckOnFailureActionNone = "none" + strHealthCheckOnFailureActionInvalid = "invalid" + strHealthCheckOnFailureActionKill = "kill" + strHealthCheckOnFailureActionRestart = "restart" + strHealthCheckOnFailureActionStop = "stop" +) + +// SupportedHealthCheckOnFailureActions lists all supported healthcheck restart policies. +var SupportedHealthCheckOnFailureActions = []string{ + strHealthCheckOnFailureActionNone, + strHealthCheckOnFailureActionKill, + strHealthCheckOnFailureActionRestart, + strHealthCheckOnFailureActionStop, +} + +// String returns the string representation of the HealthCheckOnFailureAction. +func (h HealthCheckOnFailureAction) String() string { + switch h { + case HealthCheckOnFailureActionNone: + return strHealthCheckOnFailureActionNone + case HealthCheckOnFailureActionKill: + return strHealthCheckOnFailureActionKill + case HealthCheckOnFailureActionRestart: + return strHealthCheckOnFailureActionRestart + case HealthCheckOnFailureActionStop: + return strHealthCheckOnFailureActionStop + default: + return strHealthCheckOnFailureActionInvalid + } +} + +// ParseHealthCheckOnFailureAction parses the specified string into a HealthCheckOnFailureAction. +// An error is returned for an invalid input. +func ParseHealthCheckOnFailureAction(s string) (HealthCheckOnFailureAction, error) { + switch s { + case "", strHealthCheckOnFailureActionNone: + return HealthCheckOnFailureActionNone, nil + case strHealthCheckOnFailureActionKill: + return HealthCheckOnFailureActionKill, nil + case strHealthCheckOnFailureActionRestart: + return HealthCheckOnFailureActionRestart, nil + case strHealthCheckOnFailureActionStop: + return HealthCheckOnFailureActionStop, nil + default: + err := fmt.Errorf("invalid on-failure action %q for health check: supported actions are %s", s, strings.Join(SupportedHealthCheckOnFailureActions, ",")) + return HealthCheckOnFailureActionInvalid, err + } +} diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 9b9d12b17..e835af9f0 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -2,6 +2,7 @@ package libpod import ( "bufio" + "context" "errors" "fmt" "io/ioutil" @@ -12,6 +13,7 @@ import ( "github.com/containers/podman/v4/libpod/define" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) const ( @@ -29,9 +31,14 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { if err != nil { return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err) } + hcStatus, err := checkHealthCheckCanBeRun(container) if err == nil { - return container.runHealthCheck() + hcStatus, err := container.runHealthCheck() + if err := container.processHealthCheckStatus(hcStatus); err != nil { + return hcStatus, err + } + return hcStatus, err } return hcStatus, err } @@ -127,13 +134,45 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) { hcResult = define.HealthCheckFailure hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String()) } + hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog) if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil { return hcResult, fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err) } + return hcResult, hcErr } +func (c *Container) processHealthCheckStatus(status define.HealthCheckStatus) error { + if status == define.HealthCheckSuccess { + return nil + } + + switch c.config.HealthCheckOnFailureAction { + case define.HealthCheckOnFailureActionNone: // Nothing to do + + case define.HealthCheckOnFailureActionKill: + if err := c.Kill(uint(unix.SIGKILL)); err != nil { + return fmt.Errorf("killing container health-check turned unhealthy: %w", err) + } + + case define.HealthCheckOnFailureActionRestart: + if err := c.RestartWithTimeout(context.Background(), c.config.StopTimeout); err != nil { + return fmt.Errorf("restarting container after health-check turned unhealthy: %w", err) + } + + case define.HealthCheckOnFailureActionStop: + if err := c.Stop(); err != nil { + return fmt.Errorf("stopping container after health-check turned unhealthy: %w", err) + } + + default: // Should not happen but better be safe than sorry + return fmt.Errorf("unsupported on-failure action %d", c.config.HealthCheckOnFailureAction) + } + + return nil +} + func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) { cstate, err := c.State() if err != nil { diff --git a/libpod/options.go b/libpod/options.go index 56d5265d2..71ad3d11e 100644 --- a/libpod/options.go +++ b/libpod/options.go @@ -1473,6 +1473,17 @@ func WithHealthCheck(healthCheck *manifest.Schema2HealthConfig) CtrCreateOption } } +// WithHealthCheckOnFailureAction adds an on-failure action to health-check config +func WithHealthCheckOnFailureAction(action define.HealthCheckOnFailureAction) CtrCreateOption { + return func(ctr *Container) error { + if ctr.valid { + return define.ErrCtrFinalized + } + ctr.config.HealthCheckOnFailureAction = action + return nil + } +} + // WithPreserveFDs forwards from the process running Libpod into the container // the given number of extra FDs (starting after the standard streams) to the created container func WithPreserveFDs(fd uint) CtrCreateOption { -- cgit v1.2.3-54-g00ecf