diff options
Diffstat (limited to 'libpod/healthcheck.go')
-rw-r--r-- | libpod/healthcheck.go | 320 |
1 files changed, 320 insertions, 0 deletions
diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go new file mode 100644 index 000000000..d8f56860b --- /dev/null +++ b/libpod/healthcheck.go @@ -0,0 +1,320 @@ +package libpod + +import ( + "bufio" + "bytes" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/containers/libpod/pkg/inspect" + "github.com/coreos/go-systemd/dbus" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +// HealthCheckStatus represents the current state of a container +type HealthCheckStatus int + +const ( + // HealthCheckSuccess means the health worked + HealthCheckSuccess HealthCheckStatus = iota + // HealthCheckFailure means the health ran and failed + HealthCheckFailure HealthCheckStatus = iota + // HealthCheckContainerStopped means the health check cannot + // be run because the container is stopped + HealthCheckContainerStopped HealthCheckStatus = iota + // HealthCheckContainerNotFound means the container could + // not be found in local store + HealthCheckContainerNotFound HealthCheckStatus = iota + // HealthCheckNotDefined means the container has no health + // check defined in it + HealthCheckNotDefined HealthCheckStatus = iota + // HealthCheckInternalError means somes something failed obtaining or running + // a given health check + HealthCheckInternalError HealthCheckStatus = iota + // HealthCheckDefined means the healthcheck was found on the container + HealthCheckDefined HealthCheckStatus = iota + + // MaxHealthCheckNumberLogs is the maximum number of attempts we keep + // in the healtcheck history file + MaxHealthCheckNumberLogs int = 5 + // MaxHealthCheckLogLength in characters + MaxHealthCheckLogLength = 500 + + // HealthCheckHealthy describes a healthy container + HealthCheckHealthy string = "healthy" + // HealthCheckUnhealthy describes an unhealthy container + HealthCheckUnhealthy string = "unhealthy" + // HealthCheckStarting describes the time between when the container starts + // and the start-period (time allowed for the container to start and application + // to be running) expires. + HealthCheckStarting string = "starting" +) + +// hcWriteCloser allows us to use bufio as a WriteCloser +type hcWriteCloser struct { + *bufio.Writer +} + +// Used to add a closer to bufio +func (hcwc hcWriteCloser) Close() error { + return nil +} + +// HealthCheck verifies the state and validity of the healthcheck configuration +// on the container and then executes the healthcheck +func (r *Runtime) HealthCheck(name string) (HealthCheckStatus, error) { + container, err := r.LookupContainer(name) + if err != nil { + return HealthCheckContainerNotFound, errors.Wrapf(err, "unable to lookup %s to perform a health check", name) + } + hcStatus, err := checkHealthCheckCanBeRun(container) + if err == nil { + return container.runHealthCheck() + } + return hcStatus, err +} + +// runHealthCheck runs the health check as defined by the container +func (c *Container) runHealthCheck() (HealthCheckStatus, error) { + var ( + newCommand []string + returnCode int + capture bytes.Buffer + inStartPeriod bool + ) + hcStatus, err := checkHealthCheckCanBeRun(c) + if err != nil { + return hcStatus, err + } + hcCommand := c.HealthCheckConfig().Test + if len(hcCommand) > 0 && hcCommand[0] == "CMD-SHELL" { + newCommand = []string{"sh", "-c", strings.Join(hcCommand[1:], " ")} + } else { + newCommand = hcCommand + } + captureBuffer := bufio.NewWriter(&capture) + hcw := hcWriteCloser{ + captureBuffer, + } + streams := new(AttachStreams) + streams.OutputStream = hcw + streams.ErrorStream = hcw + streams.InputStream = os.Stdin + streams.AttachOutput = true + streams.AttachError = true + streams.AttachInput = true + + logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID()) + timeStart := time.Now() + hcResult := HealthCheckSuccess + hcErr := c.Exec(false, false, []string{}, newCommand, "", "", streams, 0) + if hcErr != nil { + hcResult = HealthCheckFailure + returnCode = 1 + } + timeEnd := time.Now() + if c.HealthCheckConfig().StartPeriod > 0 { + // there is a start-period we need to honor; we add startPeriod to container start time + startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod) + if timeStart.Before(startPeriodTime) { + // we are still in the start period, flip the inStartPeriod bool + inStartPeriod = true + logrus.Debugf("healthcheck for %s being run in start-period", c.ID()) + } + } + + eventLog := capture.String() + if len(eventLog) > MaxHealthCheckLogLength { + eventLog = eventLog[:MaxHealthCheckLogLength] + } + + if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout { + returnCode = -1 + hcResult = HealthCheckFailure + hcErr = errors.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String()) + } + hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog) + if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil { + return hcResult, errors.Wrapf(err, "unable to update health check log %s for %s", c.healthCheckLogPath(), c.ID()) + } + return hcResult, hcErr +} + +func checkHealthCheckCanBeRun(c *Container) (HealthCheckStatus, error) { + cstate, err := c.State() + if err != nil { + return HealthCheckInternalError, err + } + if cstate != ContainerStateRunning { + return HealthCheckContainerStopped, errors.Errorf("container %s is not running", c.ID()) + } + if !c.HasHealthCheck() { + return HealthCheckNotDefined, errors.Errorf("container %s has no defined healthcheck", c.ID()) + } + return HealthCheckDefined, nil +} + +func newHealthCheckLog(start, end time.Time, exitCode int, log string) inspect.HealthCheckLog { + return inspect.HealthCheckLog{ + Start: start.Format(time.RFC3339Nano), + End: end.Format(time.RFC3339Nano), + ExitCode: exitCode, + Output: log, + } +} + +// updatedHealthCheckStatus updates the health status of the container +// in the healthcheck log +func (c *Container) updateHealthStatus(status string) error { + healthCheck, err := c.GetHealthCheckLog() + if err != nil { + return err + } + healthCheck.Status = status + newResults, err := json.Marshal(healthCheck) + if err != nil { + return errors.Wrapf(err, "unable to marshall healthchecks for writing status") + } + return ioutil.WriteFile(c.healthCheckLogPath(), newResults, 0700) +} + +// UpdateHealthCheckLog parses the health check results and writes the log +func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog, inStartPeriod bool) error { + healthCheck, err := c.GetHealthCheckLog() + if err != nil { + return err + } + if hcl.ExitCode == 0 { + // set status to healthy, reset failing state to 0 + healthCheck.Status = HealthCheckHealthy + healthCheck.FailingStreak = 0 + } else { + if len(healthCheck.Status) < 1 { + healthCheck.Status = HealthCheckHealthy + } + if !inStartPeriod { + // increment failing streak + healthCheck.FailingStreak = healthCheck.FailingStreak + 1 + // if failing streak > retries, then status to unhealthy + if int(healthCheck.FailingStreak) >= c.HealthCheckConfig().Retries { + healthCheck.Status = HealthCheckUnhealthy + } + } + } + healthCheck.Log = append(healthCheck.Log, hcl) + if len(healthCheck.Log) > MaxHealthCheckNumberLogs { + healthCheck.Log = healthCheck.Log[1:] + } + newResults, err := json.Marshal(healthCheck) + if err != nil { + return errors.Wrapf(err, "unable to marshall healthchecks for writing") + } + return ioutil.WriteFile(c.healthCheckLogPath(), newResults, 0700) +} + +// HealthCheckLogPath returns the path for where the health check log is +func (c *Container) healthCheckLogPath() string { + return filepath.Join(filepath.Dir(c.LogPath()), "healthcheck.log") +} + +// GetHealthCheckLog returns HealthCheck results by reading the container's +// health check log file. If the health check log file does not exist, then +// an empty healthcheck struct is returned +func (c *Container) GetHealthCheckLog() (inspect.HealthCheckResults, error) { + var healthCheck inspect.HealthCheckResults + if _, err := os.Stat(c.healthCheckLogPath()); os.IsNotExist(err) { + return healthCheck, nil + } + b, err := ioutil.ReadFile(c.healthCheckLogPath()) + if err != nil { + return healthCheck, errors.Wrapf(err, "failed to read health check log file %s", c.healthCheckLogPath()) + } + if err := json.Unmarshal(b, &healthCheck); err != nil { + return healthCheck, errors.Wrapf(err, "failed to unmarshal existing healthcheck results in %s", c.healthCheckLogPath()) + } + return healthCheck, nil +} + +// createTimer systemd timers for healthchecks of a container +func (c *Container) createTimer() error { + if c.disableHealthCheckSystemd() { + return nil + } + podman, err := os.Executable() + if err != nil { + return errors.Wrapf(err, "failed to get path for podman for a health check timer") + } + + var cmd = []string{"--unit", fmt.Sprintf("%s", c.ID()), fmt.Sprintf("--on-unit-inactive=%s", c.HealthCheckConfig().Interval.String()), "--timer-property=AccuracySec=1s", podman, "healthcheck", "run", c.ID()} + + conn, err := dbus.NewSystemdConnection() + if err != nil { + return errors.Wrapf(err, "unable to get systemd connection to add healthchecks") + } + conn.Close() + logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd) + systemdRun := exec.Command("systemd-run", cmd...) + _, err = systemdRun.CombinedOutput() + if err != nil { + return err + } + return nil +} + +// startTimer starts a systemd timer for the healthchecks +func (c *Container) startTimer() error { + if c.disableHealthCheckSystemd() { + return nil + } + conn, err := dbus.NewSystemdConnection() + if err != nil { + return errors.Wrapf(err, "unable to get systemd connection to start healthchecks") + } + defer conn.Close() + _, err = conn.StartUnit(fmt.Sprintf("%s.service", c.ID()), "fail", nil) + return err +} + +// removeTimer removes the systemd timer and unit files +// for the container +func (c *Container) removeTimer() error { + if c.disableHealthCheckSystemd() { + return nil + } + conn, err := dbus.NewSystemdConnection() + if err != nil { + return errors.Wrapf(err, "unable to get systemd connection to remove healthchecks") + } + defer conn.Close() + serviceFile := fmt.Sprintf("%s.timer", c.ID()) + _, err = conn.StopUnit(serviceFile, "fail", nil) + return err +} + +// HealthCheckStatus returns the current state of a container with a healthcheck +func (c *Container) HealthCheckStatus() (string, error) { + if !c.HasHealthCheck() { + return "", errors.Errorf("container %s has no defined healthcheck", c.ID()) + } + results, err := c.GetHealthCheckLog() + if err != nil { + return "", errors.Wrapf(err, "unable to get healthcheck log for %s", c.ID()) + } + return results.Status, nil +} + +func (c *Container) disableHealthCheckSystemd() bool { + if os.Getenv("DISABLE_HC_SYSTEMD") == "true" { + return true + } + if c.config.HealthCheckConfig.Interval == 0 { + return true + } + return false +} |