summaryrefslogtreecommitdiff
path: root/libpod/healthcheck.go
diff options
context:
space:
mode:
Diffstat (limited to 'libpod/healthcheck.go')
-rw-r--r--libpod/healthcheck.go320
1 files changed, 320 insertions, 0 deletions
diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go
new file mode 100644
index 000000000..d8f56860b
--- /dev/null
+++ b/libpod/healthcheck.go
@@ -0,0 +1,320 @@
+package libpod
+
+import (
+ "bufio"
+ "bytes"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strings"
+ "time"
+
+ "github.com/containers/libpod/pkg/inspect"
+ "github.com/coreos/go-systemd/dbus"
+ "github.com/pkg/errors"
+ "github.com/sirupsen/logrus"
+)
+
+// HealthCheckStatus represents the current state of a container
+type HealthCheckStatus int
+
+const (
+ // HealthCheckSuccess means the health worked
+ HealthCheckSuccess HealthCheckStatus = iota
+ // HealthCheckFailure means the health ran and failed
+ HealthCheckFailure HealthCheckStatus = iota
+ // HealthCheckContainerStopped means the health check cannot
+ // be run because the container is stopped
+ HealthCheckContainerStopped HealthCheckStatus = iota
+ // HealthCheckContainerNotFound means the container could
+ // not be found in local store
+ HealthCheckContainerNotFound HealthCheckStatus = iota
+ // HealthCheckNotDefined means the container has no health
+ // check defined in it
+ HealthCheckNotDefined HealthCheckStatus = iota
+ // HealthCheckInternalError means somes something failed obtaining or running
+ // a given health check
+ HealthCheckInternalError HealthCheckStatus = iota
+ // HealthCheckDefined means the healthcheck was found on the container
+ HealthCheckDefined HealthCheckStatus = iota
+
+ // MaxHealthCheckNumberLogs is the maximum number of attempts we keep
+ // in the healtcheck history file
+ MaxHealthCheckNumberLogs int = 5
+ // MaxHealthCheckLogLength in characters
+ MaxHealthCheckLogLength = 500
+
+ // HealthCheckHealthy describes a healthy container
+ HealthCheckHealthy string = "healthy"
+ // HealthCheckUnhealthy describes an unhealthy container
+ HealthCheckUnhealthy string = "unhealthy"
+ // HealthCheckStarting describes the time between when the container starts
+ // and the start-period (time allowed for the container to start and application
+ // to be running) expires.
+ HealthCheckStarting string = "starting"
+)
+
+// hcWriteCloser allows us to use bufio as a WriteCloser
+type hcWriteCloser struct {
+ *bufio.Writer
+}
+
+// Used to add a closer to bufio
+func (hcwc hcWriteCloser) Close() error {
+ return nil
+}
+
+// HealthCheck verifies the state and validity of the healthcheck configuration
+// on the container and then executes the healthcheck
+func (r *Runtime) HealthCheck(name string) (HealthCheckStatus, error) {
+ container, err := r.LookupContainer(name)
+ if err != nil {
+ return HealthCheckContainerNotFound, errors.Wrapf(err, "unable to lookup %s to perform a health check", name)
+ }
+ hcStatus, err := checkHealthCheckCanBeRun(container)
+ if err == nil {
+ return container.runHealthCheck()
+ }
+ return hcStatus, err
+}
+
+// runHealthCheck runs the health check as defined by the container
+func (c *Container) runHealthCheck() (HealthCheckStatus, error) {
+ var (
+ newCommand []string
+ returnCode int
+ capture bytes.Buffer
+ inStartPeriod bool
+ )
+ hcStatus, err := checkHealthCheckCanBeRun(c)
+ if err != nil {
+ return hcStatus, err
+ }
+ hcCommand := c.HealthCheckConfig().Test
+ if len(hcCommand) > 0 && hcCommand[0] == "CMD-SHELL" {
+ newCommand = []string{"sh", "-c", strings.Join(hcCommand[1:], " ")}
+ } else {
+ newCommand = hcCommand
+ }
+ captureBuffer := bufio.NewWriter(&capture)
+ hcw := hcWriteCloser{
+ captureBuffer,
+ }
+ streams := new(AttachStreams)
+ streams.OutputStream = hcw
+ streams.ErrorStream = hcw
+ streams.InputStream = os.Stdin
+ streams.AttachOutput = true
+ streams.AttachError = true
+ streams.AttachInput = true
+
+ logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
+ timeStart := time.Now()
+ hcResult := HealthCheckSuccess
+ hcErr := c.Exec(false, false, []string{}, newCommand, "", "", streams, 0)
+ if hcErr != nil {
+ hcResult = HealthCheckFailure
+ returnCode = 1
+ }
+ timeEnd := time.Now()
+ if c.HealthCheckConfig().StartPeriod > 0 {
+ // there is a start-period we need to honor; we add startPeriod to container start time
+ startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
+ if timeStart.Before(startPeriodTime) {
+ // we are still in the start period, flip the inStartPeriod bool
+ inStartPeriod = true
+ logrus.Debugf("healthcheck for %s being run in start-period", c.ID())
+ }
+ }
+
+ eventLog := capture.String()
+ if len(eventLog) > MaxHealthCheckLogLength {
+ eventLog = eventLog[:MaxHealthCheckLogLength]
+ }
+
+ if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
+ returnCode = -1
+ hcResult = HealthCheckFailure
+ hcErr = errors.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
+ }
+ hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
+ if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil {
+ return hcResult, errors.Wrapf(err, "unable to update health check log %s for %s", c.healthCheckLogPath(), c.ID())
+ }
+ return hcResult, hcErr
+}
+
+func checkHealthCheckCanBeRun(c *Container) (HealthCheckStatus, error) {
+ cstate, err := c.State()
+ if err != nil {
+ return HealthCheckInternalError, err
+ }
+ if cstate != ContainerStateRunning {
+ return HealthCheckContainerStopped, errors.Errorf("container %s is not running", c.ID())
+ }
+ if !c.HasHealthCheck() {
+ return HealthCheckNotDefined, errors.Errorf("container %s has no defined healthcheck", c.ID())
+ }
+ return HealthCheckDefined, nil
+}
+
+func newHealthCheckLog(start, end time.Time, exitCode int, log string) inspect.HealthCheckLog {
+ return inspect.HealthCheckLog{
+ Start: start.Format(time.RFC3339Nano),
+ End: end.Format(time.RFC3339Nano),
+ ExitCode: exitCode,
+ Output: log,
+ }
+}
+
+// updatedHealthCheckStatus updates the health status of the container
+// in the healthcheck log
+func (c *Container) updateHealthStatus(status string) error {
+ healthCheck, err := c.GetHealthCheckLog()
+ if err != nil {
+ return err
+ }
+ healthCheck.Status = status
+ newResults, err := json.Marshal(healthCheck)
+ if err != nil {
+ return errors.Wrapf(err, "unable to marshall healthchecks for writing status")
+ }
+ return ioutil.WriteFile(c.healthCheckLogPath(), newResults, 0700)
+}
+
+// UpdateHealthCheckLog parses the health check results and writes the log
+func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog, inStartPeriod bool) error {
+ healthCheck, err := c.GetHealthCheckLog()
+ if err != nil {
+ return err
+ }
+ if hcl.ExitCode == 0 {
+ // set status to healthy, reset failing state to 0
+ healthCheck.Status = HealthCheckHealthy
+ healthCheck.FailingStreak = 0
+ } else {
+ if len(healthCheck.Status) < 1 {
+ healthCheck.Status = HealthCheckHealthy
+ }
+ if !inStartPeriod {
+ // increment failing streak
+ healthCheck.FailingStreak = healthCheck.FailingStreak + 1
+ // if failing streak > retries, then status to unhealthy
+ if int(healthCheck.FailingStreak) >= c.HealthCheckConfig().Retries {
+ healthCheck.Status = HealthCheckUnhealthy
+ }
+ }
+ }
+ healthCheck.Log = append(healthCheck.Log, hcl)
+ if len(healthCheck.Log) > MaxHealthCheckNumberLogs {
+ healthCheck.Log = healthCheck.Log[1:]
+ }
+ newResults, err := json.Marshal(healthCheck)
+ if err != nil {
+ return errors.Wrapf(err, "unable to marshall healthchecks for writing")
+ }
+ return ioutil.WriteFile(c.healthCheckLogPath(), newResults, 0700)
+}
+
+// HealthCheckLogPath returns the path for where the health check log is
+func (c *Container) healthCheckLogPath() string {
+ return filepath.Join(filepath.Dir(c.LogPath()), "healthcheck.log")
+}
+
+// GetHealthCheckLog returns HealthCheck results by reading the container's
+// health check log file. If the health check log file does not exist, then
+// an empty healthcheck struct is returned
+func (c *Container) GetHealthCheckLog() (inspect.HealthCheckResults, error) {
+ var healthCheck inspect.HealthCheckResults
+ if _, err := os.Stat(c.healthCheckLogPath()); os.IsNotExist(err) {
+ return healthCheck, nil
+ }
+ b, err := ioutil.ReadFile(c.healthCheckLogPath())
+ if err != nil {
+ return healthCheck, errors.Wrapf(err, "failed to read health check log file %s", c.healthCheckLogPath())
+ }
+ if err := json.Unmarshal(b, &healthCheck); err != nil {
+ return healthCheck, errors.Wrapf(err, "failed to unmarshal existing healthcheck results in %s", c.healthCheckLogPath())
+ }
+ return healthCheck, nil
+}
+
+// createTimer systemd timers for healthchecks of a container
+func (c *Container) createTimer() error {
+ if c.disableHealthCheckSystemd() {
+ return nil
+ }
+ podman, err := os.Executable()
+ if err != nil {
+ return errors.Wrapf(err, "failed to get path for podman for a health check timer")
+ }
+
+ var cmd = []string{"--unit", fmt.Sprintf("%s", c.ID()), fmt.Sprintf("--on-unit-inactive=%s", c.HealthCheckConfig().Interval.String()), "--timer-property=AccuracySec=1s", podman, "healthcheck", "run", c.ID()}
+
+ conn, err := dbus.NewSystemdConnection()
+ if err != nil {
+ return errors.Wrapf(err, "unable to get systemd connection to add healthchecks")
+ }
+ conn.Close()
+ logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd)
+ systemdRun := exec.Command("systemd-run", cmd...)
+ _, err = systemdRun.CombinedOutput()
+ if err != nil {
+ return err
+ }
+ return nil
+}
+
+// startTimer starts a systemd timer for the healthchecks
+func (c *Container) startTimer() error {
+ if c.disableHealthCheckSystemd() {
+ return nil
+ }
+ conn, err := dbus.NewSystemdConnection()
+ if err != nil {
+ return errors.Wrapf(err, "unable to get systemd connection to start healthchecks")
+ }
+ defer conn.Close()
+ _, err = conn.StartUnit(fmt.Sprintf("%s.service", c.ID()), "fail", nil)
+ return err
+}
+
+// removeTimer removes the systemd timer and unit files
+// for the container
+func (c *Container) removeTimer() error {
+ if c.disableHealthCheckSystemd() {
+ return nil
+ }
+ conn, err := dbus.NewSystemdConnection()
+ if err != nil {
+ return errors.Wrapf(err, "unable to get systemd connection to remove healthchecks")
+ }
+ defer conn.Close()
+ serviceFile := fmt.Sprintf("%s.timer", c.ID())
+ _, err = conn.StopUnit(serviceFile, "fail", nil)
+ return err
+}
+
+// HealthCheckStatus returns the current state of a container with a healthcheck
+func (c *Container) HealthCheckStatus() (string, error) {
+ if !c.HasHealthCheck() {
+ return "", errors.Errorf("container %s has no defined healthcheck", c.ID())
+ }
+ results, err := c.GetHealthCheckLog()
+ if err != nil {
+ return "", errors.Wrapf(err, "unable to get healthcheck log for %s", c.ID())
+ }
+ return results.Status, nil
+}
+
+func (c *Container) disableHealthCheckSystemd() bool {
+ if os.Getenv("DISABLE_HC_SYSTEMD") == "true" {
+ return true
+ }
+ if c.config.HealthCheckConfig.Interval == 0 {
+ return true
+ }
+ return false
+}