author    baude <bbaude@redhat.com>  2019-03-14 15:14:18 -0500
committer baude <bbaude@redhat.com>  2019-03-22 14:58:44 -0500
commit    bb69004b8c799763b5e47f2e6b0f5bd77f6ca726 (patch)
tree      63cf1b7421d7aaf0eec3a8cd60b04ea6dd642e7e
parent    0458daf13d50e45bf9ca0ac03449c6ea60b2ef64 (diff)
podman health check phase3
podman will now start a transient service and timer for healthchecks; this handles the tracking of the timing for health checks. added the 'starting' status, which represents the time that a container is in its start-period. the systemd timing can be disabled with the env variable DISABLE_HC_SYSTEMD="true". added a filter for ps, so --filter health=[starting, healthy, unhealthy] can now be used.

Signed-off-by: baude <bbaude@redhat.com>
-rw-r--r--  cmd/podman/common.go               2
-rw-r--r--  cmd/podman/ps.go                   8
-rw-r--r--  cmd/podman/shared/create.go       10
-rw-r--r--  docs/podman-ps.1.md                1
-rw-r--r--  libpod/container_internal.go      22
-rw-r--r--  libpod/healthcheck.go            145
-rw-r--r--  test/e2e/common_test.go           10
-rw-r--r--  test/e2e/healthcheck_run_test.go  94
8 files changed, 272 insertions, 20 deletions
diff --git a/cmd/podman/common.go b/cmd/podman/common.go
index 8b42ed673..771738302 100644
--- a/cmd/podman/common.go
+++ b/cmd/podman/common.go
@@ -293,7 +293,7 @@ func getCreateFlags(c *cliconfig.PodmanCommand) {
)
createFlags.String(
"healthcheck-interval", "30s",
- "set an interval for the healthchecks",
+ "set an interval for the healthchecks (a value of disable results in no automatic timer setup)",
)
createFlags.Uint(
"healthcheck-retries", 3,
diff --git a/cmd/podman/ps.go b/cmd/podman/ps.go
index 20638b424..27774f95d 100644
--- a/cmd/podman/ps.go
+++ b/cmd/podman/ps.go
@@ -494,6 +494,14 @@ func generateContainerFilterFuncs(filter, filterValue string, runtime *libpod.Runtime)
}
return false
}, nil
+ case "health":
+ return func(c *libpod.Container) bool {
+ hcStatus, err := c.HealthCheckStatus()
+ if err != nil {
+ return false
+ }
+ return hcStatus == filterValue
+ }, nil
}
return nil, errors.Errorf("%s is an invalid filter", filter)
}
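The new case returns a predicate closure, like the other ps filters; matching is a plain string comparison against the container's current healthcheck status. A self-contained sketch of the same pattern, where the container struct and sample data are stand-ins rather than libpod types:

```go
package main

import "fmt"

// container is a stand-in for libpod.Container in this sketch.
type container struct {
	id     string
	health string
}

// healthFilter mirrors the closure added above: it matches containers
// whose current healthcheck status equals filterValue.
func healthFilter(filterValue string) func(container) bool {
	return func(c container) bool {
		return c.health == filterValue
	}
}

func main() {
	ctrs := []container{
		{"aaa", "healthy"},
		{"bbb", "unhealthy"},
		{"ccc", "starting"},
	}
	match := healthFilter("unhealthy")
	for _, c := range ctrs {
		if match(c) {
			fmt.Println(c.id) // prints: bbb
		}
	}
}
```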
diff --git a/cmd/podman/shared/create.go b/cmd/podman/shared/create.go
index 55eb3ce83..5ce0b8865 100644
--- a/cmd/podman/shared/create.go
+++ b/cmd/podman/shared/create.go
@@ -868,21 +868,21 @@ func makeHealthCheckFromCli(c *cliconfig.PodmanCommand) (*manifest.Schema2HealthConfig, error)
hc := manifest.Schema2HealthConfig{
Test: cmd,
}
+
+ if inInterval == "disable" {
+ inInterval = "0"
+ }
intervalDuration, err := time.ParseDuration(inInterval)
if err != nil {
return nil, errors.Wrapf(err, "invalid healthcheck-interval %s ", inInterval)
}
- if intervalDuration < time.Duration(time.Second*1) {
- return nil, errors.New("healthcheck-interval must be at least 1 second")
- }
-
hc.Interval = intervalDuration
if inRetries < 1 {
return nil, errors.New("healthcheck-retries must be greater than 0.")
}
-
+ hc.Retries = int(inRetries)
timeoutDuration, err := time.ParseDuration(inTimeout)
if err != nil {
return nil, errors.Wrapf(err, "invalid healthcheck-timeout %s", inTimeout)
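The removed 1-second floor is what previously rejected an interval of "disable"; mapping it to "0" lets time.ParseDuration produce a zero duration, which disableHealthCheckSystemd (added further down) treats as "no timer". A runnable check of that mapping, with illustrative inputs:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// "disable" is rewritten to "0" before parsing, exactly as in the
	// hunk above; ParseDuration accepts "0" and returns a zero duration.
	for _, in := range []string{"30s", "disable"} {
		if in == "disable" {
			in = "0"
		}
		d, err := time.ParseDuration(in)
		if err != nil {
			fmt.Println("invalid healthcheck-interval:", err)
			continue
		}
		fmt.Println(d) // 30s, then 0s
	}
}
```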
diff --git a/docs/podman-ps.1.md b/docs/podman-ps.1.md
index 811fbbc2f..685a52bda 100644
--- a/docs/podman-ps.1.md
+++ b/docs/podman-ps.1.md
@@ -100,6 +100,7 @@ Valid filters are listed below:
| before | [ID] or [Name] Containers created before this container |
| since | [ID] or [Name] Containers created since this container |
| volume | [VolumeName] or [MountpointDestination] Volume mounted in container |
+| health | [Status] starting, healthy or unhealthy |
**--help**, **-h**
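The new row is exercised from the command line as below; the exact output depends on the containers present (commands illustrative):

	podman ps --filter health=unhealthy
	podman ps -a --filter health=starting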
diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index 13e660dc3..7a90bc7d4 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -833,6 +833,12 @@ func (c *Container) init(ctx context.Context) error {
if err := c.save(); err != nil {
return err
}
+ if c.config.HealthCheckConfig != nil {
+ if err := c.createTimer(); err != nil {
+ logrus.Error(err)
+ }
+ }
+
defer c.newContainerEvent(events.Init)
return c.completeNetworkSetup()
}
@@ -956,6 +962,15 @@ func (c *Container) start() error {
c.state.State = ContainerStateRunning
+ if c.config.HealthCheckConfig != nil {
+ if err := c.updateHealthStatus(HealthCheckStarting); err != nil {
+ logrus.Error(err)
+ }
+ if err := c.startTimer(); err != nil {
+ logrus.Error(err)
+ }
+ }
+
defer c.newContainerEvent(events.Start)
return c.save()
@@ -1123,6 +1138,13 @@ func (c *Container) cleanup(ctx context.Context) error {
logrus.Debugf("Cleaning up container %s", c.ID())
+ // Remove healthcheck unit/timer file if it exists
+ if c.config.HealthCheckConfig != nil {
+ if err := c.removeTimer(); err != nil {
+ logrus.Error(err)
+ }
+ }
+
// Clean up network namespace, if present
if err := c.cleanupNetwork(); err != nil {
lastError = err
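Taken together, the three hunks above tie the timer to the container lifecycle: init() writes the transient unit, start() marks the container "starting" and arms the timer, and cleanup() removes both again. A stand-alone sketch of that ordering; the ctr type and printed strings are illustrative, not libpod API:

```go
package main

import "fmt"

// ctr stands in for libpod.Container; each method echoes one hook added above.
type ctr struct{ id string }

func (c ctr) createTimer() { fmt.Println("init():    write transient unit/timer for", c.id) }
func (c ctr) startTimer()  { fmt.Println("start():   status=starting, arm timer for", c.id) }
func (c ctr) removeTimer() { fmt.Println("cleanup(): stop timer, remove unit for", c.id) }

func main() {
	c := ctr{id: "bb69004b"}
	c.createTimer() // called from (*Container).init
	c.startTimer()  // called from (*Container).start
	c.removeTimer() // called from (*Container).cleanup
}
```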
diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go
index d2c0ea0fb..d8f56860b 100644
--- a/libpod/healthcheck.go
+++ b/libpod/healthcheck.go
@@ -3,13 +3,16 @@ package libpod
import (
"bufio"
"bytes"
+ "fmt"
"io/ioutil"
"os"
+ "os/exec"
"path/filepath"
"strings"
"time"
"github.com/containers/libpod/pkg/inspect"
+ "github.com/coreos/go-systemd/dbus"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
@@ -47,6 +50,10 @@ const (
HealthCheckHealthy string = "healthy"
// HealthCheckUnhealthy describes an unhealthy container
HealthCheckUnhealthy string = "unhealthy"
+ // HealthCheckStarting describes the time between when the container starts
+ // and the start-period (time allowed for the container to start and application
+ // to be running) expires.
+ HealthCheckStarting string = "starting"
)
// hcWriteCloser allows us to use bufio as a WriteCloser
@@ -68,17 +75,18 @@ func (r *Runtime) HealthCheck(name string) (HealthCheckStatus, error) {
}
hcStatus, err := checkHealthCheckCanBeRun(container)
if err == nil {
- return container.RunHealthCheck()
+ return container.runHealthCheck()
}
return hcStatus, err
}
-// RunHealthCheck runs the health check as defined by the container
-func (c *Container) RunHealthCheck() (HealthCheckStatus, error) {
+// runHealthCheck runs the health check as defined by the container
+func (c *Container) runHealthCheck() (HealthCheckStatus, error) {
var (
- newCommand []string
- returnCode int
- capture bytes.Buffer
+ newCommand []string
+ returnCode int
+ capture bytes.Buffer
+ inStartPeriod bool
)
hcStatus, err := checkHealthCheckCanBeRun(c)
if err != nil {
@@ -111,12 +119,28 @@ func (c *Container) RunHealthCheck() (HealthCheckStatus, error) {
returnCode = 1
}
timeEnd := time.Now()
+ if c.HealthCheckConfig().StartPeriod > 0 {
+ // there is a start-period we need to honor; we add startPeriod to container start time
+ startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
+ if timeStart.Before(startPeriodTime) {
+ // we are still in the start period, flip the inStartPeriod bool
+ inStartPeriod = true
+ logrus.Debugf("healthcheck for %s being run in start-period", c.ID())
+ }
+ }
+
eventLog := capture.String()
if len(eventLog) > MaxHealthCheckLogLength {
eventLog = eventLog[:MaxHealthCheckLogLength]
}
+
+ if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
+ returnCode = -1
+ hcResult = HealthCheckFailure
+ hcErr = errors.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
+ }
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
- if err := c.updateHealthCheckLog(hcl); err != nil {
+ if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil {
return hcResult, errors.Wrapf(err, "unable to update health check log %s for %s", c.healthCheckLogPath(), c.ID())
}
return hcResult, hcErr
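The start-period gate above is plain time arithmetic: a check that begins before StartedTime + StartPeriod is still inside the grace window. A runnable sketch of the same comparison, with illustrative times:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	startedTime := time.Now().Add(-30 * time.Second) // container started 30s ago
	startPeriod := 2 * time.Minute                   // e.g. --healthcheck-start-period 2m
	timeStart := time.Now()                          // when this check began

	// Same test as the hunk above: still in the start period?
	inStartPeriod := timeStart.Before(startedTime.Add(startPeriod))
	fmt.Println(inStartPeriod) // true: 30s elapsed < 2m start-period
}
```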
@@ -145,8 +169,23 @@ func newHealthCheckLog(start, end time.Time, exitCode int, log string) inspect.H
}
}
+// updateHealthStatus updates the health status of the container
+// in the healthcheck log
+func (c *Container) updateHealthStatus(status string) error {
+ healthCheck, err := c.GetHealthCheckLog()
+ if err != nil {
+ return err
+ }
+ healthCheck.Status = status
+ newResults, err := json.Marshal(healthCheck)
+ if err != nil {
+ return errors.Wrapf(err, "unable to marshal healthchecks for writing status")
+ }
+ return ioutil.WriteFile(c.healthCheckLogPath(), newResults, 0700)
+}
+
// updateHealthCheckLog parses the health check results and writes the log
-func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog) error {
+func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog, inStartPeriod bool) error {
healthCheck, err := c.GetHealthCheckLog()
if err != nil {
return err
@@ -159,11 +198,13 @@ func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog) error {
if len(healthCheck.Status) < 1 {
healthCheck.Status = HealthCheckHealthy
}
- // increment failing streak
- healthCheck.FailingStreak = healthCheck.FailingStreak + 1
- // if failing streak > retries, then status to unhealthy
- if int(healthCheck.FailingStreak) > c.HealthCheckConfig().Retries {
- healthCheck.Status = HealthCheckUnhealthy
+ if !inStartPeriod {
+ // increment failing streak
+ healthCheck.FailingStreak = healthCheck.FailingStreak + 1
+ // if failing streak > retries, then status to unhealthy
+ if int(healthCheck.FailingStreak) >= c.HealthCheckConfig().Retries {
+ healthCheck.Status = HealthCheckUnhealthy
+ }
}
}
healthCheck.Log = append(healthCheck.Log, hcl)
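A toy model of the streak rules above: failures inside the start period leave the streak untouched, a streak reaching the retry count flips the status to unhealthy, and (behavior inferred from the e2e tests below, not from this hunk) a single success flips back to healthy. Values are illustrative:

```go
package main

import "fmt"

func main() {
	const retries = 2
	status, streak := "starting", 0
	checks := []struct{ failed, inStartPeriod bool }{
		{true, true},   // fails in start-period: streak stays 0
		{true, false},  // streak 1
		{true, false},  // streak 2 >= retries: unhealthy
		{false, false}, // success: healthy again, streak reset
	}
	for _, c := range checks {
		if !c.failed {
			status, streak = "healthy", 0
			continue
		}
		if c.inStartPeriod {
			continue
		}
		streak++
		if streak >= retries {
			status = "unhealthy"
		}
	}
	fmt.Println(status, streak) // healthy 0
}
```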
@@ -199,3 +240,81 @@ func (c *Container) GetHealthCheckLog() (inspect.HealthCheckResults, error) {
}
return healthCheck, nil
}
+
+// createTimer creates a transient systemd timer and unit for the healthchecks of a container
+func (c *Container) createTimer() error {
+ if c.disableHealthCheckSystemd() {
+ return nil
+ }
+ podman, err := os.Executable()
+ if err != nil {
+ return errors.Wrapf(err, "failed to get path for podman for a health check timer")
+ }
+
+ var cmd = []string{"--unit", fmt.Sprintf("%s", c.ID()), fmt.Sprintf("--on-unit-inactive=%s", c.HealthCheckConfig().Interval.String()), "--timer-property=AccuracySec=1s", podman, "healthcheck", "run", c.ID()}
+
+ conn, err := dbus.NewSystemdConnection()
+ if err != nil {
+ return errors.Wrapf(err, "unable to get systemd connection to add healthchecks")
+ }
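+ // the connection is opened and closed only to confirm systemd is reachable over dbus; the unit itself is created by shelling out to systemd-run below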
+ conn.Close()
+ logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd)
+ systemdRun := exec.Command("systemd-run", cmd...)
+ _, err = systemdRun.CombinedOutput()
+ if err != nil {
+ return err
+ }
+ return nil
+}
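For a 30s interval the assembled command comes out roughly as follows; the container ID is abbreviated here, and the podman path is whatever os.Executable() returned:

	systemd-run --unit bb69004b --on-unit-inactive=30s --timer-property=AccuracySec=1s /usr/bin/podman healthcheck run bb69004b

systemd-run's --on-unit-inactive creates a transient .timer/.service pair and re-arms the timer each time the generated service goes inactive, so checks repeat one interval after the previous run finishes.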
+
+// startTimer starts a systemd timer for the healthchecks
+func (c *Container) startTimer() error {
+ if c.disableHealthCheckSystemd() {
+ return nil
+ }
+ conn, err := dbus.NewSystemdConnection()
+ if err != nil {
+ return errors.Wrapf(err, "unable to get systemd connection to start healthchecks")
+ }
+ defer conn.Close()
+ _, err = conn.StartUnit(fmt.Sprintf("%s.service", c.ID()), "fail", nil)
+ return err
+}
+
+// removeTimer removes the systemd timer and unit files
+// for the container
+func (c *Container) removeTimer() error {
+ if c.disableHealthCheckSystemd() {
+ return nil
+ }
+ conn, err := dbus.NewSystemdConnection()
+ if err != nil {
+ return errors.Wrapf(err, "unable to get systemd connection to remove healthchecks")
+ }
+ defer conn.Close()
+ serviceFile := fmt.Sprintf("%s.timer", c.ID())
+ _, err = conn.StopUnit(serviceFile, "fail", nil)
+ return err
+}
+
+// HealthCheckStatus returns the current state of a container with a healthcheck
+func (c *Container) HealthCheckStatus() (string, error) {
+ if !c.HasHealthCheck() {
+ return "", errors.Errorf("container %s has no defined healthcheck", c.ID())
+ }
+ results, err := c.GetHealthCheckLog()
+ if err != nil {
+ return "", errors.Wrapf(err, "unable to get healthcheck log for %s", c.ID())
+ }
+ return results.Status, nil
+}
+
+func (c *Container) disableHealthCheckSystemd() bool {
+ if os.Getenv("DISABLE_HC_SYSTEMD") == "true" {
+ return true
+ }
+ if c.config.HealthCheckConfig.Interval == 0 {
+ return true
+ }
+ return false
+}
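This switch is what the e2e suite flips below. Either of the following keeps the healthcheck metadata but skips the transient timer (image and command illustrative):

	DISABLE_HC_SYSTEMD=true podman run -dt --healthcheck-command 'CMD-SHELL ls /foo' alpine top
	podman run -dt --healthcheck-interval disable --healthcheck-command 'CMD-SHELL ls /foo' alpine top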
diff --git a/test/e2e/common_test.go b/test/e2e/common_test.go
index 54b2cbec2..b20b3b37e 100644
--- a/test/e2e/common_test.go
+++ b/test/e2e/common_test.go
@@ -239,7 +239,7 @@ func PodmanTestCreateUtil(tempDir string, remote bool) *PodmanTestIntegration {
ociRuntime = "/usr/bin/runc"
}
}
-
+ os.Setenv("DISABLE_HC_SYSTEMD", "true")
CNIConfigDir := "/etc/cni/net.d"
p := &PodmanTestIntegration{
@@ -314,6 +314,14 @@ func (s *PodmanSessionIntegration) InspectImageJSON() []inspect.ImageData {
return i
}
+// InspectContainer returns a container's inspect data in JSON format
+func (p *PodmanTestIntegration) InspectContainer(name string) []inspect.ContainerData {
+ cmd := []string{"inspect", name}
+ session := p.Podman(cmd)
+ session.WaitWithDefaultTimeout()
+ return session.InspectContainerToJSON()
+}
+
func processTestResult(f GinkgoTestDescription) {
tr := testResult{length: f.Duration.Seconds(), name: f.TestText}
testResults = append(testResults, tr)
diff --git a/test/e2e/healthcheck_run_test.go b/test/e2e/healthcheck_run_test.go
index f178e8ad5..ec97fdf4a 100644
--- a/test/e2e/healthcheck_run_test.go
+++ b/test/e2e/healthcheck_run_test.go
@@ -83,4 +83,98 @@ var _ = Describe("Podman healthcheck run", func() {
hc.WaitWithDefaultTimeout()
Expect(hc.ExitCode()).To(Equal(125))
})
+
+ It("podman healthcheck should be starting", func() {
+ session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL ls /foo || exit 1\"", ALPINE, "top"})
+ session.WaitWithDefaultTimeout()
+ Expect(session.ExitCode()).To(Equal(0))
+ inspect := podmanTest.InspectContainer("hc")
+ Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting"))
+ })
+
+ It("podman healthcheck failed checks in start-period should not change status", func() {
+ session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-start-period", "2m", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL ls /foo || exit 1\"", ALPINE, "top"})
+ session.WaitWithDefaultTimeout()
+ Expect(session.ExitCode()).To(Equal(0))
+
+ hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"})
+ hc.WaitWithDefaultTimeout()
+ Expect(hc.ExitCode()).To(Equal(1))
+
+ hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"})
+ hc.WaitWithDefaultTimeout()
+ Expect(hc.ExitCode()).To(Equal(1))
+
+ hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"})
+ hc.WaitWithDefaultTimeout()
+ Expect(hc.ExitCode()).To(Equal(1))
+
+ inspect := podmanTest.InspectContainer("hc")
+ Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting"))
+ })
+
+ It("podman healthcheck failed checks must reach retries before unhealthy ", func() {
+ session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL ls /foo || exit 1\"", ALPINE, "top"})
+ session.WaitWithDefaultTimeout()
+ Expect(session.ExitCode()).To(Equal(0))
+
+ hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"})
+ hc.WaitWithDefaultTimeout()
+ Expect(hc.ExitCode()).To(Equal(1))
+
+ inspect := podmanTest.InspectContainer("hc")
+ Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting"))
+
+ hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"})
+ hc.WaitWithDefaultTimeout()
+ Expect(hc.ExitCode()).To(Equal(1))
+
+ inspect = podmanTest.InspectContainer("hc")
+ Expect(inspect[0].State.Healthcheck.Status).To(Equal("unhealthy"))
+
+ })
+
+ It("podman healthcheck good check results in healthy even in start-period", func() {
+ session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-start-period", "2m", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL\" \"ls\" \"||\" \"exit\" \"1\"", ALPINE, "top"})
+ session.WaitWithDefaultTimeout()
+ Expect(session.ExitCode()).To(Equal(0))
+
+ hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"})
+ hc.WaitWithDefaultTimeout()
+ Expect(hc.ExitCode()).To(Equal(0))
+
+ inspect := podmanTest.InspectContainer("hc")
+ Expect(inspect[0].State.Healthcheck.Status).To(Equal("healthy"))
+ })
+
+ It("podman healthcheck single healthy result changes failed to healthy", func() {
+ session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL\" \"ls\" \"/foo\" \"||\" \"exit\" \"1\"", ALPINE, "top"})
+ session.WaitWithDefaultTimeout()
+ Expect(session.ExitCode()).To(Equal(0))
+
+ hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"})
+ hc.WaitWithDefaultTimeout()
+ Expect(hc.ExitCode()).To(Equal(1))
+
+ inspect := podmanTest.InspectContainer("hc")
+ Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting"))
+
+ hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"})
+ hc.WaitWithDefaultTimeout()
+ Expect(hc.ExitCode()).To(Equal(1))
+
+ inspect = podmanTest.InspectContainer("hc")
+ Expect(inspect[0].State.Healthcheck.Status).To(Equal("unhealthy"))
+
+ foo := podmanTest.Podman([]string{"exec", "hc", "touch", "/foo"})
+ foo.WaitWithDefaultTimeout()
+ Expect(foo.ExitCode()).To(BeZero())
+
+ hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"})
+ hc.WaitWithDefaultTimeout()
+ Expect(hc.ExitCode()).To(Equal(0))
+
+ inspect = podmanTest.InspectContainer("hc")
+ Expect(inspect[0].State.Healthcheck.Status).To(Equal("healthy"))
+ })
})