From bb69004b8c799763b5e47f2e6b0f5bd77f6ca726 Mon Sep 17 00:00:00 2001 From: baude Date: Thu, 14 Mar 2019 15:14:18 -0500 Subject: podman health check phase3 podman will now start a transient service and timer for healthchecks. this handles the tracking of the timing for health checks. added the 'starting' status which represents the time that a container is in its start-period. the systemd timing can be disabled with an env variable of DISABLE_HC_SYSTEMD="true". added filter for ps where --filter health=[starting, healthy, unhealthy] can now be used. Signed-off-by: baude --- cmd/podman/common.go | 2 +- cmd/podman/ps.go | 8 +++ cmd/podman/shared/create.go | 10 +-- docs/podman-ps.1.md | 1 + libpod/container_internal.go | 22 ++++++ libpod/healthcheck.go | 145 +++++++++++++++++++++++++++++++++++---- test/e2e/common_test.go | 10 ++- test/e2e/healthcheck_run_test.go | 94 +++++++++++++++++++++++++ 8 files changed, 272 insertions(+), 20 deletions(-) diff --git a/cmd/podman/common.go b/cmd/podman/common.go index 8b42ed673..771738302 100644 --- a/cmd/podman/common.go +++ b/cmd/podman/common.go @@ -293,7 +293,7 @@ func getCreateFlags(c *cliconfig.PodmanCommand) { ) createFlags.String( "healthcheck-interval", "30s", - "set an interval for the healthchecks", + "set an interval for the healthchecks (a value of disable results in no automatic timer setup)", ) createFlags.Uint( "healthcheck-retries", 3, diff --git a/cmd/podman/ps.go b/cmd/podman/ps.go index 20638b424..27774f95d 100644 --- a/cmd/podman/ps.go +++ b/cmd/podman/ps.go @@ -494,6 +494,14 @@ func generateContainerFilterFuncs(filter, filterValue string, runtime *libpod.Ru } return false }, nil + case "health": + return func(c *libpod.Container) bool { + hcStatus, err := c.HealthCheckStatus() + if err != nil { + return false + } + return hcStatus == filterValue + }, nil } return nil, errors.Errorf("%s is an invalid filter", filter) } diff --git a/cmd/podman/shared/create.go b/cmd/podman/shared/create.go index
55eb3ce83..5ce0b8865 100644 --- a/cmd/podman/shared/create.go +++ b/cmd/podman/shared/create.go @@ -868,21 +868,21 @@ func makeHealthCheckFromCli(c *cliconfig.PodmanCommand) (*manifest.Schema2Health hc := manifest.Schema2HealthConfig{ Test: cmd, } + + if inInterval == "disable" { + inInterval = "0" + } intervalDuration, err := time.ParseDuration(inInterval) if err != nil { return nil, errors.Wrapf(err, "invalid healthcheck-interval %s ", inInterval) } - if intervalDuration < time.Duration(time.Second*1) { - return nil, errors.New("healthcheck-interval must be at least 1 second") - } - hc.Interval = intervalDuration if inRetries < 1 { return nil, errors.New("healthcheck-retries must be greater than 0.") } - + hc.Retries = int(inRetries) timeoutDuration, err := time.ParseDuration(inTimeout) if err != nil { return nil, errors.Wrapf(err, "invalid healthcheck-timeout %s", inTimeout) diff --git a/docs/podman-ps.1.md b/docs/podman-ps.1.md index 811fbbc2f..685a52bda 100644 --- a/docs/podman-ps.1.md +++ b/docs/podman-ps.1.md @@ -100,6 +100,7 @@ Valid filters are listed below: | before | [ID] or [Name] Containers created before this container | | since | [ID] or [Name] Containers created since this container | | volume | [VolumeName] or [MountpointDestination] Volume mounted in container | +| health | [Status] starting, healthy or unhealthy | **--help**, **-h** diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 13e660dc3..7a90bc7d4 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -833,6 +833,12 @@ func (c *Container) init(ctx context.Context) error { if err := c.save(); err != nil { return err } + if c.config.HealthCheckConfig != nil { + if err := c.createTimer(); err != nil { + logrus.Error(err) + } + } + defer c.newContainerEvent(events.Init) return c.completeNetworkSetup() } @@ -956,6 +962,15 @@ func (c *Container) start() error { c.state.State = ContainerStateRunning + if c.config.HealthCheckConfig != nil { + if err :=
c.updateHealthStatus(HealthCheckStarting); err != nil { + logrus.Error(err) + } + if err := c.startTimer(); err != nil { + logrus.Error(err) + } + } + defer c.newContainerEvent(events.Start) return c.save() @@ -1123,6 +1138,13 @@ func (c *Container) cleanup(ctx context.Context) error { logrus.Debugf("Cleaning up container %s", c.ID()) + // Remove healthcheck unit/timer file if it exists + if c.config.HealthCheckConfig != nil { + if err := c.removeTimer(); err != nil { + logrus.Error(err) + } + } + // Clean up network namespace, if present if err := c.cleanupNetwork(); err != nil { lastError = err diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index d2c0ea0fb..d8f56860b 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -3,13 +3,16 @@ package libpod import ( "bufio" "bytes" + "fmt" "io/ioutil" "os" + "os/exec" "path/filepath" "strings" "time" "github.com/containers/libpod/pkg/inspect" + "github.com/coreos/go-systemd/dbus" "github.com/pkg/errors" "github.com/sirupsen/logrus" ) @@ -47,6 +50,10 @@ const ( HealthCheckHealthy string = "healthy" // HealthCheckUnhealthy describes an unhealthy container HealthCheckUnhealthy string = "unhealthy" + // HealthCheckStarting describes the time between when the container starts + // and the start-period (time allowed for the container to start and application + // to be running) expires.
+ HealthCheckStarting string = "starting" ) // hcWriteCloser allows us to use bufio as a WriteCloser @@ -68,17 +75,18 @@ func (r *Runtime) HealthCheck(name string) (HealthCheckStatus, error) { } hcStatus, err := checkHealthCheckCanBeRun(container) if err == nil { - return container.RunHealthCheck() + return container.runHealthCheck() } return hcStatus, err } -// RunHealthCheck runs the health check as defined by the container -func (c *Container) RunHealthCheck() (HealthCheckStatus, error) { +// runHealthCheck runs the health check as defined by the container +func (c *Container) runHealthCheck() (HealthCheckStatus, error) { var ( - newCommand []string - returnCode int - capture bytes.Buffer + newCommand []string + returnCode int + capture bytes.Buffer + inStartPeriod bool ) hcStatus, err := checkHealthCheckCanBeRun(c) if err != nil { @@ -111,12 +119,28 @@ func (c *Container) RunHealthCheck() (HealthCheckStatus, error) { returnCode = 1 } timeEnd := time.Now() + if c.HealthCheckConfig().StartPeriod > 0 { + // there is a start-period we need to honor; we add startPeriod to container start time + startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod) + if timeStart.Before(startPeriodTime) { + // we are still in the start period, flip the inStartPeriod bool + inStartPeriod = true + logrus.Debugf("healthcheck for %s being run in start-period", c.ID()) + } + } + eventLog := capture.String() if len(eventLog) > MaxHealthCheckLogLength { eventLog = eventLog[:MaxHealthCheckLogLength] } + + if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout { + returnCode = -1 + hcResult = HealthCheckFailure + hcErr = errors.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String()) + } hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog) - if err := c.updateHealthCheckLog(hcl); err != nil { + if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil { return hcResult, errors.Wrapf(err, "unable to update 
health check log %s for %s", c.healthCheckLogPath(), c.ID()) } return hcResult, hcErr @@ -145,8 +169,23 @@ func newHealthCheckLog(start, end time.Time, exitCode int, log string) inspect.H } } +// updatedHealthCheckStatus updates the health status of the container +// in the healthcheck log +func (c *Container) updateHealthStatus(status string) error { + healthCheck, err := c.GetHealthCheckLog() + if err != nil { + return err + } + healthCheck.Status = status + newResults, err := json.Marshal(healthCheck) + if err != nil { + return errors.Wrapf(err, "unable to marshall healthchecks for writing status") + } + return ioutil.WriteFile(c.healthCheckLogPath(), newResults, 0700) +} + // UpdateHealthCheckLog parses the health check results and writes the log -func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog) error { +func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog, inStartPeriod bool) error { healthCheck, err := c.GetHealthCheckLog() if err != nil { return err @@ -159,11 +198,13 @@ func (c *Container) updateHealthCheckLog(hcl inspect.HealthCheckLog) error { if len(healthCheck.Status) < 1 { healthCheck.Status = HealthCheckHealthy } - // increment failing streak - healthCheck.FailingStreak = healthCheck.FailingStreak + 1 - // if failing streak > retries, then status to unhealthy - if int(healthCheck.FailingStreak) > c.HealthCheckConfig().Retries { - healthCheck.Status = HealthCheckUnhealthy + if !inStartPeriod { + // increment failing streak + healthCheck.FailingStreak = healthCheck.FailingStreak + 1 + // if failing streak > retries, then status to unhealthy + if int(healthCheck.FailingStreak) >= c.HealthCheckConfig().Retries { + healthCheck.Status = HealthCheckUnhealthy + } } } healthCheck.Log = append(healthCheck.Log, hcl) @@ -199,3 +240,81 @@ func (c *Container) GetHealthCheckLog() (inspect.HealthCheckResults, error) { } return healthCheck, nil } + +// createTimer systemd timers for healthchecks of a container +func (c *Container) 
createTimer() error { + if c.disableHealthCheckSystemd() { + return nil + } + podman, err := os.Executable() + if err != nil { + return errors.Wrapf(err, "failed to get path for podman for a health check timer") + } + + var cmd = []string{"--unit", fmt.Sprintf("%s", c.ID()), fmt.Sprintf("--on-unit-inactive=%s", c.HealthCheckConfig().Interval.String()), "--timer-property=AccuracySec=1s", podman, "healthcheck", "run", c.ID()} + + conn, err := dbus.NewSystemdConnection() + if err != nil { + return errors.Wrapf(err, "unable to get systemd connection to add healthchecks") + } + conn.Close() + logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd) + systemdRun := exec.Command("systemd-run", cmd...) + _, err = systemdRun.CombinedOutput() + if err != nil { + return err + } + return nil +} + +// startTimer starts a systemd timer for the healthchecks +func (c *Container) startTimer() error { + if c.disableHealthCheckSystemd() { + return nil + } + conn, err := dbus.NewSystemdConnection() + if err != nil { + return errors.Wrapf(err, "unable to get systemd connection to start healthchecks") + } + defer conn.Close() + _, err = conn.StartUnit(fmt.Sprintf("%s.service", c.ID()), "fail", nil) + return err +} + +// removeTimer removes the systemd timer and unit files +// for the container +func (c *Container) removeTimer() error { + if c.disableHealthCheckSystemd() { + return nil + } + conn, err := dbus.NewSystemdConnection() + if err != nil { + return errors.Wrapf(err, "unable to get systemd connection to remove healthchecks") + } + defer conn.Close() + serviceFile := fmt.Sprintf("%s.timer", c.ID()) + _, err = conn.StopUnit(serviceFile, "fail", nil) + return err +} + +// HealthCheckStatus returns the current state of a container with a healthcheck +func (c *Container) HealthCheckStatus() (string, error) { + if !c.HasHealthCheck() { + return "", errors.Errorf("container %s has no defined healthcheck", c.ID()) + } + results, err := c.GetHealthCheckLog() + if err 
!= nil { + return "", errors.Wrapf(err, "unable to get healthcheck log for %s", c.ID()) + } + return results.Status, nil +} + +func (c *Container) disableHealthCheckSystemd() bool { + if os.Getenv("DISABLE_HC_SYSTEMD") == "true" { + return true + } + if c.config.HealthCheckConfig.Interval == 0 { + return true + } + return false +} diff --git a/test/e2e/common_test.go b/test/e2e/common_test.go index 54b2cbec2..b20b3b37e 100644 --- a/test/e2e/common_test.go +++ b/test/e2e/common_test.go @@ -239,7 +239,7 @@ func PodmanTestCreateUtil(tempDir string, remote bool) *PodmanTestIntegration { ociRuntime = "/usr/bin/runc" } } - + os.Setenv("DISABLE_HC_SYSTEMD", "true") CNIConfigDir := "/etc/cni/net.d" p := &PodmanTestIntegration{ @@ -314,6 +314,14 @@ func (s *PodmanSessionIntegration) InspectImageJSON() []inspect.ImageData { return i } +// InspectContainer returns a container's inspect data in JSON format +func (p *PodmanTestIntegration) InspectContainer(name string) []inspect.ContainerData { + cmd := []string{"inspect", name} + session := p.Podman(cmd) + session.WaitWithDefaultTimeout() + return session.InspectContainerToJSON() +} + func processTestResult(f GinkgoTestDescription) { tr := testResult{length: f.Duration.Seconds(), name: f.TestText} testResults = append(testResults, tr) diff --git a/test/e2e/healthcheck_run_test.go b/test/e2e/healthcheck_run_test.go index f178e8ad5..ec97fdf4a 100644 --- a/test/e2e/healthcheck_run_test.go +++ b/test/e2e/healthcheck_run_test.go @@ -83,4 +83,98 @@ var _ = Describe("Podman healthcheck run", func() { hc.WaitWithDefaultTimeout() Expect(hc.ExitCode()).To(Equal(125)) }) + + It("podman healthcheck should be starting", func() { + session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL ls /foo || exit 1\"", ALPINE, "top"}) + session.WaitWithDefaultTimeout() + Expect(session.ExitCode()).To(Equal(0)) + inspect := podmanTest.InspectContainer("hc") + 
Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting")) + }) + + It("podman healthcheck failed checks in start-period should not change status", func() { + session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-start-period", "2m", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL ls /foo || exit 1\"", ALPINE, "top"}) + session.WaitWithDefaultTimeout() + Expect(session.ExitCode()).To(Equal(0)) + + hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"}) + hc.WaitWithDefaultTimeout() + Expect(hc.ExitCode()).To(Equal(1)) + + hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"}) + hc.WaitWithDefaultTimeout() + Expect(hc.ExitCode()).To(Equal(1)) + + hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"}) + hc.WaitWithDefaultTimeout() + Expect(hc.ExitCode()).To(Equal(1)) + + inspect := podmanTest.InspectContainer("hc") + Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting")) + }) + + It("podman healthcheck failed checks must reach retries before unhealthy ", func() { + session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL ls /foo || exit 1\"", ALPINE, "top"}) + session.WaitWithDefaultTimeout() + Expect(session.ExitCode()).To(Equal(0)) + + hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"}) + hc.WaitWithDefaultTimeout() + Expect(hc.ExitCode()).To(Equal(1)) + + inspect := podmanTest.InspectContainer("hc") + Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting")) + + hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"}) + hc.WaitWithDefaultTimeout() + Expect(hc.ExitCode()).To(Equal(1)) + + inspect = podmanTest.InspectContainer("hc") + Expect(inspect[0].State.Healthcheck.Status).To(Equal("unhealthy")) + + }) + + It("podman healthcheck good check results in healthy even in start-period", func() { + session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", 
"--healthcheck-start-period", "2m", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL\" \"ls\" \"||\" \"exit\" \"1\"", ALPINE, "top"}) + session.WaitWithDefaultTimeout() + Expect(session.ExitCode()).To(Equal(0)) + + hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"}) + hc.WaitWithDefaultTimeout() + Expect(hc.ExitCode()).To(Equal(0)) + + inspect := podmanTest.InspectContainer("hc") + Expect(inspect[0].State.Healthcheck.Status).To(Equal("healthy")) + }) + + It("podman healthcheck single healthy result changes failed to healthy", func() { + session := podmanTest.Podman([]string{"run", "-dt", "--name", "hc", "--healthcheck-retries", "2", "--healthcheck-command", "\"CMD-SHELL\" \"ls\" \"/foo\" \"||\" \"exit\" \"1\"", ALPINE, "top"}) + session.WaitWithDefaultTimeout() + Expect(session.ExitCode()).To(Equal(0)) + + hc := podmanTest.Podman([]string{"healthcheck", "run", "hc"}) + hc.WaitWithDefaultTimeout() + Expect(hc.ExitCode()).To(Equal(1)) + + inspect := podmanTest.InspectContainer("hc") + Expect(inspect[0].State.Healthcheck.Status).To(Equal("starting")) + + hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"}) + hc.WaitWithDefaultTimeout() + Expect(hc.ExitCode()).To(Equal(1)) + + inspect = podmanTest.InspectContainer("hc") + Expect(inspect[0].State.Healthcheck.Status).To(Equal("unhealthy")) + + foo := podmanTest.Podman([]string{"exec", "hc", "touch", "/foo"}) + foo.WaitWithDefaultTimeout() + Expect(foo.ExitCode()).To(BeZero()) + + hc = podmanTest.Podman([]string{"healthcheck", "run", "hc"}) + hc.WaitWithDefaultTimeout() + Expect(hc.ExitCode()).To(Equal(0)) + + inspect = podmanTest.InspectContainer("hc") + Expect(inspect[0].State.Healthcheck.Status).To(Equal("healthy")) + }) }) -- cgit v1.2.3-54-g00ecf