summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorValentin Rothberg <vrothberg@redhat.com>2022-05-27 13:04:54 +0200
committerMatthew Heon <mheon@redhat.com>2022-06-14 16:12:10 -0400
commitef2f69d4c255bcae6f900fe9617ea8f9d593afd7 (patch)
tree24315fae45e9c7de024bbf43e7afc741d6b2b7ee
parent4c52debba2ec4807b57deb704cb8cea941a09be3 (diff)
downloadpodman-ef2f69d4c255bcae6f900fe9617ea8f9d593afd7.tar.gz
podman-ef2f69d4c255bcae6f900fe9617ea8f9d593afd7.tar.bz2
podman-ef2f69d4c255bcae6f900fe9617ea8f9d593afd7.zip
healthcheck: wait for systemd operations
Make sure to wait for the systemd operations to finish when starting/stopping healtcheck timers and services. Also make sure to stop the timer before the service to avoid a race with the timer. [NO NEW TESTS NEEDED] since it is a non-functional change and existing tests are expected to pass. Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
-rw-r--r--libpod/healthcheck_linux.go70
1 files changed, 49 insertions, 21 deletions
diff --git a/libpod/healthcheck_linux.go b/libpod/healthcheck_linux.go
index 45b3a0e41..1e03db542 100644
--- a/libpod/healthcheck_linux.go
+++ b/libpod/healthcheck_linux.go
@@ -7,6 +7,7 @@ import (
"os/exec"
"strings"
+ "github.com/containers/podman/v4/pkg/errorhandling"
"github.com/containers/podman/v4/pkg/rootless"
"github.com/containers/podman/v4/pkg/systemd"
"github.com/pkg/errors"
@@ -46,6 +47,17 @@ func (c *Container) createTimer() error {
return nil
}
+// Wait for a message on the channel. Throw an error if the message is not "done".
+func systemdOpSuccessful(c chan string) error {
+ msg := <-c
+ switch msg {
+ case "done":
+ return nil
+ default:
+ return fmt.Errorf("expected %q but received %q", "done", msg)
+ }
+}
+
// startTimer starts a systemd timer for the healthchecks
func (c *Container) startTimer() error {
if c.disableHealthCheckSystemd() {
@@ -56,8 +68,17 @@ func (c *Container) startTimer() error {
return errors.Wrapf(err, "unable to get systemd connection to start healthchecks")
}
defer conn.Close()
- _, err = conn.StartUnitContext(context.Background(), fmt.Sprintf("%s.service", c.ID()), "fail", nil)
- return err
+
+ startFile := fmt.Sprintf("%s.service", c.ID())
+ startChan := make(chan string)
+ if _, err := conn.StartUnitContext(context.Background(), startFile, "fail", startChan); err != nil {
+ return err
+ }
+ if err := systemdOpSuccessful(startChan); err != nil {
+ return fmt.Errorf("starting systemd health-check timer %q: %w", startFile, err)
+ }
+
+ return nil
}
// removeTransientFiles removes the systemd timer and unit files
@@ -71,30 +92,37 @@ func (c *Container) removeTransientFiles(ctx context.Context) error {
return errors.Wrapf(err, "unable to get systemd connection to remove healthchecks")
}
defer conn.Close()
+
+ // Errors are returned at the very end. Let's make sure to stop and
+ // clean up as much as possible.
+ stopErrors := []error{}
+
+ // Stop the timer before the service to make sure the timer does not
+ // fire after the service is stopped.
+ timerChan := make(chan string)
timerFile := fmt.Sprintf("%s.timer", c.ID())
- serviceFile := fmt.Sprintf("%s.service", c.ID())
+ if _, err := conn.StopUnitContext(ctx, timerFile, "fail", timerChan); err != nil {
+ if !strings.HasSuffix(err.Error(), ".timer not loaded.") {
+ stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err))
+ }
+ } else if err := systemdOpSuccessful(timerChan); err != nil {
+ stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check timer %q: %w", timerFile, err))
+ }
- // If the service has failed (the healthcheck has failed), then
- // the .service file is not removed on stopping the unit file. If
- // we check the properties of the service, it will automatically
- // reset the state. But checking the state takes msecs vs usecs to
- // blindly call reset.
+ // Reset the service before stopping it to make sure it's being removed
+ // on stop.
+ serviceChan := make(chan string)
+ serviceFile := fmt.Sprintf("%s.service", c.ID())
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
- logrus.Debugf("failed to reset unit file: %q", err)
+ logrus.Debugf("Failed to reset unit file: %q", err)
}
-
- // We want to ignore errors where the timer unit and/or service unit has already
- // been removed. The error return is generic so we have to check against the
- // string in the error
- if _, err = conn.StopUnitContext(ctx, serviceFile, "fail", nil); err != nil {
+ if _, err := conn.StopUnitContext(ctx, serviceFile, "fail", serviceChan); err != nil {
if !strings.HasSuffix(err.Error(), ".service not loaded.") {
- return errors.Wrapf(err, "unable to remove service file")
- }
- }
- if _, err = conn.StopUnitContext(ctx, timerFile, "fail", nil); err != nil {
- if strings.HasSuffix(err.Error(), ".timer not loaded.") {
- return nil
+ stopErrors = append(stopErrors, fmt.Errorf("removing health-check service %q: %w", serviceFile, err))
}
+ } else if err := systemdOpSuccessful(serviceChan); err != nil {
+ stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check service %q: %w", serviceFile, err))
}
- return err
+
+ return errorhandling.JoinErrors(stopErrors)
}