From c22f3e8b4e2593ca81d08924889f7e7251c83089 Mon Sep 17 00:00:00 2001 From: Daniel J Walsh Date: Wed, 24 Mar 2021 07:49:29 -0400 Subject: Implement SD-NOTIFY proxy in conmon This leverages conmon's ability to proxy the SD-NOTIFY socket. This prevents locking caused by OCI runtime blocking, waiting for SD-NOTIFY messages, and instead passes the messages directly up to the host. NOTE: Also re-enable the auto-update tests which has been disabled due to flakiness. With this change, Podman properly integrates into systemd. Fixes: #7316 Signed-off-by: Joseph Gooch Signed-off-by: Daniel J Walsh Signed-off-by: Valentin Rothberg --- libpod/container.go | 4 ++++ libpod/container_internal_linux.go | 40 +++++++++++++++++++++++++++++++ libpod/diff.go | 1 + libpod/oci_conmon_exec_linux.go | 2 +- libpod/oci_conmon_linux.go | 49 +++++++++++++++++++++++--------------- test/system/255-auto-update.bats | 1 - test/system/260-sdnotify.bats | 4 +++- 7 files changed, 79 insertions(+), 22 deletions(-) diff --git a/libpod/container.go b/libpod/container.go index 80fd35c09..c57250d72 100644 --- a/libpod/container.go +++ b/libpod/container.go @@ -126,6 +126,10 @@ type Container struct { // This is true if a container is restored from a checkpoint. restoreFromCheckpoint bool + // Used to query the NOTIFY_SOCKET once along with setting up + // mounts etc. + notifySocket string + slirp4netnsSubnet *net.IPNet } diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index f21aebb09..8b73c82de 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -352,6 +352,10 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { return nil, err } + if err := c.mountNotifySocket(g); err != nil { + return nil, err + } + // Get host UID and GID based on the container process UID and GID. hostUID, hostGID, err := butil.GetHostIDs(util.IDtoolsToRuntimeSpec(c.config.IDMappings.UIDMap), util.IDtoolsToRuntimeSpec(c.config.IDMappings.GIDMap), uint32(execUser.Uid), uint32(execUser.Gid)) if err != nil { @@ -777,6 +781,41 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { return g.Config, nil } +// mountNotifySocket mounts the NOTIFY_SOCKET into the container if it's set +// and if the sdnotify mode is set to container. It also sets c.notifySocket +// to avoid redundantly looking up the env variable. +func (c *Container) mountNotifySocket(g generate.Generator) error { + notify, ok := os.LookupEnv("NOTIFY_SOCKET") + if !ok { + return nil + } + c.notifySocket = notify + + if c.config.SdNotifyMode != define.SdNotifyModeContainer { + return nil + } + + notifyDir := filepath.Join(c.bundlePath(), "notify") + logrus.Debugf("checking notify %q dir", notifyDir) + if err := os.MkdirAll(notifyDir, 0755); err != nil { + if !os.IsExist(err) { + return errors.Wrapf(err, "unable to create notify %q dir", notifyDir) + } + } + if err := label.Relabel(notifyDir, c.MountLabel(), true); err != nil { + return errors.Wrapf(err, "relabel failed %q", notifyDir) + } + logrus.Debugf("add bindmount notify %q dir", notifyDir) + if _, ok := c.state.BindMounts["/run/notify"]; !ok { + c.state.BindMounts["/run/notify"] = notifyDir + } + + // Set the container's notify socket to the proxy socket created by conmon + g.AddProcessEnv("NOTIFY_SOCKET", "/run/notify/notify.sock") + + return nil +} + // systemd expects to have /run, /run/lock and /tmp on tmpfs // It also expects to be able to write to /sys/fs/cgroup/systemd and /var/log/journal func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) error { @@ -1730,6 +1769,7 @@ rootless=%d c.state.BindMounts[dest] = src } } + return nil } diff --git a/libpod/diff.go b/libpod/diff.go index cdd5e79cb..6a50bef32 100644 --- a/libpod/diff.go +++ b/libpod/diff.go @@ -14,6 +14,7 @@ var initInodes = map[string]bool{ "/etc/resolv.conf": true, "/proc": true, "/run": true, + "/run/notify": true, "/run/.containerenv": true, "/run/secrets": true, "/sys": true, diff --git a/libpod/oci_conmon_exec_linux.go b/libpod/oci_conmon_exec_linux.go index 05a4e19b0..469bc7d86 100644 --- a/libpod/oci_conmon_exec_linux.go +++ b/libpod/oci_conmon_exec_linux.go @@ -462,7 +462,7 @@ func (r *ConmonOCIRuntime) startExec(c *Container, sessionID string, options *Ex Setpgid: true, } - err = startCommandGivenSelinux(execCmd) + err = startCommandGivenSelinux(execCmd, c) // We don't need children pipes on the parent side errorhandling.CloseQuiet(childSyncPipe) diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index 846d3815a..ff25be234 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -364,11 +364,6 @@ func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { return err } env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if ctr.config.SdNotifyMode == define.SdNotifyModeContainer { - if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok { - env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify)) - } - } if path, ok := os.LookupEnv("PATH"); ok { env = append(env, fmt.Sprintf("PATH=%s", path)) } @@ -1014,12 +1009,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co } } - if ctr.config.SdNotifyMode == define.SdNotifyModeIgnore { - if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil { - logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err) - } - } - pidfile := ctr.config.PidFile if pidfile == "" { pidfile = filepath.Join(ctr.state.RunDir, "pidfile") @@ -1027,6 +1016,10 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) + if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.notifySocket != "" { + args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.notifySocket)) + } + if ctr.config.Spec.Process.Terminal { args = append(args, "-t") } else if ctr.config.Stdin { @@ -1171,7 +1164,8 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co } } - err = startCommandGivenSelinux(cmd) + err = startCommandGivenSelinux(cmd, ctr) + // regardless of whether we errored or not, we no longer need the children pipes childSyncPipe.Close() childStartPipe.Close() @@ -1203,7 +1197,13 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co // conmon not having a pid file is a valid state, so don't set it if we don't have it logrus.Infof("Got Conmon PID as %d", conmonPID) ctr.state.ConmonPID = conmonPID - if ctr.config.SdNotifyMode != define.SdNotifyModeIgnore { + + // Send the MAINPID via sdnotify if needed. + switch ctr.config.SdNotifyMode { + case define.SdNotifyModeContainer, define.SdNotifyModeIgnore: + // Nothing to do or conmon takes care of it already. + + default: if sent, err := daemon.SdNotify(false, fmt.Sprintf("MAINPID=%d", conmonPID)); err != nil { logrus.Errorf("Error notifying systemd of Conmon PID: %v", err) } else if sent { @@ -1239,11 +1239,6 @@ func (r *ConmonOCIRuntime) configureConmonEnv(ctr *Container, runtimeDir string) } extraFiles := make([]*os.File, 0) - if ctr.config.SdNotifyMode == define.SdNotifyModeContainer { - if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok { - env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify)) - } - } if !r.sdNotify { if listenfds, ok := os.LookupEnv("LISTEN_FDS"); ok { env = append(env, fmt.Sprintf("LISTEN_FDS=%s", listenfds), "LISTEN_PID=1") @@ -1335,7 +1330,23 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p // startCommandGivenSelinux starts a container ensuring to set the labels of // the process to make sure SELinux doesn't block conmon communication, if SELinux is enabled -func startCommandGivenSelinux(cmd *exec.Cmd) error { +func startCommandGivenSelinux(cmd *exec.Cmd, ctr *Container) error { + // Make sure to unset the NOTIFY_SOCKET and reset if afterwards if needed. + switch ctr.config.SdNotifyMode { + case define.SdNotifyModeContainer, define.SdNotifyModeIgnore: + if ctr.notifySocket != "" { + if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil { + logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err) + } + + defer func() { + if err := os.Setenv("NOTIFY_SOCKET", ctr.notifySocket); err != nil { + logrus.Errorf("Error resetting NOTIFY_SOCKET=%s", ctr.notifySocket) + } + }() + } + } + if !selinux.GetEnabled() { return cmd.Start() } diff --git a/test/system/255-auto-update.bats b/test/system/255-auto-update.bats index 8bb32b5b7..7766ca3f9 100644 --- a/test/system/255-auto-update.bats +++ b/test/system/255-auto-update.bats @@ -221,7 +221,6 @@ function _confirm_update() { } @test "podman auto-update - label io.containers.autoupdate=local with rollback" { - skip "This test flakes way too often, see #11175" # sdnotify fails with runc 1.0.0-3-dev2 on Ubuntu. Let's just # assume that we work only with crun, nothing else. # [copied from 260-sdnotify.bats] diff --git a/test/system/260-sdnotify.bats b/test/system/260-sdnotify.bats index acb30de47..b5d3f9b86 100644 --- a/test/system/260-sdnotify.bats +++ b/test/system/260-sdnotify.bats @@ -130,6 +130,8 @@ function _assert_mainpid_is_conmon() { _stop_socat } +# These tests can fail in dev. environment because of SELinux. +# quick fix: chcon -t container_runtime_exec_t ./bin/podman @test "sdnotify : container" { # Sigh... we need to pull a humongous image because it has systemd-notify. # (IMPORTANT: fedora:32 and above silently removed systemd-notify; this @@ -150,7 +152,7 @@ function _assert_mainpid_is_conmon() { wait_for_ready $cid run_podman logs $cid - is "${lines[0]}" "/.*/container\.sock/notify" "NOTIFY_SOCKET is passed to container" + is "${lines[0]}" "/run/notify/notify.sock" "NOTIFY_SOCKET is passed to container" # With container, READY=1 isn't necessarily the last message received; # just look for it anywhere in received messages -- cgit v1.2.3-54-g00ecf