summaryrefslogtreecommitdiff
path: root/libpod/oci_conmon_common.go
diff options
context:
space:
mode:
authorDoug Rabson <dfr@rabson.org>2022-08-17 09:35:19 +0100
committerDoug Rabson <dfr@rabson.org>2022-08-18 08:05:42 +0100
commitbebf55c0f22d6723a27cd39561c0577aa557c5e1 (patch)
treec5a3cbbab83707dbd1e34f72ff22dc05010ce531 /libpod/oci_conmon_common.go
parent1f0c3d52628e7c5b22ee500194155bdd20ad271f (diff)
downloadpodman-bebf55c0f22d6723a27cd39561c0577aa557c5e1.tar.gz
podman-bebf55c0f22d6723a27cd39561c0577aa557c5e1.tar.bz2
podman-bebf55c0f22d6723a27cd39561c0577aa557c5e1.zip
libpod: Move oci_conmon_linux.go to oci_conmon_common.go
[NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson <dfr@rabson.org>
Diffstat (limited to 'libpod/oci_conmon_common.go')
-rw-r--r--libpod/oci_conmon_common.go1929
1 files changed, 1929 insertions, 0 deletions
diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go
new file mode 100644
index 000000000..1b654ed33
--- /dev/null
+++ b/libpod/oci_conmon_common.go
@@ -0,0 +1,1929 @@
+//go:build linux
+// +build linux
+
+package libpod
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "net"
+ "net/http"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "runtime"
+ "strconv"
+ "strings"
+ "sync"
+ "syscall"
+ "text/template"
+ "time"
+
+ runcconfig "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/devices"
+
+ "github.com/containers/common/pkg/cgroups"
+ "github.com/containers/common/pkg/config"
+ "github.com/containers/common/pkg/resize"
+ cutil "github.com/containers/common/pkg/util"
+ conmonConfig "github.com/containers/conmon/runner/config"
+ "github.com/containers/podman/v4/libpod/define"
+ "github.com/containers/podman/v4/libpod/logs"
+ "github.com/containers/podman/v4/pkg/checkpoint/crutils"
+ "github.com/containers/podman/v4/pkg/errorhandling"
+ "github.com/containers/podman/v4/pkg/rootless"
+ "github.com/containers/podman/v4/pkg/specgenutil"
+ "github.com/containers/podman/v4/pkg/util"
+ "github.com/containers/podman/v4/utils"
+ "github.com/containers/storage/pkg/homedir"
+ pmount "github.com/containers/storage/pkg/mount"
+ spec "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/opencontainers/selinux/go-selinux/label"
+ "github.com/sirupsen/logrus"
+ "golang.org/x/sys/unix"
+)
+
+const (
+ // This is Conmon's STDIO_BUF_SIZE. I don't believe we have access to it
+ // directly from the Go code, so const it here
+ // Important: The conmon attach socket uses an extra byte at the beginning of each
+ // message to specify the STREAM so we have to increase the buffer size by one
+ bufferSize = conmonConfig.BufSize + 1
+)
+
+// ConmonOCIRuntime is an OCI runtime managed by Conmon.
+// TODO: Make all calls to OCI runtime have a timeout.
+type ConmonOCIRuntime struct {
+ name string
+ path string
+ conmonPath string
+ conmonEnv []string
+ tmpDir string
+ exitsDir string
+ logSizeMax int64
+ noPivot bool
+ reservePorts bool
+ runtimeFlags []string
+ supportsJSON bool
+ supportsKVM bool
+ supportsNoCgroups bool
+ enableKeyring bool
+}
+
+// Make a new Conmon-based OCI runtime with the given options.
+// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or
+// any runtime with a runc-compatible CLI.
+// The first path that points to a valid executable will be used.
+// Deliberately private. Someone should not be able to construct this outside of
+// libpod.
+func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeFlags []string, runtimeCfg *config.Config) (OCIRuntime, error) {
+ if name == "" {
+ return nil, fmt.Errorf("the OCI runtime must be provided a non-empty name: %w", define.ErrInvalidArg)
+ }
+
+ // Make lookup tables for runtime support
+ supportsJSON := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsJSON))
+ supportsNoCgroups := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsNoCgroups))
+ supportsKVM := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsKVM))
+ for _, r := range runtimeCfg.Engine.RuntimeSupportsJSON {
+ supportsJSON[r] = true
+ }
+ for _, r := range runtimeCfg.Engine.RuntimeSupportsNoCgroups {
+ supportsNoCgroups[r] = true
+ }
+ for _, r := range runtimeCfg.Engine.RuntimeSupportsKVM {
+ supportsKVM[r] = true
+ }
+
+ runtime := new(ConmonOCIRuntime)
+ runtime.name = name
+ runtime.conmonPath = conmonPath
+ runtime.runtimeFlags = runtimeFlags
+
+ runtime.conmonEnv = runtimeCfg.Engine.ConmonEnvVars
+ runtime.tmpDir = runtimeCfg.Engine.TmpDir
+ runtime.logSizeMax = runtimeCfg.Containers.LogSizeMax
+ runtime.noPivot = runtimeCfg.Engine.NoPivotRoot
+ runtime.reservePorts = runtimeCfg.Engine.EnablePortReservation
+ runtime.enableKeyring = runtimeCfg.Containers.EnableKeyring
+
+ // TODO: probe OCI runtime for feature and enable automatically if
+ // available.
+
+ base := filepath.Base(name)
+ runtime.supportsJSON = supportsJSON[base]
+ runtime.supportsNoCgroups = supportsNoCgroups[base]
+ runtime.supportsKVM = supportsKVM[base]
+
+ foundPath := false
+ for _, path := range paths {
+ stat, err := os.Stat(path)
+ if err != nil {
+ if os.IsNotExist(err) {
+ continue
+ }
+ return nil, fmt.Errorf("cannot stat OCI runtime %s path: %w", name, err)
+ }
+ if !stat.Mode().IsRegular() {
+ continue
+ }
+ foundPath = true
+ logrus.Tracef("found runtime %q", path)
+ runtime.path = path
+ break
+ }
+
+ // Search the $PATH as last fallback
+ if !foundPath {
+ if foundRuntime, err := exec.LookPath(name); err == nil {
+ foundPath = true
+ runtime.path = foundRuntime
+ logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime)
+ }
+ }
+
+ if !foundPath {
+ return nil, fmt.Errorf("no valid executable found for OCI runtime %s: %w", name, define.ErrInvalidArg)
+ }
+
+ runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits")
+
+ // Create the exit files and attach sockets directories
+ if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil {
+ // The directory is allowed to exist
+ if !os.IsExist(err) {
+ return nil, fmt.Errorf("error creating OCI runtime exit files directory: %w", err)
+ }
+ }
+ return runtime, nil
+}
+
+// Name returns the name of the runtime being wrapped by Conmon.
+func (r *ConmonOCIRuntime) Name() string {
+ return r.name
+}
+
+// Path returns the path of the OCI runtime being wrapped by Conmon.
+func (r *ConmonOCIRuntime) Path() string {
+ return r.path
+}
+
+// hasCurrentUserMapped checks whether the current user is mapped inside the container user namespace
+func hasCurrentUserMapped(ctr *Container) bool {
+ if len(ctr.config.IDMappings.UIDMap) == 0 && len(ctr.config.IDMappings.GIDMap) == 0 {
+ return true
+ }
+ uid := os.Geteuid()
+ for _, m := range ctr.config.IDMappings.UIDMap {
+ if uid >= m.HostID && uid < m.HostID+m.Size {
+ return true
+ }
+ }
+ return false
+}
+
+// CreateContainer creates a container.
+func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
+ // always make the run dir accessible to the current user so that the PID files can be read without
+ // being in the rootless user namespace.
+ if err := makeAccessible(ctr.state.RunDir, 0, 0); err != nil {
+ return 0, err
+ }
+ if !hasCurrentUserMapped(ctr) {
+ for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.Engine.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.Engine.VolumePath} {
+ if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil {
+ return 0, err
+ }
+ }
+
+ // if we are running a non privileged container, be sure to umount some kernel paths so they are not
+ // bind mounted inside the container at all.
+ if !ctr.config.Privileged && !rootless.IsRootless() {
+ type result struct {
+ restoreDuration int64
+ err error
+ }
+ ch := make(chan result)
+ go func() {
+ runtime.LockOSThread()
+ restoreDuration, err := func() (int64, error) {
+ fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid()))
+ if err != nil {
+ return 0, err
+ }
+ defer errorhandling.CloseQuiet(fd)
+
+ // create a new mountns on the current thread
+ if err = unix.Unshare(unix.CLONE_NEWNS); err != nil {
+ return 0, err
+ }
+ defer func() {
+ if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil {
+ logrus.Errorf("Unable to clone new namespace: %q", err)
+ }
+ }()
+
+ // don't spread our mounts around. We are setting only /sys to be slave
+ // so that the cleanup process is still able to umount the storage and the
+ // changes are propagated to the host.
+ err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "")
+ if err != nil {
+ return 0, fmt.Errorf("cannot make /sys slave: %w", err)
+ }
+
+ mounts, err := pmount.GetMounts()
+ if err != nil {
+ return 0, err
+ }
+ for _, m := range mounts {
+ if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") {
+ continue
+ }
+ err = unix.Unmount(m.Mountpoint, 0)
+ if err != nil && !os.IsNotExist(err) {
+ return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err)
+ }
+ }
+ return r.createOCIContainer(ctr, restoreOptions)
+ }()
+ ch <- result{
+ restoreDuration: restoreDuration,
+ err: err,
+ }
+ }()
+ r := <-ch
+ return r.restoreDuration, r.err
+ }
+ }
+ return r.createOCIContainer(ctr, restoreOptions)
+}
+
+// UpdateContainerStatus retrieves the current status of the container from the
+// runtime. It updates the container's state but does not save it.
+// If useRuntime is false, we will not directly hit runc to see the container's
+// status, but will instead only check for the existence of the conmon exit file
+// and update state to stopped if it exists.
+func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error {
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return err
+ }
+
+ // Store old state so we know if we were already stopped
+ oldState := ctr.state.State
+
+ state := new(spec.State)
+
+ cmd := exec.Command(r.path, "state", ctr.ID())
+ cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir))
+
+ outPipe, err := cmd.StdoutPipe()
+ if err != nil {
+ return fmt.Errorf("getting stdout pipe: %w", err)
+ }
+ errPipe, err := cmd.StderrPipe()
+ if err != nil {
+ return fmt.Errorf("getting stderr pipe: %w", err)
+ }
+
+ if err := cmd.Start(); err != nil {
+ out, err2 := ioutil.ReadAll(errPipe)
+ if err2 != nil {
+ return fmt.Errorf("error getting container %s state: %w", ctr.ID(), err)
+ }
+ if strings.Contains(string(out), "does not exist") || strings.Contains(string(out), "No such file") {
+ if err := ctr.removeConmonFiles(); err != nil {
+ logrus.Debugf("unable to remove conmon files for container %s", ctr.ID())
+ }
+ ctr.state.ExitCode = -1
+ ctr.state.FinishedTime = time.Now()
+ ctr.state.State = define.ContainerStateExited
+ return ctr.runtime.state.AddContainerExitCode(ctr.ID(), ctr.state.ExitCode)
+ }
+ return fmt.Errorf("error getting container %s state. stderr/out: %s: %w", ctr.ID(), out, err)
+ }
+ defer func() {
+ _ = cmd.Wait()
+ }()
+
+ if err := errPipe.Close(); err != nil {
+ return err
+ }
+ out, err := ioutil.ReadAll(outPipe)
+ if err != nil {
+ return fmt.Errorf("error reading stdout: %s: %w", ctr.ID(), err)
+ }
+ if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil {
+ return fmt.Errorf("error decoding container status for container %s: %w", ctr.ID(), err)
+ }
+ ctr.state.PID = state.Pid
+
+ switch state.Status {
+ case "created":
+ ctr.state.State = define.ContainerStateCreated
+ case "paused":
+ ctr.state.State = define.ContainerStatePaused
+ case "running":
+ ctr.state.State = define.ContainerStateRunning
+ case "stopped":
+ ctr.state.State = define.ContainerStateStopped
+ default:
+ return fmt.Errorf("unrecognized status returned by runtime for container %s: %s: %w",
+ ctr.ID(), state.Status, define.ErrInternal)
+ }
+
+ // Only grab exit status if we were not already stopped
+ // If we were, it should already be in the database
+ if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped {
+ if _, err := ctr.Wait(context.Background()); err != nil {
+ logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err)
+ }
+ return nil
+ }
+
+ // Handle ContainerStateStopping - keep it unless the container
+ // transitioned to no longer running.
+ if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) {
+ ctr.state.State = define.ContainerStateStopping
+ }
+
+ return nil
+}
+
+// StartContainer starts the given container.
+// Sets time the container was started, but does not save it.
+func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error {
+ // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers?
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return err
+ }
+ env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
+ if path, ok := os.LookupEnv("PATH"); ok {
+ env = append(env, fmt.Sprintf("PATH=%s", path))
+ }
+ if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "start", ctr.ID())...); err != nil {
+ return err
+ }
+
+ ctr.state.StartedTime = time.Now()
+
+ return nil
+}
+
+// KillContainer sends the given signal to the given container.
+// If all is set, send to all PIDs in the container.
+// All is only supported if the container created cgroups.
+func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error {
+ logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID())
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return err
+ }
+ env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
+ var args []string
+ args = append(args, r.runtimeFlags...)
+ if all {
+ args = append(args, "kill", "--all", ctr.ID(), fmt.Sprintf("%d", signal))
+ } else {
+ args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal))
+ }
+ if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil {
+ // Update container state - there's a chance we failed because
+ // the container exited in the meantime.
+ if err2 := r.UpdateContainerStatus(ctr); err2 != nil {
+ logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2)
+ }
+ if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
+ return define.ErrCtrStateInvalid
+ }
+ return fmt.Errorf("error sending signal to container %s: %w", ctr.ID(), err)
+ }
+
+ return nil
+}
+
+// StopContainer stops a container, first using its given stop signal (or
+// SIGTERM if no signal was specified), then using SIGKILL.
+// Timeout is given in seconds. If timeout is 0, the container will be
+// immediately kill with SIGKILL.
+// Does not set finished time for container, assumes you will run updateStatus
+// after to pull the exit code.
+func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error {
+ logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID)
+
+ // Ping the container to see if it's alive
+ // If it's not, it's already stopped, return
+ err := unix.Kill(ctr.state.PID, 0)
+ if err == unix.ESRCH {
+ return nil
+ }
+
+ stopSignal := ctr.config.StopSignal
+ if stopSignal == 0 {
+ stopSignal = uint(syscall.SIGTERM)
+ }
+
+ if timeout > 0 {
+ if err := r.KillContainer(ctr, stopSignal, all); err != nil {
+ // Is the container gone?
+ // If so, it probably died between the first check and
+ // our sending the signal
+ // The container is stopped, so exit cleanly
+ err := unix.Kill(ctr.state.PID, 0)
+ if err == unix.ESRCH {
+ return nil
+ }
+
+ return err
+ }
+
+ if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil {
+ logrus.Debugf("Timed out stopping container %s with %s, resorting to SIGKILL: %v", ctr.ID(), unix.SignalName(syscall.Signal(stopSignal)), err)
+ logrus.Warnf("StopSignal %s failed to stop container %s in %d seconds, resorting to SIGKILL", unix.SignalName(syscall.Signal(stopSignal)), ctr.Name(), timeout)
+ } else {
+ // No error, the container is dead
+ return nil
+ }
+ }
+
+ if err := r.KillContainer(ctr, 9, all); err != nil {
+ // Again, check if the container is gone. If it is, exit cleanly.
+ err := unix.Kill(ctr.state.PID, 0)
+ if err == unix.ESRCH {
+ return nil
+ }
+
+ return fmt.Errorf("error sending SIGKILL to container %s: %w", ctr.ID(), err)
+ }
+
+ // Give runtime a few seconds to make it happen
+ if err := waitContainerStop(ctr, killContainerTimeout); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// DeleteContainer deletes a container from the OCI runtime.
+func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error {
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return err
+ }
+ env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
+ return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "delete", "--force", ctr.ID())...)
+}
+
+// PauseContainer pauses the given container.
+func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error {
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return err
+ }
+ env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
+ return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "pause", ctr.ID())...)
+}
+
+// UnpauseContainer unpauses the given container.
+func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error {
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return err
+ }
+ env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
+ return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "resume", ctr.ID())...)
+}
+
+// HTTPAttach performs an attach for the HTTP API.
+// The caller must handle closing the HTTP connection after this returns.
+// The cancel channel is not closed; it is up to the caller to do so after
+// this function returns.
+// If this is a container with a terminal, we will stream raw. If it is not, we
+// will stream with an 8-byte header to multiplex STDOUT and STDERR.
+// Returns any errors that occurred, and whether the connection was successfully
+// hijacked before that error occurred.
+func (r *ConmonOCIRuntime) HTTPAttach(ctr *Container, req *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, detachKeys *string, cancel <-chan bool, hijackDone chan<- bool, streamAttach, streamLogs bool) (deferredErr error) {
+ isTerminal := false
+ if ctr.config.Spec.Process != nil {
+ isTerminal = ctr.config.Spec.Process.Terminal
+ }
+
+ if streams != nil {
+ if !streams.Stdin && !streams.Stdout && !streams.Stderr {
+ return fmt.Errorf("must specify at least one stream to attach to: %w", define.ErrInvalidArg)
+ }
+ }
+
+ attachSock, err := r.AttachSocketPath(ctr)
+ if err != nil {
+ return err
+ }
+
+ var conn *net.UnixConn
+ if streamAttach {
+ newConn, err := openUnixSocket(attachSock)
+ if err != nil {
+ return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err)
+ }
+ conn = newConn
+ defer func() {
+ if err := conn.Close(); err != nil {
+ logrus.Errorf("Unable to close container %s attach socket: %q", ctr.ID(), err)
+ }
+ }()
+
+ logrus.Debugf("Successfully connected to container %s attach socket %s", ctr.ID(), attachSock)
+ }
+
+ detachString := ctr.runtime.config.Engine.DetachKeys
+ if detachKeys != nil {
+ detachString = *detachKeys
+ }
+ detach, err := processDetachKeys(detachString)
+ if err != nil {
+ return err
+ }
+
+ attachStdout := true
+ attachStderr := true
+ attachStdin := true
+ if streams != nil {
+ attachStdout = streams.Stdout
+ attachStderr = streams.Stderr
+ attachStdin = streams.Stdin
+ }
+
+ logrus.Debugf("Going to hijack container %s attach connection", ctr.ID())
+
+ // Alright, let's hijack.
+ hijacker, ok := w.(http.Hijacker)
+ if !ok {
+ return fmt.Errorf("unable to hijack connection")
+ }
+
+ httpCon, httpBuf, err := hijacker.Hijack()
+ if err != nil {
+ return fmt.Errorf("error hijacking connection: %w", err)
+ }
+
+ hijackDone <- true
+
+ writeHijackHeader(req, httpBuf)
+
+ // Force a flush after the header is written.
+ if err := httpBuf.Flush(); err != nil {
+ return fmt.Errorf("error flushing HTTP hijack header: %w", err)
+ }
+
+ defer func() {
+ hijackWriteErrorAndClose(deferredErr, ctr.ID(), isTerminal, httpCon, httpBuf)
+ }()
+
+ logrus.Debugf("Hijack for container %s attach session done, ready to stream", ctr.ID())
+
+ // TODO: This is gross. Really, really gross.
+ // I want to say we should read all the logs into an array before
+ // calling this, in container_api.go, but that could take a lot of
+ // memory...
+ // On the whole, we need to figure out a better way of doing this,
+ // though.
+ logSize := 0
+ if streamLogs {
+ logrus.Debugf("Will stream logs for container %s attach session", ctr.ID())
+
+ // Get all logs for the container
+ logChan := make(chan *logs.LogLine)
+ logOpts := new(logs.LogOptions)
+ logOpts.Tail = -1
+ logOpts.WaitGroup = new(sync.WaitGroup)
+ errChan := make(chan error)
+ go func() {
+ var err error
+ // In non-terminal mode we need to prepend with the
+ // stream header.
+ logrus.Debugf("Writing logs for container %s to HTTP attach", ctr.ID())
+ for logLine := range logChan {
+ if !isTerminal {
+ device := logLine.Device
+ var header []byte
+ headerLen := uint32(len(logLine.Msg))
+ logSize += len(logLine.Msg)
+ switch strings.ToLower(device) {
+ case "stdin":
+ header = makeHTTPAttachHeader(0, headerLen)
+ case "stdout":
+ header = makeHTTPAttachHeader(1, headerLen)
+ case "stderr":
+ header = makeHTTPAttachHeader(2, headerLen)
+ default:
+ logrus.Errorf("Unknown device for log line: %s", device)
+ header = makeHTTPAttachHeader(1, headerLen)
+ }
+ _, err = httpBuf.Write(header)
+ if err != nil {
+ break
+ }
+ }
+ _, err = httpBuf.Write([]byte(logLine.Msg))
+ if err != nil {
+ break
+ }
+ if !logLine.Partial() {
+ _, err = httpBuf.Write([]byte("\n"))
+ if err != nil {
+ break
+ }
+ }
+ err = httpBuf.Flush()
+ if err != nil {
+ break
+ }
+ }
+ errChan <- err
+ }()
+ if err := ctr.ReadLog(context.Background(), logOpts, logChan, 0); err != nil {
+ return err
+ }
+ go func() {
+ logOpts.WaitGroup.Wait()
+ close(logChan)
+ }()
+ logrus.Debugf("Done reading logs for container %s, %d bytes", ctr.ID(), logSize)
+ if err := <-errChan; err != nil {
+ return err
+ }
+ }
+ if !streamAttach {
+ logrus.Debugf("Done streaming logs for container %s attach, exiting as attach streaming not requested", ctr.ID())
+ return nil
+ }
+
+ logrus.Debugf("Forwarding attach output for container %s", ctr.ID())
+
+ stdoutChan := make(chan error)
+ stdinChan := make(chan error)
+
+ // Handle STDOUT/STDERR
+ go func() {
+ var err error
+ if isTerminal {
+ // Hack: return immediately if attachStdout not set to
+ // emulate Docker.
+ // Basically, when terminal is set, STDERR goes nowhere.
+ // Everything does over STDOUT.
+ // Therefore, if not attaching STDOUT - we'll never copy
+ // anything from here.
+ logrus.Debugf("Performing terminal HTTP attach for container %s", ctr.ID())
+ if attachStdout {
+ err = httpAttachTerminalCopy(conn, httpBuf, ctr.ID())
+ }
+ } else {
+ logrus.Debugf("Performing non-terminal HTTP attach for container %s", ctr.ID())
+ err = httpAttachNonTerminalCopy(conn, httpBuf, ctr.ID(), attachStdin, attachStdout, attachStderr)
+ }
+ stdoutChan <- err
+ logrus.Debugf("STDOUT/ERR copy completed")
+ }()
+ // Next, STDIN. Avoid entirely if attachStdin unset.
+ if attachStdin {
+ go func() {
+ _, err := cutil.CopyDetachable(conn, httpBuf, detach)
+ logrus.Debugf("STDIN copy completed")
+ stdinChan <- err
+ }()
+ }
+
+ for {
+ select {
+ case err := <-stdoutChan:
+ if err != nil {
+ return err
+ }
+
+ return nil
+ case err := <-stdinChan:
+ if err != nil {
+ return err
+ }
+ // copy stdin is done, close it
+ if connErr := conn.CloseWrite(); connErr != nil {
+ logrus.Errorf("Unable to close conn: %v", connErr)
+ }
+ case <-cancel:
+ return nil
+ }
+ }
+}
+
+// isRetryable returns whether the error was caused by a blocked syscall or the
+// specified operation on a non blocking file descriptor wasn't ready for completion.
+func isRetryable(err error) bool {
+ var errno syscall.Errno
+ if errors.As(err, &errno) {
+ return errno == syscall.EINTR || errno == syscall.EAGAIN
+ }
+ return false
+}
+
+// openControlFile opens the terminal control file.
+func openControlFile(ctr *Container, parentDir string) (*os.File, error) {
+ controlPath := filepath.Join(parentDir, "ctl")
+ for i := 0; i < 600; i++ {
+ controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY|unix.O_NONBLOCK, 0)
+ if err == nil {
+ return controlFile, nil
+ }
+ if !isRetryable(err) {
+ return nil, fmt.Errorf("could not open ctl file for terminal resize for container %s: %w", ctr.ID(), err)
+ }
+ time.Sleep(time.Second / 10)
+ }
+ return nil, fmt.Errorf("timeout waiting for %q", controlPath)
+}
+
+// AttachResize resizes the terminal used by the given container.
+func (r *ConmonOCIRuntime) AttachResize(ctr *Container, newSize resize.TerminalSize) error {
+ controlFile, err := openControlFile(ctr, ctr.bundlePath())
+ if err != nil {
+ return err
+ }
+ defer controlFile.Close()
+
+ logrus.Debugf("Received a resize event for container %s: %+v", ctr.ID(), newSize)
+ if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil {
+ return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err)
+ }
+
+ return nil
+}
+
+// CheckpointContainer checkpoints the given container.
+func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error) {
+ // imagePath is used by CRIU to store the actual checkpoint files
+ imagePath := ctr.CheckpointPath()
+ if options.PreCheckPoint {
+ imagePath = ctr.PreCheckPointPath()
+ }
+ // workPath will be used to store dump.log and stats-dump
+ workPath := ctr.bundlePath()
+ logrus.Debugf("Writing checkpoint to %s", imagePath)
+ logrus.Debugf("Writing checkpoint logs to %s", workPath)
+ logrus.Debugf("Pre-dump the container %t", options.PreCheckPoint)
+ args := []string{}
+ args = append(args, r.runtimeFlags...)
+ args = append(args, "checkpoint")
+ args = append(args, "--image-path")
+ args = append(args, imagePath)
+ args = append(args, "--work-path")
+ args = append(args, workPath)
+ if options.KeepRunning {
+ args = append(args, "--leave-running")
+ }
+ if options.TCPEstablished {
+ args = append(args, "--tcp-established")
+ }
+ if options.FileLocks {
+ args = append(args, "--file-locks")
+ }
+ if !options.PreCheckPoint && options.KeepRunning {
+ args = append(args, "--leave-running")
+ }
+ if options.PreCheckPoint {
+ args = append(args, "--pre-dump")
+ }
+ if !options.PreCheckPoint && options.WithPrevious {
+ args = append(
+ args,
+ "--parent-path",
+ filepath.Join("..", preCheckpointDir),
+ )
+ }
+
+ args = append(args, ctr.ID())
+ logrus.Debugf("the args to checkpoint: %s %s", r.path, strings.Join(args, " "))
+
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return 0, err
+ }
+ env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
+ if path, ok := os.LookupEnv("PATH"); ok {
+ env = append(env, fmt.Sprintf("PATH=%s", path))
+ }
+
+ runtime.LockOSThread()
+ if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil {
+ return 0, err
+ }
+
+ runtimeCheckpointStarted := time.Now()
+ err = utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...)
+ // Ignore error returned from SetSocketLabel("") call,
+ // can't recover.
+ if labelErr := label.SetSocketLabel(""); labelErr == nil {
+ // Unlock the thread only if the process label could be restored
+ // successfully. Otherwise leave the thread locked and the Go runtime
+ // will terminate it once it returns to the threads pool.
+ runtime.UnlockOSThread()
+ } else {
+ logrus.Errorf("Unable to reset socket label: %q", labelErr)
+ }
+
+ runtimeCheckpointDuration := func() int64 {
+ if options.PrintStats {
+ return time.Since(runtimeCheckpointStarted).Microseconds()
+ }
+ return 0
+ }()
+
+ return runtimeCheckpointDuration, err
+}
+
+func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) {
+ if ctr.state.ConmonPID == 0 {
+ // If the container is running or paused, assume Conmon is
+ // running. We didn't record Conmon PID on some old versions, so
+ // that is likely what's going on...
+ // Unusual enough that we should print a warning message though.
+ if ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) {
+ logrus.Warnf("Conmon PID is not set, but container is running!")
+ return true, nil
+ }
+ // Container's not running, so conmon PID being unset is
+ // expected. Conmon is not running.
+ return false, nil
+ }
+
+ // We have a conmon PID. Ping it with signal 0.
+ if err := unix.Kill(ctr.state.ConmonPID, 0); err != nil {
+ if err == unix.ESRCH {
+ return false, nil
+ }
+ return false, fmt.Errorf("error pinging container %s conmon with signal 0: %w", ctr.ID(), err)
+ }
+ return true, nil
+}
+
+// SupportsCheckpoint checks if the OCI runtime supports checkpointing
+// containers.
+func (r *ConmonOCIRuntime) SupportsCheckpoint() bool {
+ return crutils.CRRuntimeSupportsCheckpointRestore(r.path)
+}
+
+// SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error
+// messages.
+func (r *ConmonOCIRuntime) SupportsJSONErrors() bool {
+ return r.supportsJSON
+}
+
+// SupportsNoCgroups checks if the OCI runtime supports running containers
+// without cgroups (the --cgroup-manager=disabled flag).
+func (r *ConmonOCIRuntime) SupportsNoCgroups() bool {
+ return r.supportsNoCgroups
+}
+
+// SupportsKVM checks if the OCI runtime supports running containers
+// without KVM separation
+func (r *ConmonOCIRuntime) SupportsKVM() bool {
+ return r.supportsKVM
+}
+
+// AttachSocketPath is the path to a single container's attach socket.
+func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) {
+ if ctr == nil {
+ return "", fmt.Errorf("must provide a valid container to get attach socket path: %w", define.ErrInvalidArg)
+ }
+
+ return filepath.Join(ctr.bundlePath(), "attach"), nil
+}
+
+// ExitFilePath is the path to a container's exit file.
+func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) {
+ if ctr == nil {
+ return "", fmt.Errorf("must provide a valid container to get exit file path: %w", define.ErrInvalidArg)
+ }
+ return filepath.Join(r.exitsDir, ctr.ID()), nil
+}
+
+// RuntimeInfo provides information on the runtime.
+func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) {
+ runtimePackage := packageVersion(r.path)
+ conmonPackage := packageVersion(r.conmonPath)
+ runtimeVersion, err := r.getOCIRuntimeVersion()
+ if err != nil {
+ return nil, nil, fmt.Errorf("error getting version of OCI runtime %s: %w", r.name, err)
+ }
+ conmonVersion, err := r.getConmonVersion()
+ if err != nil {
+ return nil, nil, fmt.Errorf("error getting conmon version: %w", err)
+ }
+
+ conmon := define.ConmonInfo{
+ Package: conmonPackage,
+ Path: r.conmonPath,
+ Version: conmonVersion,
+ }
+ ocirt := define.OCIRuntimeInfo{
+ Name: r.name,
+ Path: r.path,
+ Package: runtimePackage,
+ Version: runtimeVersion,
+ }
+ return &conmon, &ocirt, nil
+}
+
+// makeAccessible changes the path permission and each parent directory to have --x--x--x
+func makeAccessible(path string, uid, gid int) error {
+ for ; path != "/"; path = filepath.Dir(path) {
+ st, err := os.Stat(path)
+ if err != nil {
+ if os.IsNotExist(err) {
+ return nil
+ }
+ return err
+ }
+ if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid {
+ continue
+ }
+ if st.Mode()&0111 != 0111 {
+ if err := os.Chmod(path, st.Mode()|0111); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+// Wait for a container which has been sent a signal to stop
+func waitContainerStop(ctr *Container, timeout time.Duration) error {
+ return waitPidStop(ctr.state.PID, timeout)
+}
+
+// Wait for a given PID to stop
+func waitPidStop(pid int, timeout time.Duration) error {
+ done := make(chan struct{})
+ chControl := make(chan struct{})
+ go func() {
+ for {
+ select {
+ case <-chControl:
+ return
+ default:
+ if err := unix.Kill(pid, 0); err != nil {
+ if err == unix.ESRCH {
+ close(done)
+ return
+ }
+ logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err)
+ }
+ time.Sleep(100 * time.Millisecond)
+ }
+ }
+ }()
+ select {
+ case <-done:
+ return nil
+ case <-time.After(timeout):
+ close(chControl)
+ return fmt.Errorf("given PIDs did not die within timeout")
+ }
+}
+
+func (r *ConmonOCIRuntime) getLogTag(ctr *Container) (string, error) {
+ logTag := ctr.LogTag()
+ if logTag == "" {
+ return "", nil
+ }
+ data, err := ctr.inspectLocked(false)
+ if err != nil {
+ // FIXME: this error should probably be returned
+ return "", nil //nolint: nilerr
+ }
+ tmpl, err := template.New("container").Parse(logTag)
+ if err != nil {
+ return "", fmt.Errorf("template parsing error %s: %w", logTag, err)
+ }
+ var b bytes.Buffer
+ err = tmpl.Execute(&b, data)
+ if err != nil {
+ return "", err
+ }
+ return b.String(), nil
+}
+
+// createOCIContainer generates this container's main conmon instance and prepares it for starting
+func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
+ var stderrBuf bytes.Buffer
+
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return 0, err
+ }
+
+ parentSyncPipe, childSyncPipe, err := newPipe()
+ if err != nil {
+ return 0, fmt.Errorf("error creating socket pair: %w", err)
+ }
+ defer errorhandling.CloseQuiet(parentSyncPipe)
+
+ childStartPipe, parentStartPipe, err := newPipe()
+ if err != nil {
+ return 0, fmt.Errorf("error creating socket pair for start pipe: %w", err)
+ }
+
+ defer errorhandling.CloseQuiet(parentStartPipe)
+
+ var ociLog string
+ if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON {
+ ociLog = filepath.Join(ctr.state.RunDir, "oci-log")
+ }
+
+ logTag, err := r.getLogTag(ctr)
+ if err != nil {
+ return 0, err
+ }
+
+ if ctr.config.CgroupsMode == cgroupSplit {
+ if err := utils.MoveUnderCgroupSubtree("runtime"); err != nil {
+ return 0, err
+ }
+ }
+
+ pidfile := ctr.config.PidFile
+ if pidfile == "" {
+ pidfile = filepath.Join(ctr.state.RunDir, "pidfile")
+ }
+
+ args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag)
+
+ if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.config.SdNotifySocket != "" {
+ args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.config.SdNotifySocket))
+ }
+
+ if ctr.config.Spec.Process.Terminal {
+ args = append(args, "-t")
+ } else if ctr.config.Stdin {
+ args = append(args, "-i")
+ }
+
+ if ctr.config.Timeout > 0 {
+ args = append(args, fmt.Sprintf("--timeout=%d", ctr.config.Timeout))
+ }
+
+ if !r.enableKeyring {
+ args = append(args, "--no-new-keyring")
+ }
+ if ctr.config.ConmonPidFile != "" {
+ args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile)
+ }
+
+ if r.noPivot {
+ args = append(args, "--no-pivot")
+ }
+
+ exitCommand, err := specgenutil.CreateExitCommandArgs(ctr.runtime.storageConfig, ctr.runtime.config, logrus.IsLevelEnabled(logrus.DebugLevel), ctr.AutoRemove(), false)
+ if err != nil {
+ return 0, err
+ }
+ exitCommand = append(exitCommand, ctr.config.ID)
+
+ args = append(args, "--exit-command", exitCommand[0])
+ for _, arg := range exitCommand[1:] {
+ args = append(args, []string{"--exit-command-arg", arg}...)
+ }
+
+ // Pass down the LISTEN_* environment (see #10443).
+ preserveFDs := ctr.config.PreserveFDs
+ if val := os.Getenv("LISTEN_FDS"); val != "" {
+ if ctr.config.PreserveFDs > 0 {
+ logrus.Warnf("Ignoring LISTEN_FDS to preserve custom user-specified FDs")
+ } else {
+ fds, err := strconv.Atoi(val)
+ if err != nil {
+ return 0, fmt.Errorf("converting LISTEN_FDS=%s: %w", val, err)
+ }
+ preserveFDs = uint(fds)
+ }
+ }
+
+ if preserveFDs > 0 {
+ args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", preserveFDs))...)
+ }
+
+ if restoreOptions != nil {
+ args = append(args, "--restore", ctr.CheckpointPath())
+ if restoreOptions.TCPEstablished {
+ args = append(args, "--runtime-opt", "--tcp-established")
+ }
+ if restoreOptions.FileLocks {
+ args = append(args, "--runtime-opt", "--file-locks")
+ }
+ if restoreOptions.Pod != "" {
+ mountLabel := ctr.config.MountLabel
+ processLabel := ctr.config.ProcessLabel
+ if mountLabel != "" {
+ args = append(
+ args,
+ "--runtime-opt",
+ fmt.Sprintf(
+ "--lsm-mount-context=%s",
+ mountLabel,
+ ),
+ )
+ }
+ if processLabel != "" {
+ args = append(
+ args,
+ "--runtime-opt",
+ fmt.Sprintf(
+ "--lsm-profile=selinux:%s",
+ processLabel,
+ ),
+ )
+ }
+ }
+ }
+
+ logrus.WithFields(logrus.Fields{
+ "args": args,
+ }).Debugf("running conmon: %s", r.conmonPath)
+
+ cmd := exec.Command(r.conmonPath, args...)
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ Setpgid: true,
+ }
+ // TODO this is probably a really bad idea for some uses
+ // Make this configurable
+ cmd.Stdin = os.Stdin
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ if ctr.config.Spec.Process.Terminal {
+ cmd.Stderr = &stderrBuf
+ }
+
+ // 0, 1 and 2 are stdin, stdout and stderr
+ conmonEnv := r.configureConmonEnv(runtimeDir)
+
+ var filesToClose []*os.File
+ if preserveFDs > 0 {
+ for fd := 3; fd < int(3+preserveFDs); fd++ {
+ f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd))
+ filesToClose = append(filesToClose, f)
+ cmd.ExtraFiles = append(cmd.ExtraFiles, f)
+ }
+ }
+
+ cmd.Env = r.conmonEnv
+ // we don't want to step on users fds they asked to preserve
+ // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3
+ cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4))
+ cmd.Env = append(cmd.Env, conmonEnv...)
+ cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe)
+
+ if r.reservePorts && !rootless.IsRootless() && !ctr.config.NetMode.IsSlirp4netns() {
+ ports, err := bindPorts(ctr.convertPortMappings())
+ if err != nil {
+ return 0, err
+ }
+ filesToClose = append(filesToClose, ports...)
+
+ // Leak the port we bound in the conmon process. These fd's won't be used
+ // by the container and conmon will keep the ports busy so that another
+ // process cannot use them.
+ cmd.ExtraFiles = append(cmd.ExtraFiles, ports...)
+ }
+
+ if ctr.config.NetMode.IsSlirp4netns() || rootless.IsRootless() {
+ if ctr.config.PostConfigureNetNS {
+ havePortMapping := len(ctr.config.PortMappings) > 0
+ if havePortMapping {
+ ctr.rootlessPortSyncR, ctr.rootlessPortSyncW, err = os.Pipe()
+ if err != nil {
+ return 0, fmt.Errorf("failed to create rootless port sync pipe: %w", err)
+ }
+ }
+ ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe()
+ if err != nil {
+ return 0, fmt.Errorf("failed to create rootless network sync pipe: %w", err)
+ }
+ } else {
+ if ctr.rootlessSlirpSyncR != nil {
+ defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR)
+ }
+ if ctr.rootlessSlirpSyncW != nil {
+ defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW)
+ }
+ }
+ // Leak one end in conmon, the other one will be leaked into slirp4netns
+ cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW)
+
+ if ctr.rootlessPortSyncW != nil {
+ defer errorhandling.CloseQuiet(ctr.rootlessPortSyncW)
+ // Leak one end in conmon, the other one will be leaked into rootlessport
+ cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessPortSyncW)
+ }
+ }
+ var runtimeRestoreStarted time.Time
+ if restoreOptions != nil {
+ runtimeRestoreStarted = time.Now()
+ }
+ err = startCommand(cmd, ctr)
+
+ // regardless of whether we errored or not, we no longer need the children pipes
+ childSyncPipe.Close()
+ childStartPipe.Close()
+ if err != nil {
+ return 0, err
+ }
+ if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe); err != nil {
+ return 0, err
+ }
+ /* Wait for initial setup and fork, and reap child */
+ err = cmd.Wait()
+ if err != nil {
+ return 0, err
+ }
+
+ pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog)
+ if err != nil {
+ if err2 := r.DeleteContainer(ctr); err2 != nil {
+ logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID())
+ }
+ return 0, err
+ }
+ ctr.state.PID = pid
+
+ conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile)
+ if err != nil {
+ logrus.Warnf("Error reading conmon pid file for container %s: %v", ctr.ID(), err)
+ } else if conmonPID > 0 {
+ // conmon not having a pid file is a valid state, so don't set it if we don't have it
+ logrus.Infof("Got Conmon PID as %d", conmonPID)
+ ctr.state.ConmonPID = conmonPID
+ }
+
+ runtimeRestoreDuration := func() int64 {
+ if restoreOptions != nil && restoreOptions.PrintStats {
+ return time.Since(runtimeRestoreStarted).Microseconds()
+ }
+ return 0
+ }()
+
+ // These fds were passed down to the runtime. Close them
+ // and not interfere
+ for _, f := range filesToClose {
+ errorhandling.CloseQuiet(f)
+ }
+
+ return runtimeRestoreDuration, nil
+}
+
+// configureConmonEnv gets the environment values to add to conmon's exec struct
+// TODO this may want to be less hardcoded/more configurable in the future
+func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) []string {
+ var env []string
+ for _, e := range os.Environ() {
+ if strings.HasPrefix(e, "LC_") {
+ env = append(env, e)
+ }
+ }
+ conf, ok := os.LookupEnv("CONTAINERS_CONF")
+ if ok {
+ env = append(env, fmt.Sprintf("CONTAINERS_CONF=%s", conf))
+ }
+ env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir))
+ env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED")))
+ env = append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID")))
+ home := homedir.Get()
+ if home != "" {
+ env = append(env, fmt.Sprintf("HOME=%s", home))
+ }
+
+ return env
+}
+
+// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI
+func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath, logDriver, logTag string) []string {
+ // set the conmon API version to be able to use the correct sync struct keys
+ args := []string{
+ "--api-version", "1",
+ "-c", ctr.ID(),
+ "-u", cuuid,
+ "-r", r.path,
+ "-b", bundlePath,
+ "-p", pidPath,
+ "-n", ctr.Name(),
+ "--exit-dir", exitDir,
+ "--full-attach",
+ }
+ if len(r.runtimeFlags) > 0 {
+ rFlags := []string{}
+ for _, arg := range r.runtimeFlags {
+ rFlags = append(rFlags, "--runtime-arg", arg)
+ }
+ args = append(args, rFlags...)
+ }
+
+ if ctr.CgroupManager() == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit {
+ args = append(args, "-s")
+ }
+
+ var logDriverArg string
+ switch logDriver {
+ case define.JournaldLogging:
+ logDriverArg = define.JournaldLogging
+ case define.NoLogging:
+ logDriverArg = define.NoLogging
+ case define.PassthroughLogging:
+ logDriverArg = define.PassthroughLogging
+ //lint:ignore ST1015 the default case has to be here
+ default: //nolint:stylecheck,gocritic
+ // No case here should happen except JSONLogging, but keep this here in case the options are extended
+ logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver())
+ fallthrough
+ case "":
+ // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod
+ // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough
+ fallthrough
+ case define.JSONLogging:
+ fallthrough
+ case define.KubernetesLogging:
+ logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath)
+ }
+
+ args = append(args, "-l", logDriverArg)
+ logLevel := logrus.GetLevel()
+ args = append(args, "--log-level", logLevel.String())
+
+ if logLevel == logrus.DebugLevel {
+ logrus.Debugf("%s messages will be logged to syslog", r.conmonPath)
+ args = append(args, "--syslog")
+ }
+
+ size := r.logSizeMax
+ if ctr.config.LogSize > 0 {
+ size = ctr.config.LogSize
+ }
+ if size > 0 {
+ args = append(args, "--log-size-max", fmt.Sprintf("%v", size))
+ }
+
+ if ociLogPath != "" {
+ args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath))
+ }
+ if logTag != "" {
+ args = append(args, "--log-tag", logTag)
+ }
+ if ctr.config.NoCgroups {
+ logrus.Debugf("Running with no Cgroups")
+ args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled")
+ }
+ return args
+}
+
+func startCommand(cmd *exec.Cmd, ctr *Container) error {
+ // Make sure to unset the NOTIFY_SOCKET and reset it afterwards if needed.
+ switch ctr.config.SdNotifyMode {
+ case define.SdNotifyModeContainer, define.SdNotifyModeIgnore:
+ if prev := os.Getenv("NOTIFY_SOCKET"); prev != "" {
+ if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil {
+ logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err)
+ }
+ defer func() {
+ if err := os.Setenv("NOTIFY_SOCKET", prev); err != nil {
+ logrus.Errorf("Resetting NOTIFY_SOCKET=%s", prev)
+ }
+ }()
+ }
+ }
+
+ return cmd.Start()
+}
+
+// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup
+// it then signals for conmon to start by sending nonce data down the start fd
+func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error {
+ mustCreateCgroup := true
+
+ if ctr.config.NoCgroups {
+ mustCreateCgroup = false
+ }
+
+ // If cgroup creation is disabled - just signal.
+ switch ctr.config.CgroupsMode {
+ case "disabled", "no-conmon", cgroupSplit:
+ mustCreateCgroup = false
+ }
+
+ // $INVOCATION_ID is set by systemd when running as a service.
+ if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" {
+ mustCreateCgroup = false
+ }
+
+ if mustCreateCgroup {
+ // Usually rootless users are not allowed to configure cgroupfs.
+ // There are cases though, where it is allowed, e.g. if the cgroup
+ // is manually configured and chowned). Avoid detecting all
+ // such cases and simply use a lower log level.
+ logLevel := logrus.WarnLevel
+ if rootless.IsRootless() {
+ logLevel = logrus.InfoLevel
+ }
+ // TODO: This should be a switch - we are not guaranteed that
+ // there are only 2 valid cgroup managers
+ cgroupParent := ctr.CgroupParent()
+ cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon")
+ Resource := ctr.Spec().Linux.Resources
+ cgroupResources, err := GetLimits(Resource)
+ if err != nil {
+ logrus.StandardLogger().Log(logLevel, "Could not get ctr resources")
+ }
+ if ctr.CgroupManager() == config.SystemdCgroupsManager {
+ unitName := createUnitName("libpod-conmon", ctr.ID())
+ realCgroupParent := cgroupParent
+ splitParent := strings.Split(cgroupParent, "/")
+ if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 {
+ realCgroupParent = splitParent[len(splitParent)-1]
+ }
+
+ logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName)
+ if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil {
+ logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err)
+ }
+ } else {
+ control, err := cgroups.New(cgroupPath, &cgroupResources)
+ if err != nil {
+ logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
+ } else if err := control.AddPid(cmd.Process.Pid); err != nil {
+ // we need to remove this defer and delete the cgroup once conmon exits
+ // maybe need a conmon monitor?
+ logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
+ }
+ }
+ }
+
+ /* We set the cgroup, now the child can start creating children */
+ if err := writeConmonPipeData(startFd); err != nil {
+ return err
+ }
+ return nil
+}
+
+// newPipe creates a unix socket pair for communication.
+// Returns two files - first is parent, second is child.
+func newPipe() (*os.File, *os.File, error) {
+ fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
+ if err != nil {
+ return nil, nil, err
+ }
+ return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
+}
+
+// readConmonPidFile attempts to read conmon's pid from its pid file
+func readConmonPidFile(pidFile string) (int, error) {
+ // Let's try reading the Conmon pid at the same time.
+ if pidFile != "" {
+ contents, err := ioutil.ReadFile(pidFile)
+ if err != nil {
+ return -1, err
+ }
+ // Convert it to an int
+ conmonPID, err := strconv.Atoi(string(contents))
+ if err != nil {
+ return -1, err
+ }
+ return conmonPID, nil
+ }
+ return 0, nil
+}
+
+// readConmonPipeData attempts to read a syncInfo struct from the pipe
+func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) {
+ // syncInfo is used to return data from monitor process to daemon
+ type syncInfo struct {
+ Data int `json:"data"`
+ Message string `json:"message,omitempty"`
+ }
+
+ // Wait to get container pid from conmon
+ type syncStruct struct {
+ si *syncInfo
+ err error
+ }
+ ch := make(chan syncStruct)
+ go func() {
+ var si *syncInfo
+ rdr := bufio.NewReader(pipe)
+ b, err := rdr.ReadBytes('\n')
+ // ignore EOF here, error is returned even when data was read
+ // if it is no valid json unmarshal will fail below
+ if err != nil && !errors.Is(err, io.EOF) {
+ ch <- syncStruct{err: err}
+ }
+ if err := json.Unmarshal(b, &si); err != nil {
+ ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)}
+ return
+ }
+ ch <- syncStruct{si: si}
+ }()
+
+ data := -1 //nolint: wastedassign
+ select {
+ case ss := <-ch:
+ if ss.err != nil {
+ if ociLog != "" {
+ ociLogData, err := ioutil.ReadFile(ociLog)
+ if err == nil {
+ var ociErr ociError
+ if err := json.Unmarshal(ociLogData, &ociErr); err == nil {
+ return -1, getOCIRuntimeError(runtimeName, ociErr.Msg)
+ }
+ }
+ }
+ return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err)
+ }
+ logrus.Debugf("Received: %d", ss.si.Data)
+ if ss.si.Data < 0 {
+ if ociLog != "" {
+ ociLogData, err := ioutil.ReadFile(ociLog)
+ if err == nil {
+ var ociErr ociError
+ if err := json.Unmarshal(ociLogData, &ociErr); err == nil {
+ return ss.si.Data, getOCIRuntimeError(runtimeName, ociErr.Msg)
+ }
+ }
+ }
+ // If we failed to parse the JSON errors, then print the output as it is
+ if ss.si.Message != "" {
+ return ss.si.Data, getOCIRuntimeError(runtimeName, ss.si.Message)
+ }
+ return ss.si.Data, fmt.Errorf("container create failed: %w", define.ErrInternal)
+ }
+ data = ss.si.Data
+ case <-time.After(define.ContainerCreateTimeout):
+ return -1, fmt.Errorf("container creation timeout: %w", define.ErrInternal)
+ }
+ return data, nil
+}
+
+// writeConmonPipeData writes nonce data to a pipe
+func writeConmonPipeData(pipe *os.File) error {
+ someData := []byte{0}
+ _, err := pipe.Write(someData)
+ return err
+}
+
+// formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon
+func formatRuntimeOpts(opts ...string) []string {
+ args := make([]string, 0, len(opts)*2)
+ for _, o := range opts {
+ args = append(args, "--runtime-opt", o)
+ }
+ return args
+}
+
+// getConmonVersion returns a string representation of the conmon version.
+func (r *ConmonOCIRuntime) getConmonVersion() (string, error) {
+ output, err := utils.ExecCmd(r.conmonPath, "--version")
+ if err != nil {
+ return "", err
+ }
+ return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil
+}
+
+// getOCIRuntimeVersion returns a string representation of the OCI runtime's
+// version.
+func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) {
+ output, err := utils.ExecCmd(r.path, "--version")
+ if err != nil {
+ return "", err
+ }
+ return strings.TrimSuffix(output, "\n"), nil
+}
+
+// Copy data from container to HTTP connection, for terminal attach.
+// Container is the container's attach socket connection, http is a buffer for
+// the HTTP connection. cid is the ID of the container the attach session is
+// running for (used solely for error messages).
+func httpAttachTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string) error {
+ buf := make([]byte, bufferSize)
+ for {
+ numR, err := container.Read(buf)
+ logrus.Debugf("Read fd(%d) %d/%d bytes for container %s", int(buf[0]), numR, len(buf), cid)
+
+ if numR > 0 {
+ switch buf[0] {
+ case AttachPipeStdout:
+ // Do nothing
+ default:
+ logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR)
+ continue
+ }
+
+ numW, err2 := http.Write(buf[1:numR])
+ if err2 != nil {
+ if err != nil {
+ logrus.Errorf("Reading container %s STDOUT: %v", cid, err)
+ }
+ return err2
+ } else if numW+1 != numR {
+ return io.ErrShortWrite
+ }
+ // We need to force the buffer to write immediately, so
+ // there isn't a delay on the terminal side.
+ if err2 := http.Flush(); err2 != nil {
+ if err != nil {
+ logrus.Errorf("Reading container %s STDOUT: %v", cid, err)
+ }
+ return err2
+ }
+ }
+ if err != nil {
+ if err == io.EOF {
+ return nil
+ }
+ return err
+ }
+ }
+}
+
+// Copy data from a container to an HTTP connection, for non-terminal attach.
+// Appends a header to multiplex input.
+func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string, stdin, stdout, stderr bool) error {
+ buf := make([]byte, bufferSize)
+ for {
+ numR, err := container.Read(buf)
+ if numR > 0 {
+ var headerBuf []byte
+
+ // Subtract 1 because we strip the first byte (used for
+ // multiplexing by Conmon).
+ headerLen := uint32(numR - 1)
+ // Practically speaking, we could make this buf[0] - 1,
+ // but we need to validate it anyway.
+ switch buf[0] {
+ case AttachPipeStdin:
+ headerBuf = makeHTTPAttachHeader(0, headerLen)
+ if !stdin {
+ continue
+ }
+ case AttachPipeStdout:
+ if !stdout {
+ continue
+ }
+ headerBuf = makeHTTPAttachHeader(1, headerLen)
+ case AttachPipeStderr:
+ if !stderr {
+ continue
+ }
+ headerBuf = makeHTTPAttachHeader(2, headerLen)
+ default:
+ logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR)
+ continue
+ }
+
+ numH, err2 := http.Write(headerBuf)
+ if err2 != nil {
+ if err != nil {
+ logrus.Errorf("Reading container %s standard streams: %v", cid, err)
+ }
+
+ return err2
+ }
+ // Hardcoding header length is pretty gross, but
+ // fast. Should be safe, as this is a fixed part
+ // of the protocol.
+ if numH != 8 {
+ if err != nil {
+ logrus.Errorf("Reading container %s standard streams: %v", cid, err)
+ }
+
+ return io.ErrShortWrite
+ }
+
+ numW, err2 := http.Write(buf[1:numR])
+ if err2 != nil {
+ if err != nil {
+ logrus.Errorf("Reading container %s standard streams: %v", cid, err)
+ }
+
+ return err2
+ } else if numW+1 != numR {
+ if err != nil {
+ logrus.Errorf("Reading container %s standard streams: %v", cid, err)
+ }
+
+ return io.ErrShortWrite
+ }
+ // We need to force the buffer to write immediately, so
+ // there isn't a delay on the terminal side.
+ if err2 := http.Flush(); err2 != nil {
+ if err != nil {
+ logrus.Errorf("Reading container %s STDOUT: %v", cid, err)
+ }
+ return err2
+ }
+ }
+ if err != nil {
+ if err == io.EOF {
+ return nil
+ }
+
+ return err
+ }
+ }
+}
+
+// GetLimits converts spec resource limits to cgroup consumable limits
+func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) {
+ if resource == nil {
+ resource = &spec.LinuxResources{}
+ }
+ final := &runcconfig.Resources{}
+ devs := []*devices.Rule{}
+
+ // Devices
+ for _, entry := range resource.Devices {
+ if entry.Major == nil || entry.Minor == nil {
+ continue
+ }
+ runeType := 'a'
+ switch entry.Type {
+ case "b":
+ runeType = 'b'
+ case "c":
+ runeType = 'c'
+ }
+
+ devs = append(devs, &devices.Rule{
+ Type: devices.Type(runeType),
+ Major: *entry.Major,
+ Minor: *entry.Minor,
+ Permissions: devices.Permissions(entry.Access),
+ Allow: entry.Allow,
+ })
+ }
+ final.Devices = devs
+
+ // HugepageLimits
+ pageLimits := []*runcconfig.HugepageLimit{}
+ for _, entry := range resource.HugepageLimits {
+ pageLimits = append(pageLimits, &runcconfig.HugepageLimit{
+ Pagesize: entry.Pagesize,
+ Limit: entry.Limit,
+ })
+ }
+ final.HugetlbLimit = pageLimits
+
+ // Networking
+ netPriorities := []*runcconfig.IfPrioMap{}
+ if resource.Network != nil {
+ for _, entry := range resource.Network.Priorities {
+ netPriorities = append(netPriorities, &runcconfig.IfPrioMap{
+ Interface: entry.Name,
+ Priority: int64(entry.Priority),
+ })
+ }
+ }
+ final.NetPrioIfpriomap = netPriorities
+ rdma := make(map[string]runcconfig.LinuxRdma)
+ for name, entry := range resource.Rdma {
+ rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects}
+ }
+ final.Rdma = rdma
+
+ // Memory
+ if resource.Memory != nil {
+ if resource.Memory.Limit != nil {
+ final.Memory = *resource.Memory.Limit
+ }
+ if resource.Memory.Reservation != nil {
+ final.MemoryReservation = *resource.Memory.Reservation
+ }
+ if resource.Memory.Swap != nil {
+ final.MemorySwap = *resource.Memory.Swap
+ }
+ if resource.Memory.Swappiness != nil {
+ final.MemorySwappiness = resource.Memory.Swappiness
+ }
+ }
+
+ // CPU
+ if resource.CPU != nil {
+ if resource.CPU.Period != nil {
+ final.CpuPeriod = *resource.CPU.Period
+ }
+ if resource.CPU.Quota != nil {
+ final.CpuQuota = *resource.CPU.Quota
+ }
+ if resource.CPU.RealtimePeriod != nil {
+ final.CpuRtPeriod = *resource.CPU.RealtimePeriod
+ }
+ if resource.CPU.RealtimeRuntime != nil {
+ final.CpuRtRuntime = *resource.CPU.RealtimeRuntime
+ }
+ if resource.CPU.Shares != nil {
+ final.CpuShares = *resource.CPU.Shares
+ }
+ final.CpusetCpus = resource.CPU.Cpus
+ final.CpusetMems = resource.CPU.Mems
+ }
+
+ // BlkIO
+ if resource.BlockIO != nil {
+ if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 {
+ for _, entry := range resource.BlockIO.ThrottleReadBpsDevice {
+ throttle := &runcconfig.ThrottleDevice{}
+ dev := &runcconfig.BlockIODevice{
+ Major: entry.Major,
+ Minor: entry.Minor,
+ }
+ throttle.BlockIODevice = *dev
+ throttle.Rate = entry.Rate
+ final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle)
+ }
+ }
+ if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 {
+ for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice {
+ throttle := &runcconfig.ThrottleDevice{}
+ dev := &runcconfig.BlockIODevice{
+ Major: entry.Major,
+ Minor: entry.Minor,
+ }
+ throttle.BlockIODevice = *dev
+ throttle.Rate = entry.Rate
+ final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle)
+ }
+ }
+ if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 {
+ for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice {
+ throttle := &runcconfig.ThrottleDevice{}
+ dev := &runcconfig.BlockIODevice{
+ Major: entry.Major,
+ Minor: entry.Minor,
+ }
+ throttle.BlockIODevice = *dev
+ throttle.Rate = entry.Rate
+ final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle)
+ }
+ }
+ if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 {
+ for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice {
+ throttle := &runcconfig.ThrottleDevice{}
+ dev := &runcconfig.BlockIODevice{
+ Major: entry.Major,
+ Minor: entry.Minor,
+ }
+ throttle.BlockIODevice = *dev
+ throttle.Rate = entry.Rate
+ final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle)
+ }
+ }
+ if resource.BlockIO.LeafWeight != nil {
+ final.BlkioLeafWeight = *resource.BlockIO.LeafWeight
+ }
+ if resource.BlockIO.Weight != nil {
+ final.BlkioWeight = *resource.BlockIO.Weight
+ }
+ if len(resource.BlockIO.WeightDevice) > 0 {
+ for _, entry := range resource.BlockIO.WeightDevice {
+ weight := &runcconfig.WeightDevice{}
+ dev := &runcconfig.BlockIODevice{
+ Major: entry.Major,
+ Minor: entry.Minor,
+ }
+ if entry.Weight != nil {
+ weight.Weight = *entry.Weight
+ }
+ if entry.LeafWeight != nil {
+ weight.LeafWeight = *entry.LeafWeight
+ }
+ weight.BlockIODevice = *dev
+ final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight)
+ }
+ }
+ }
+
+ // Pids
+ if resource.Pids != nil {
+ final.PidsLimit = resource.Pids.Limit
+ }
+
+ // Networking
+ if resource.Network != nil {
+ if resource.Network.ClassID != nil {
+ final.NetClsClassid = *resource.Network.ClassID
+ }
+ }
+
+ // Unified state
+ final.Unified = resource.Unified
+
+ return *final, nil
+}