diff options
author | Doug Rabson <dfr@rabson.org> | 2022-08-17 09:35:19 +0100 |
---|---|---|
committer | Doug Rabson <dfr@rabson.org> | 2022-08-18 08:05:42 +0100 |
commit | bebf55c0f22d6723a27cd39561c0577aa557c5e1 (patch) | |
tree | c5a3cbbab83707dbd1e34f72ff22dc05010ce531 /libpod/oci_conmon_common.go | |
parent | 1f0c3d52628e7c5b22ee500194155bdd20ad271f (diff) | |
download | podman-bebf55c0f22d6723a27cd39561c0577aa557c5e1.tar.gz podman-bebf55c0f22d6723a27cd39561c0577aa557c5e1.tar.bz2 podman-bebf55c0f22d6723a27cd39561c0577aa557c5e1.zip |
libpod: Move oci_conmon_linux.go to oci_conmon_common.go
[NO NEW TESTS NEEDED]
Signed-off-by: Doug Rabson <dfr@rabson.org>
Diffstat (limited to 'libpod/oci_conmon_common.go')
-rw-r--r-- | libpod/oci_conmon_common.go | 1929 |
1 files changed, 1929 insertions, 0 deletions
diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go new file mode 100644 index 000000000..1b654ed33 --- /dev/null +++ b/libpod/oci_conmon_common.go @@ -0,0 +1,1929 @@ +//go:build linux +// +build linux + +package libpod + +import ( + "bufio" + "bytes" + "context" + "errors" + "fmt" + "io" + "io/ioutil" + "net" + "net/http" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + "sync" + "syscall" + "text/template" + "time" + + runcconfig "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + + "github.com/containers/common/pkg/cgroups" + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/resize" + cutil "github.com/containers/common/pkg/util" + conmonConfig "github.com/containers/conmon/runner/config" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/libpod/logs" + "github.com/containers/podman/v4/pkg/checkpoint/crutils" + "github.com/containers/podman/v4/pkg/errorhandling" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/podman/v4/pkg/specgenutil" + "github.com/containers/podman/v4/pkg/util" + "github.com/containers/podman/v4/utils" + "github.com/containers/storage/pkg/homedir" + pmount "github.com/containers/storage/pkg/mount" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + // This is Conmon's STDIO_BUF_SIZE. I don't believe we have access to it + // directly from the Go code, so const it here + // Important: The conmon attach socket uses an extra byte at the beginning of each + // message to specify the STREAM so we have to increase the buffer size by one + bufferSize = conmonConfig.BufSize + 1 +) + +// ConmonOCIRuntime is an OCI runtime managed by Conmon. +// TODO: Make all calls to OCI runtime have a timeout. +type ConmonOCIRuntime struct { + name string + path string + conmonPath string + conmonEnv []string + tmpDir string + exitsDir string + logSizeMax int64 + noPivot bool + reservePorts bool + runtimeFlags []string + supportsJSON bool + supportsKVM bool + supportsNoCgroups bool + enableKeyring bool +} + +// Make a new Conmon-based OCI runtime with the given options. +// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or +// any runtime with a runc-compatible CLI. +// The first path that points to a valid executable will be used. +// Deliberately private. Someone should not be able to construct this outside of +// libpod. +func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeFlags []string, runtimeCfg *config.Config) (OCIRuntime, error) { + if name == "" { + return nil, fmt.Errorf("the OCI runtime must be provided a non-empty name: %w", define.ErrInvalidArg) + } + + // Make lookup tables for runtime support + supportsJSON := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsJSON)) + supportsNoCgroups := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsNoCgroups)) + supportsKVM := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsKVM)) + for _, r := range runtimeCfg.Engine.RuntimeSupportsJSON { + supportsJSON[r] = true + } + for _, r := range runtimeCfg.Engine.RuntimeSupportsNoCgroups { + supportsNoCgroups[r] = true + } + for _, r := range runtimeCfg.Engine.RuntimeSupportsKVM { + supportsKVM[r] = true + } + + runtime := new(ConmonOCIRuntime) + runtime.name = name + runtime.conmonPath = conmonPath + runtime.runtimeFlags = runtimeFlags + + runtime.conmonEnv = runtimeCfg.Engine.ConmonEnvVars + runtime.tmpDir = runtimeCfg.Engine.TmpDir + runtime.logSizeMax = runtimeCfg.Containers.LogSizeMax + runtime.noPivot = runtimeCfg.Engine.NoPivotRoot + runtime.reservePorts = runtimeCfg.Engine.EnablePortReservation + runtime.enableKeyring = runtimeCfg.Containers.EnableKeyring + + // TODO: probe OCI runtime for feature and enable automatically if + // available. + + base := filepath.Base(name) + runtime.supportsJSON = supportsJSON[base] + runtime.supportsNoCgroups = supportsNoCgroups[base] + runtime.supportsKVM = supportsKVM[base] + + foundPath := false + for _, path := range paths { + stat, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + continue + } + return nil, fmt.Errorf("cannot stat OCI runtime %s path: %w", name, err) + } + if !stat.Mode().IsRegular() { + continue + } + foundPath = true + logrus.Tracef("found runtime %q", path) + runtime.path = path + break + } + + // Search the $PATH as last fallback + if !foundPath { + if foundRuntime, err := exec.LookPath(name); err == nil { + foundPath = true + runtime.path = foundRuntime + logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime) + } + } + + if !foundPath { + return nil, fmt.Errorf("no valid executable found for OCI runtime %s: %w", name, define.ErrInvalidArg) + } + + runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits") + + // Create the exit files and attach sockets directories + if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil { + // The directory is allowed to exist + if !os.IsExist(err) { + return nil, fmt.Errorf("error creating OCI runtime exit files directory: %w", err) + } + } + return runtime, nil +} + +// Name returns the name of the runtime being wrapped by Conmon. +func (r *ConmonOCIRuntime) Name() string { + return r.name +} + +// Path returns the path of the OCI runtime being wrapped by Conmon. +func (r *ConmonOCIRuntime) Path() string { + return r.path +} + +// hasCurrentUserMapped checks whether the current user is mapped inside the container user namespace +func hasCurrentUserMapped(ctr *Container) bool { + if len(ctr.config.IDMappings.UIDMap) == 0 && len(ctr.config.IDMappings.GIDMap) == 0 { + return true + } + uid := os.Geteuid() + for _, m := range ctr.config.IDMappings.UIDMap { + if uid >= m.HostID && uid < m.HostID+m.Size { + return true + } + } + return false +} + +// CreateContainer creates a container. +func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + // always make the run dir accessible to the current user so that the PID files can be read without + // being in the rootless user namespace. + if err := makeAccessible(ctr.state.RunDir, 0, 0); err != nil { + return 0, err + } + if !hasCurrentUserMapped(ctr) { + for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.Engine.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.Engine.VolumePath} { + if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { + return 0, err + } + } + + // if we are running a non privileged container, be sure to umount some kernel paths so they are not + // bind mounted inside the container at all. + if !ctr.config.Privileged && !rootless.IsRootless() { + type result struct { + restoreDuration int64 + err error + } + ch := make(chan result) + go func() { + runtime.LockOSThread() + restoreDuration, err := func() (int64, error) { + fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) + if err != nil { + return 0, err + } + defer errorhandling.CloseQuiet(fd) + + // create a new mountns on the current thread + if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { + return 0, err + } + defer func() { + if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { + logrus.Errorf("Unable to clone new namespace: %q", err) + } + }() + + // don't spread our mounts around. We are setting only /sys to be slave + // so that the cleanup process is still able to umount the storage and the + // changes are propagated to the host. + err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") + if err != nil { + return 0, fmt.Errorf("cannot make /sys slave: %w", err) + } + + mounts, err := pmount.GetMounts() + if err != nil { + return 0, err + } + for _, m := range mounts { + if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { + continue + } + err = unix.Unmount(m.Mountpoint, 0) + if err != nil && !os.IsNotExist(err) { + return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) + } + } + return r.createOCIContainer(ctr, restoreOptions) + }() + ch <- result{ + restoreDuration: restoreDuration, + err: err, + } + }() + r := <-ch + return r.restoreDuration, r.err + } + } + return r.createOCIContainer(ctr, restoreOptions) +} + +// UpdateContainerStatus retrieves the current status of the container from the +// runtime. It updates the container's state but does not save it. +// If useRuntime is false, we will not directly hit runc to see the container's +// status, but will instead only check for the existence of the conmon exit file +// and update state to stopped if it exists. +func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + + // Store old state so we know if we were already stopped + oldState := ctr.state.State + + state := new(spec.State) + + cmd := exec.Command(r.path, "state", ctr.ID()) + cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) + + outPipe, err := cmd.StdoutPipe() + if err != nil { + return fmt.Errorf("getting stdout pipe: %w", err) + } + errPipe, err := cmd.StderrPipe() + if err != nil { + return fmt.Errorf("getting stderr pipe: %w", err) + } + + if err := cmd.Start(); err != nil { + out, err2 := ioutil.ReadAll(errPipe) + if err2 != nil { + return fmt.Errorf("error getting container %s state: %w", ctr.ID(), err) + } + if strings.Contains(string(out), "does not exist") || strings.Contains(string(out), "No such file") { + if err := ctr.removeConmonFiles(); err != nil { + logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) + } + ctr.state.ExitCode = -1 + ctr.state.FinishedTime = time.Now() + ctr.state.State = define.ContainerStateExited + return ctr.runtime.state.AddContainerExitCode(ctr.ID(), ctr.state.ExitCode) + } + return fmt.Errorf("error getting container %s state. stderr/out: %s: %w", ctr.ID(), out, err) + } + defer func() { + _ = cmd.Wait() + }() + + if err := errPipe.Close(); err != nil { + return err + } + out, err := ioutil.ReadAll(outPipe) + if err != nil { + return fmt.Errorf("error reading stdout: %s: %w", ctr.ID(), err) + } + if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil { + return fmt.Errorf("error decoding container status for container %s: %w", ctr.ID(), err) + } + ctr.state.PID = state.Pid + + switch state.Status { + case "created": + ctr.state.State = define.ContainerStateCreated + case "paused": + ctr.state.State = define.ContainerStatePaused + case "running": + ctr.state.State = define.ContainerStateRunning + case "stopped": + ctr.state.State = define.ContainerStateStopped + default: + return fmt.Errorf("unrecognized status returned by runtime for container %s: %s: %w", + ctr.ID(), state.Status, define.ErrInternal) + } + + // Only grab exit status if we were not already stopped + // If we were, it should already be in the database + if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { + if _, err := ctr.Wait(context.Background()); err != nil { + logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err) + } + return nil + } + + // Handle ContainerStateStopping - keep it unless the container + // transitioned to no longer running. + if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) { + ctr.state.State = define.ContainerStateStopping + } + + return nil +} + +// StartContainer starts the given container. +// Sets time the container was started, but does not save it. +func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { + // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers? + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + if path, ok := os.LookupEnv("PATH"); ok { + env = append(env, fmt.Sprintf("PATH=%s", path)) + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "start", ctr.ID())...); err != nil { + return err + } + + ctr.state.StartedTime = time.Now() + + return nil +} + +// KillContainer sends the given signal to the given container. +// If all is set, send to all PIDs in the container. +// All is only supported if the container created cgroups. +func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error { + logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + var args []string + args = append(args, r.runtimeFlags...) + if all { + args = append(args, "kill", "--all", ctr.ID(), fmt.Sprintf("%d", signal)) + } else { + args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal)) + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { + // Update container state - there's a chance we failed because + // the container exited in the meantime. + if err2 := r.UpdateContainerStatus(ctr); err2 != nil { + logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2) + } + if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { + return define.ErrCtrStateInvalid + } + return fmt.Errorf("error sending signal to container %s: %w", ctr.ID(), err) + } + + return nil +} + +// StopContainer stops a container, first using its given stop signal (or +// SIGTERM if no signal was specified), then using SIGKILL. +// Timeout is given in seconds. If timeout is 0, the container will be +// immediately kill with SIGKILL. +// Does not set finished time for container, assumes you will run updateStatus +// after to pull the exit code. +func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error { + logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) + + // Ping the container to see if it's alive + // If it's not, it's already stopped, return + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + stopSignal := ctr.config.StopSignal + if stopSignal == 0 { + stopSignal = uint(syscall.SIGTERM) + } + + if timeout > 0 { + if err := r.KillContainer(ctr, stopSignal, all); err != nil { + // Is the container gone? + // If so, it probably died between the first check and + // our sending the signal + // The container is stopped, so exit cleanly + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + return err + } + + if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { + logrus.Debugf("Timed out stopping container %s with %s, resorting to SIGKILL: %v", ctr.ID(), unix.SignalName(syscall.Signal(stopSignal)), err) + logrus.Warnf("StopSignal %s failed to stop container %s in %d seconds, resorting to SIGKILL", unix.SignalName(syscall.Signal(stopSignal)), ctr.Name(), timeout) + } else { + // No error, the container is dead + return nil + } + } + + if err := r.KillContainer(ctr, 9, all); err != nil { + // Again, check if the container is gone. If it is, exit cleanly. + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + return fmt.Errorf("error sending SIGKILL to container %s: %w", ctr.ID(), err) + } + + // Give runtime a few seconds to make it happen + if err := waitContainerStop(ctr, killContainerTimeout); err != nil { + return err + } + + return nil +} + +// DeleteContainer deletes a container from the OCI runtime. +func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "delete", "--force", ctr.ID())...) +} + +// PauseContainer pauses the given container. +func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "pause", ctr.ID())...) +} + +// UnpauseContainer unpauses the given container. +func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "resume", ctr.ID())...) +} + +// HTTPAttach performs an attach for the HTTP API. +// The caller must handle closing the HTTP connection after this returns. +// The cancel channel is not closed; it is up to the caller to do so after +// this function returns. +// If this is a container with a terminal, we will stream raw. If it is not, we +// will stream with an 8-byte header to multiplex STDOUT and STDERR. +// Returns any errors that occurred, and whether the connection was successfully +// hijacked before that error occurred. +func (r *ConmonOCIRuntime) HTTPAttach(ctr *Container, req *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, detachKeys *string, cancel <-chan bool, hijackDone chan<- bool, streamAttach, streamLogs bool) (deferredErr error) { + isTerminal := false + if ctr.config.Spec.Process != nil { + isTerminal = ctr.config.Spec.Process.Terminal + } + + if streams != nil { + if !streams.Stdin && !streams.Stdout && !streams.Stderr { + return fmt.Errorf("must specify at least one stream to attach to: %w", define.ErrInvalidArg) + } + } + + attachSock, err := r.AttachSocketPath(ctr) + if err != nil { + return err + } + + var conn *net.UnixConn + if streamAttach { + newConn, err := openUnixSocket(attachSock) + if err != nil { + return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err) + } + conn = newConn + defer func() { + if err := conn.Close(); err != nil { + logrus.Errorf("Unable to close container %s attach socket: %q", ctr.ID(), err) + } + }() + + logrus.Debugf("Successfully connected to container %s attach socket %s", ctr.ID(), attachSock) + } + + detachString := ctr.runtime.config.Engine.DetachKeys + if detachKeys != nil { + detachString = *detachKeys + } + detach, err := processDetachKeys(detachString) + if err != nil { + return err + } + + attachStdout := true + attachStderr := true + attachStdin := true + if streams != nil { + attachStdout = streams.Stdout + attachStderr = streams.Stderr + attachStdin = streams.Stdin + } + + logrus.Debugf("Going to hijack container %s attach connection", ctr.ID()) + + // Alright, let's hijack. + hijacker, ok := w.(http.Hijacker) + if !ok { + return fmt.Errorf("unable to hijack connection") + } + + httpCon, httpBuf, err := hijacker.Hijack() + if err != nil { + return fmt.Errorf("error hijacking connection: %w", err) + } + + hijackDone <- true + + writeHijackHeader(req, httpBuf) + + // Force a flush after the header is written. + if err := httpBuf.Flush(); err != nil { + return fmt.Errorf("error flushing HTTP hijack header: %w", err) + } + + defer func() { + hijackWriteErrorAndClose(deferredErr, ctr.ID(), isTerminal, httpCon, httpBuf) + }() + + logrus.Debugf("Hijack for container %s attach session done, ready to stream", ctr.ID()) + + // TODO: This is gross. Really, really gross. + // I want to say we should read all the logs into an array before + // calling this, in container_api.go, but that could take a lot of + // memory... + // On the whole, we need to figure out a better way of doing this, + // though. + logSize := 0 + if streamLogs { + logrus.Debugf("Will stream logs for container %s attach session", ctr.ID()) + + // Get all logs for the container + logChan := make(chan *logs.LogLine) + logOpts := new(logs.LogOptions) + logOpts.Tail = -1 + logOpts.WaitGroup = new(sync.WaitGroup) + errChan := make(chan error) + go func() { + var err error + // In non-terminal mode we need to prepend with the + // stream header. + logrus.Debugf("Writing logs for container %s to HTTP attach", ctr.ID()) + for logLine := range logChan { + if !isTerminal { + device := logLine.Device + var header []byte + headerLen := uint32(len(logLine.Msg)) + logSize += len(logLine.Msg) + switch strings.ToLower(device) { + case "stdin": + header = makeHTTPAttachHeader(0, headerLen) + case "stdout": + header = makeHTTPAttachHeader(1, headerLen) + case "stderr": + header = makeHTTPAttachHeader(2, headerLen) + default: + logrus.Errorf("Unknown device for log line: %s", device) + header = makeHTTPAttachHeader(1, headerLen) + } + _, err = httpBuf.Write(header) + if err != nil { + break + } + } + _, err = httpBuf.Write([]byte(logLine.Msg)) + if err != nil { + break + } + if !logLine.Partial() { + _, err = httpBuf.Write([]byte("\n")) + if err != nil { + break + } + } + err = httpBuf.Flush() + if err != nil { + break + } + } + errChan <- err + }() + if err := ctr.ReadLog(context.Background(), logOpts, logChan, 0); err != nil { + return err + } + go func() { + logOpts.WaitGroup.Wait() + close(logChan) + }() + logrus.Debugf("Done reading logs for container %s, %d bytes", ctr.ID(), logSize) + if err := <-errChan; err != nil { + return err + } + } + if !streamAttach { + logrus.Debugf("Done streaming logs for container %s attach, exiting as attach streaming not requested", ctr.ID()) + return nil + } + + logrus.Debugf("Forwarding attach output for container %s", ctr.ID()) + + stdoutChan := make(chan error) + stdinChan := make(chan error) + + // Handle STDOUT/STDERR + go func() { + var err error + if isTerminal { + // Hack: return immediately if attachStdout not set to + // emulate Docker. + // Basically, when terminal is set, STDERR goes nowhere. + // Everything does over STDOUT. + // Therefore, if not attaching STDOUT - we'll never copy + // anything from here. + logrus.Debugf("Performing terminal HTTP attach for container %s", ctr.ID()) + if attachStdout { + err = httpAttachTerminalCopy(conn, httpBuf, ctr.ID()) + } + } else { + logrus.Debugf("Performing non-terminal HTTP attach for container %s", ctr.ID()) + err = httpAttachNonTerminalCopy(conn, httpBuf, ctr.ID(), attachStdin, attachStdout, attachStderr) + } + stdoutChan <- err + logrus.Debugf("STDOUT/ERR copy completed") + }() + // Next, STDIN. Avoid entirely if attachStdin unset. + if attachStdin { + go func() { + _, err := cutil.CopyDetachable(conn, httpBuf, detach) + logrus.Debugf("STDIN copy completed") + stdinChan <- err + }() + } + + for { + select { + case err := <-stdoutChan: + if err != nil { + return err + } + + return nil + case err := <-stdinChan: + if err != nil { + return err + } + // copy stdin is done, close it + if connErr := conn.CloseWrite(); connErr != nil { + logrus.Errorf("Unable to close conn: %v", connErr) + } + case <-cancel: + return nil + } + } +} + +// isRetryable returns whether the error was caused by a blocked syscall or the +// specified operation on a non blocking file descriptor wasn't ready for completion. +func isRetryable(err error) bool { + var errno syscall.Errno + if errors.As(err, &errno) { + return errno == syscall.EINTR || errno == syscall.EAGAIN + } + return false +} + +// openControlFile opens the terminal control file. +func openControlFile(ctr *Container, parentDir string) (*os.File, error) { + controlPath := filepath.Join(parentDir, "ctl") + for i := 0; i < 600; i++ { + controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY|unix.O_NONBLOCK, 0) + if err == nil { + return controlFile, nil + } + if !isRetryable(err) { + return nil, fmt.Errorf("could not open ctl file for terminal resize for container %s: %w", ctr.ID(), err) + } + time.Sleep(time.Second / 10) + } + return nil, fmt.Errorf("timeout waiting for %q", controlPath) +} + +// AttachResize resizes the terminal used by the given container. +func (r *ConmonOCIRuntime) AttachResize(ctr *Container, newSize resize.TerminalSize) error { + controlFile, err := openControlFile(ctr, ctr.bundlePath()) + if err != nil { + return err + } + defer controlFile.Close() + + logrus.Debugf("Received a resize event for container %s: %+v", ctr.ID(), newSize) + if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil { + return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err) + } + + return nil +} + +// CheckpointContainer checkpoints the given container. +func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error) { + // imagePath is used by CRIU to store the actual checkpoint files + imagePath := ctr.CheckpointPath() + if options.PreCheckPoint { + imagePath = ctr.PreCheckPointPath() + } + // workPath will be used to store dump.log and stats-dump + workPath := ctr.bundlePath() + logrus.Debugf("Writing checkpoint to %s", imagePath) + logrus.Debugf("Writing checkpoint logs to %s", workPath) + logrus.Debugf("Pre-dump the container %t", options.PreCheckPoint) + args := []string{} + args = append(args, r.runtimeFlags...) + args = append(args, "checkpoint") + args = append(args, "--image-path") + args = append(args, imagePath) + args = append(args, "--work-path") + args = append(args, workPath) + if options.KeepRunning { + args = append(args, "--leave-running") + } + if options.TCPEstablished { + args = append(args, "--tcp-established") + } + if options.FileLocks { + args = append(args, "--file-locks") + } + if !options.PreCheckPoint && options.KeepRunning { + args = append(args, "--leave-running") + } + if options.PreCheckPoint { + args = append(args, "--pre-dump") + } + if !options.PreCheckPoint && options.WithPrevious { + args = append( + args, + "--parent-path", + filepath.Join("..", preCheckpointDir), + ) + } + + args = append(args, ctr.ID()) + logrus.Debugf("the args to checkpoint: %s %s", r.path, strings.Join(args, " ")) + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return 0, err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + if path, ok := os.LookupEnv("PATH"); ok { + env = append(env, fmt.Sprintf("PATH=%s", path)) + } + + runtime.LockOSThread() + if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { + return 0, err + } + + runtimeCheckpointStarted := time.Now() + err = utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...) + // Ignore error returned from SetSocketLabel("") call, + // can't recover. + if labelErr := label.SetSocketLabel(""); labelErr == nil { + // Unlock the thread only if the process label could be restored + // successfully. Otherwise leave the thread locked and the Go runtime + // will terminate it once it returns to the threads pool. + runtime.UnlockOSThread() + } else { + logrus.Errorf("Unable to reset socket label: %q", labelErr) + } + + runtimeCheckpointDuration := func() int64 { + if options.PrintStats { + return time.Since(runtimeCheckpointStarted).Microseconds() + } + return 0 + }() + + return runtimeCheckpointDuration, err +} + +func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) { + if ctr.state.ConmonPID == 0 { + // If the container is running or paused, assume Conmon is + // running. We didn't record Conmon PID on some old versions, so + // that is likely what's going on... + // Unusual enough that we should print a warning message though. + if ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) { + logrus.Warnf("Conmon PID is not set, but container is running!") + return true, nil + } + // Container's not running, so conmon PID being unset is + // expected. Conmon is not running. + return false, nil + } + + // We have a conmon PID. Ping it with signal 0. + if err := unix.Kill(ctr.state.ConmonPID, 0); err != nil { + if err == unix.ESRCH { + return false, nil + } + return false, fmt.Errorf("error pinging container %s conmon with signal 0: %w", ctr.ID(), err) + } + return true, nil +} + +// SupportsCheckpoint checks if the OCI runtime supports checkpointing +// containers. +func (r *ConmonOCIRuntime) SupportsCheckpoint() bool { + return crutils.CRRuntimeSupportsCheckpointRestore(r.path) +} + +// SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error +// messages. +func (r *ConmonOCIRuntime) SupportsJSONErrors() bool { + return r.supportsJSON +} + +// SupportsNoCgroups checks if the OCI runtime supports running containers +// without cgroups (the --cgroup-manager=disabled flag). +func (r *ConmonOCIRuntime) SupportsNoCgroups() bool { + return r.supportsNoCgroups +} + +// SupportsKVM checks if the OCI runtime supports running containers +// without KVM separation +func (r *ConmonOCIRuntime) SupportsKVM() bool { + return r.supportsKVM +} + +// AttachSocketPath is the path to a single container's attach socket. +func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) { + if ctr == nil { + return "", fmt.Errorf("must provide a valid container to get attach socket path: %w", define.ErrInvalidArg) + } + + return filepath.Join(ctr.bundlePath(), "attach"), nil +} + +// ExitFilePath is the path to a container's exit file. +func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) { + if ctr == nil { + return "", fmt.Errorf("must provide a valid container to get exit file path: %w", define.ErrInvalidArg) + } + return filepath.Join(r.exitsDir, ctr.ID()), nil +} + +// RuntimeInfo provides information on the runtime. +func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) { + runtimePackage := packageVersion(r.path) + conmonPackage := packageVersion(r.conmonPath) + runtimeVersion, err := r.getOCIRuntimeVersion() + if err != nil { + return nil, nil, fmt.Errorf("error getting version of OCI runtime %s: %w", r.name, err) + } + conmonVersion, err := r.getConmonVersion() + if err != nil { + return nil, nil, fmt.Errorf("error getting conmon version: %w", err) + } + + conmon := define.ConmonInfo{ + Package: conmonPackage, + Path: r.conmonPath, + Version: conmonVersion, + } + ocirt := define.OCIRuntimeInfo{ + Name: r.name, + Path: r.path, + Package: runtimePackage, + Version: runtimeVersion, + } + return &conmon, &ocirt, nil +} + +// makeAccessible changes the path permission and each parent directory to have --x--x--x +func makeAccessible(path string, uid, gid int) error { + for ; path != "/"; path = filepath.Dir(path) { + st, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { + continue + } + if st.Mode()&0111 != 0111 { + if err := os.Chmod(path, st.Mode()|0111); err != nil { + return err + } + } + } + return nil +} + +// Wait for a container which has been sent a signal to stop +func waitContainerStop(ctr *Container, timeout time.Duration) error { + return waitPidStop(ctr.state.PID, timeout) +} + +// Wait for a given PID to stop +func waitPidStop(pid int, timeout time.Duration) error { + done := make(chan struct{}) + chControl := make(chan struct{}) + go func() { + for { + select { + case <-chControl: + return + default: + if err := unix.Kill(pid, 0); err != nil { + if err == unix.ESRCH { + close(done) + return + } + logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err) + } + time.Sleep(100 * time.Millisecond) + } + } + }() + select { + case <-done: + return nil + case <-time.After(timeout): + close(chControl) + return fmt.Errorf("given PIDs did not die within timeout") + } +} + +func (r *ConmonOCIRuntime) getLogTag(ctr *Container) (string, error) { + logTag := ctr.LogTag() + if logTag == "" { + return "", nil + } + data, err := ctr.inspectLocked(false) + if err != nil { + // FIXME: this error should probably be returned + return "", nil //nolint: nilerr + } + tmpl, err := template.New("container").Parse(logTag) + if err != nil { + return "", fmt.Errorf("template parsing error %s: %w", logTag, err) + } + var b bytes.Buffer + err = tmpl.Execute(&b, data) + if err != nil { + return "", err + } + return b.String(), nil +} + +// createOCIContainer generates this container's main conmon instance and prepares it for starting +func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + var stderrBuf bytes.Buffer + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return 0, err + } + + parentSyncPipe, childSyncPipe, err := newPipe() + if err != nil { + return 0, fmt.Errorf("error creating socket pair: %w", err) + } + defer errorhandling.CloseQuiet(parentSyncPipe) + + childStartPipe, parentStartPipe, err := newPipe() + if err != nil { + return 0, fmt.Errorf("error creating socket pair for start pipe: %w", err) + } + + defer errorhandling.CloseQuiet(parentStartPipe) + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = filepath.Join(ctr.state.RunDir, "oci-log") + } + + logTag, err := r.getLogTag(ctr) + if err != nil { + return 0, err + } + + if ctr.config.CgroupsMode == cgroupSplit { + if err := utils.MoveUnderCgroupSubtree("runtime"); err != nil { + return 0, err + } + } + + pidfile := ctr.config.PidFile + if pidfile == "" { + pidfile = filepath.Join(ctr.state.RunDir, "pidfile") + } + + args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) + + if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.config.SdNotifySocket != "" { + args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.config.SdNotifySocket)) + } + + if ctr.config.Spec.Process.Terminal { + args = append(args, "-t") + } else if ctr.config.Stdin { + args = append(args, "-i") + } + + if ctr.config.Timeout > 0 { + args = append(args, fmt.Sprintf("--timeout=%d", ctr.config.Timeout)) + } + + if !r.enableKeyring { + args = append(args, "--no-new-keyring") + } + if ctr.config.ConmonPidFile != "" { + args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile) + } + + if r.noPivot { + args = append(args, "--no-pivot") + } + + exitCommand, err := specgenutil.CreateExitCommandArgs(ctr.runtime.storageConfig, ctr.runtime.config, logrus.IsLevelEnabled(logrus.DebugLevel), ctr.AutoRemove(), false) + if err != nil { + return 0, err + } + exitCommand = append(exitCommand, ctr.config.ID) + + args = append(args, "--exit-command", exitCommand[0]) + for _, arg := range exitCommand[1:] { + args = append(args, []string{"--exit-command-arg", arg}...) + } + + // Pass down the LISTEN_* environment (see #10443). + preserveFDs := ctr.config.PreserveFDs + if val := os.Getenv("LISTEN_FDS"); val != "" { + if ctr.config.PreserveFDs > 0 { + logrus.Warnf("Ignoring LISTEN_FDS to preserve custom user-specified FDs") + } else { + fds, err := strconv.Atoi(val) + if err != nil { + return 0, fmt.Errorf("converting LISTEN_FDS=%s: %w", val, err) + } + preserveFDs = uint(fds) + } + } + + if preserveFDs > 0 { + args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", preserveFDs))...) + } + + if restoreOptions != nil { + args = append(args, "--restore", ctr.CheckpointPath()) + if restoreOptions.TCPEstablished { + args = append(args, "--runtime-opt", "--tcp-established") + } + if restoreOptions.FileLocks { + args = append(args, "--runtime-opt", "--file-locks") + } + if restoreOptions.Pod != "" { + mountLabel := ctr.config.MountLabel + processLabel := ctr.config.ProcessLabel + if mountLabel != "" { + args = append( + args, + "--runtime-opt", + fmt.Sprintf( + "--lsm-mount-context=%s", + mountLabel, + ), + ) + } + if processLabel != "" { + args = append( + args, + "--runtime-opt", + fmt.Sprintf( + "--lsm-profile=selinux:%s", + processLabel, + ), + ) + } + } + } + + logrus.WithFields(logrus.Fields{ + "args": args, + }).Debugf("running conmon: %s", r.conmonPath) + + cmd := exec.Command(r.conmonPath, args...) + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + // TODO this is probably a really bad idea for some uses + // Make this configurable + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if ctr.config.Spec.Process.Terminal { + cmd.Stderr = &stderrBuf + } + + // 0, 1 and 2 are stdin, stdout and stderr + conmonEnv := r.configureConmonEnv(runtimeDir) + + var filesToClose []*os.File + if preserveFDs > 0 { + for fd := 3; fd < int(3+preserveFDs); fd++ { + f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)) + filesToClose = append(filesToClose, f) + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + } + } + + cmd.Env = r.conmonEnv + // we don't want to step on users fds they asked to preserve + // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 + cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4)) + cmd.Env = append(cmd.Env, conmonEnv...) + cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe) + + if r.reservePorts && !rootless.IsRootless() && !ctr.config.NetMode.IsSlirp4netns() { + ports, err := bindPorts(ctr.convertPortMappings()) + if err != nil { + return 0, err + } + filesToClose = append(filesToClose, ports...) + + // Leak the port we bound in the conmon process. These fd's won't be used + // by the container and conmon will keep the ports busy so that another + // process cannot use them. + cmd.ExtraFiles = append(cmd.ExtraFiles, ports...) + } + + if ctr.config.NetMode.IsSlirp4netns() || rootless.IsRootless() { + if ctr.config.PostConfigureNetNS { + havePortMapping := len(ctr.config.PortMappings) > 0 + if havePortMapping { + ctr.rootlessPortSyncR, ctr.rootlessPortSyncW, err = os.Pipe() + if err != nil { + return 0, fmt.Errorf("failed to create rootless port sync pipe: %w", err) + } + } + ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe() + if err != nil { + return 0, fmt.Errorf("failed to create rootless network sync pipe: %w", err) + } + } else { + if ctr.rootlessSlirpSyncR != nil { + defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR) + } + if ctr.rootlessSlirpSyncW != nil { + defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW) + } + } + // Leak one end in conmon, the other one will be leaked into slirp4netns + cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW) + + if ctr.rootlessPortSyncW != nil { + defer errorhandling.CloseQuiet(ctr.rootlessPortSyncW) + // Leak one end in conmon, the other one will be leaked into rootlessport + cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessPortSyncW) + } + } + var runtimeRestoreStarted time.Time + if restoreOptions != nil { + runtimeRestoreStarted = time.Now() + } + err = startCommand(cmd, ctr) + + // regardless of whether we errored or not, we no longer need the children pipes + childSyncPipe.Close() + childStartPipe.Close() + if err != nil { + return 0, err + } + if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe); err != nil { + return 0, err + } + /* Wait for initial setup and fork, and reap child */ + err = cmd.Wait() + if err != nil { + return 0, err + } + + pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog) + if err != nil { + if err2 := r.DeleteContainer(ctr); err2 != nil { + logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID()) + } + return 0, err + } + ctr.state.PID = pid + + conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile) + if err != nil { + logrus.Warnf("Error reading conmon pid file for container %s: %v", ctr.ID(), err) + } else if conmonPID > 0 { + // conmon not having a pid file is a valid state, so don't set it if we don't have it + logrus.Infof("Got Conmon PID as %d", conmonPID) + ctr.state.ConmonPID = conmonPID + } + + runtimeRestoreDuration := func() int64 { + if restoreOptions != nil && restoreOptions.PrintStats { + return time.Since(runtimeRestoreStarted).Microseconds() + } + return 0 + }() + + // These fds were passed down to the runtime. Close them + // and not interfere + for _, f := range filesToClose { + errorhandling.CloseQuiet(f) + } + + return runtimeRestoreDuration, nil +} + +// configureConmonEnv gets the environment values to add to conmon's exec struct +// TODO this may want to be less hardcoded/more configurable in the future +func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) []string { + var env []string + for _, e := range os.Environ() { + if strings.HasPrefix(e, "LC_") { + env = append(env, e) + } + } + conf, ok := os.LookupEnv("CONTAINERS_CONF") + if ok { + env = append(env, fmt.Sprintf("CONTAINERS_CONF=%s", conf)) + } + env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) + env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED"))) + env = append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID"))) + home := homedir.Get() + if home != "" { + env = append(env, fmt.Sprintf("HOME=%s", home)) + } + + return env +} + +// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI +func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath, logDriver, logTag string) []string { + // set the conmon API version to be able to use the correct sync struct keys + args := []string{ + "--api-version", "1", + "-c", ctr.ID(), + "-u", cuuid, + "-r", r.path, + "-b", bundlePath, + "-p", pidPath, + "-n", ctr.Name(), + "--exit-dir", exitDir, + "--full-attach", + } + if len(r.runtimeFlags) > 0 { + rFlags := []string{} + for _, arg := range r.runtimeFlags { + rFlags = append(rFlags, "--runtime-arg", arg) + } + args = append(args, rFlags...) + } + + if ctr.CgroupManager() == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit { + args = append(args, "-s") + } + + var logDriverArg string + switch logDriver { + case define.JournaldLogging: + logDriverArg = define.JournaldLogging + case define.NoLogging: + logDriverArg = define.NoLogging + case define.PassthroughLogging: + logDriverArg = define.PassthroughLogging + //lint:ignore ST1015 the default case has to be here + default: //nolint:stylecheck,gocritic + // No case here should happen except JSONLogging, but keep this here in case the options are extended + logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver()) + fallthrough + case "": + // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod + // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough + fallthrough + case define.JSONLogging: + fallthrough + case define.KubernetesLogging: + logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath) + } + + args = append(args, "-l", logDriverArg) + logLevel := logrus.GetLevel() + args = append(args, "--log-level", logLevel.String()) + + if logLevel == logrus.DebugLevel { + logrus.Debugf("%s messages will be logged to syslog", r.conmonPath) + args = append(args, "--syslog") + } + + size := r.logSizeMax + if ctr.config.LogSize > 0 { + size = ctr.config.LogSize + } + if size > 0 { + args = append(args, "--log-size-max", fmt.Sprintf("%v", size)) + } + + if ociLogPath != "" { + args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath)) + } + if logTag != "" { + args = append(args, "--log-tag", logTag) + } + if ctr.config.NoCgroups { + logrus.Debugf("Running with no Cgroups") + args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled") + } + return args +} + +func startCommand(cmd *exec.Cmd, ctr *Container) error { + // Make sure to unset the NOTIFY_SOCKET and reset it afterwards if needed. + switch ctr.config.SdNotifyMode { + case define.SdNotifyModeContainer, define.SdNotifyModeIgnore: + if prev := os.Getenv("NOTIFY_SOCKET"); prev != "" { + if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil { + logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err) + } + defer func() { + if err := os.Setenv("NOTIFY_SOCKET", prev); err != nil { + logrus.Errorf("Resetting NOTIFY_SOCKET=%s", prev) + } + }() + } + } + + return cmd.Start() +} + +// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup +// it then signals for conmon to start by sending nonce data down the start fd +func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { + mustCreateCgroup := true + + if ctr.config.NoCgroups { + mustCreateCgroup = false + } + + // If cgroup creation is disabled - just signal. + switch ctr.config.CgroupsMode { + case "disabled", "no-conmon", cgroupSplit: + mustCreateCgroup = false + } + + // $INVOCATION_ID is set by systemd when running as a service. + if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" { + mustCreateCgroup = false + } + + if mustCreateCgroup { + // Usually rootless users are not allowed to configure cgroupfs. + // There are cases though, where it is allowed, e.g. if the cgroup + // is manually configured and chowned). Avoid detecting all + // such cases and simply use a lower log level. + logLevel := logrus.WarnLevel + if rootless.IsRootless() { + logLevel = logrus.InfoLevel + } + // TODO: This should be a switch - we are not guaranteed that + // there are only 2 valid cgroup managers + cgroupParent := ctr.CgroupParent() + cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") + Resource := ctr.Spec().Linux.Resources + cgroupResources, err := GetLimits(Resource) + if err != nil { + logrus.StandardLogger().Log(logLevel, "Could not get ctr resources") + } + if ctr.CgroupManager() == config.SystemdCgroupsManager { + unitName := createUnitName("libpod-conmon", ctr.ID()) + realCgroupParent := cgroupParent + splitParent := strings.Split(cgroupParent, "/") + if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { + realCgroupParent = splitParent[len(splitParent)-1] + } + + logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) + if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err) + } + } else { + control, err := cgroups.New(cgroupPath, &cgroupResources) + if err != nil { + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } else if err := control.AddPid(cmd.Process.Pid); err != nil { + // we need to remove this defer and delete the cgroup once conmon exits + // maybe need a conmon monitor? + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } + } + } + + /* We set the cgroup, now the child can start creating children */ + if err := writeConmonPipeData(startFd); err != nil { + return err + } + return nil +} + +// newPipe creates a unix socket pair for communication. +// Returns two files - first is parent, second is child. +func newPipe() (*os.File, *os.File, error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} + +// readConmonPidFile attempts to read conmon's pid from its pid file +func readConmonPidFile(pidFile string) (int, error) { + // Let's try reading the Conmon pid at the same time. + if pidFile != "" { + contents, err := ioutil.ReadFile(pidFile) + if err != nil { + return -1, err + } + // Convert it to an int + conmonPID, err := strconv.Atoi(string(contents)) + if err != nil { + return -1, err + } + return conmonPID, nil + } + return 0, nil +} + +// readConmonPipeData attempts to read a syncInfo struct from the pipe +func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) { + // syncInfo is used to return data from monitor process to daemon + type syncInfo struct { + Data int `json:"data"` + Message string `json:"message,omitempty"` + } + + // Wait to get container pid from conmon + type syncStruct struct { + si *syncInfo + err error + } + ch := make(chan syncStruct) + go func() { + var si *syncInfo + rdr := bufio.NewReader(pipe) + b, err := rdr.ReadBytes('\n') + // ignore EOF here, error is returned even when data was read + // if it is no valid json unmarshal will fail below + if err != nil && !errors.Is(err, io.EOF) { + ch <- syncStruct{err: err} + } + if err := json.Unmarshal(b, &si); err != nil { + ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)} + return + } + ch <- syncStruct{si: si} + }() + + data := -1 //nolint: wastedassign + select { + case ss := <-ch: + if ss.err != nil { + if ociLog != "" { + ociLogData, err := ioutil.ReadFile(ociLog) + if err == nil { + var ociErr ociError + if err := json.Unmarshal(ociLogData, &ociErr); err == nil { + return -1, getOCIRuntimeError(runtimeName, ociErr.Msg) + } + } + } + return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err) + } + logrus.Debugf("Received: %d", ss.si.Data) + if ss.si.Data < 0 { + if ociLog != "" { + ociLogData, err := ioutil.ReadFile(ociLog) + if err == nil { + var ociErr ociError + if err := json.Unmarshal(ociLogData, &ociErr); err == nil { + return ss.si.Data, getOCIRuntimeError(runtimeName, ociErr.Msg) + } + } + } + // If we failed to parse the JSON errors, then print the output as it is + if ss.si.Message != "" { + return ss.si.Data, getOCIRuntimeError(runtimeName, ss.si.Message) + } + return ss.si.Data, fmt.Errorf("container create failed: %w", define.ErrInternal) + } + data = ss.si.Data + case <-time.After(define.ContainerCreateTimeout): + return -1, fmt.Errorf("container creation timeout: %w", define.ErrInternal) + } + return data, nil +} + +// writeConmonPipeData writes nonce data to a pipe +func writeConmonPipeData(pipe *os.File) error { + someData := []byte{0} + _, err := pipe.Write(someData) + return err +} + +// formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon +func formatRuntimeOpts(opts ...string) []string { + args := make([]string, 0, len(opts)*2) + for _, o := range opts { + args = append(args, "--runtime-opt", o) + } + return args +} + +// getConmonVersion returns a string representation of the conmon version. +func (r *ConmonOCIRuntime) getConmonVersion() (string, error) { + output, err := utils.ExecCmd(r.conmonPath, "--version") + if err != nil { + return "", err + } + return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil +} + +// getOCIRuntimeVersion returns a string representation of the OCI runtime's +// version. +func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) { + output, err := utils.ExecCmd(r.path, "--version") + if err != nil { + return "", err + } + return strings.TrimSuffix(output, "\n"), nil +} + +// Copy data from container to HTTP connection, for terminal attach. +// Container is the container's attach socket connection, http is a buffer for +// the HTTP connection. cid is the ID of the container the attach session is +// running for (used solely for error messages). +func httpAttachTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string) error { + buf := make([]byte, bufferSize) + for { + numR, err := container.Read(buf) + logrus.Debugf("Read fd(%d) %d/%d bytes for container %s", int(buf[0]), numR, len(buf), cid) + + if numR > 0 { + switch buf[0] { + case AttachPipeStdout: + // Do nothing + default: + logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) + continue + } + + numW, err2 := http.Write(buf[1:numR]) + if err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s STDOUT: %v", cid, err) + } + return err2 + } else if numW+1 != numR { + return io.ErrShortWrite + } + // We need to force the buffer to write immediately, so + // there isn't a delay on the terminal side. + if err2 := http.Flush(); err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s STDOUT: %v", cid, err) + } + return err2 + } + } + if err != nil { + if err == io.EOF { + return nil + } + return err + } + } +} + +// Copy data from a container to an HTTP connection, for non-terminal attach. +// Appends a header to multiplex input. +func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string, stdin, stdout, stderr bool) error { + buf := make([]byte, bufferSize) + for { + numR, err := container.Read(buf) + if numR > 0 { + var headerBuf []byte + + // Subtract 1 because we strip the first byte (used for + // multiplexing by Conmon). + headerLen := uint32(numR - 1) + // Practically speaking, we could make this buf[0] - 1, + // but we need to validate it anyway. + switch buf[0] { + case AttachPipeStdin: + headerBuf = makeHTTPAttachHeader(0, headerLen) + if !stdin { + continue + } + case AttachPipeStdout: + if !stdout { + continue + } + headerBuf = makeHTTPAttachHeader(1, headerLen) + case AttachPipeStderr: + if !stderr { + continue + } + headerBuf = makeHTTPAttachHeader(2, headerLen) + default: + logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) + continue + } + + numH, err2 := http.Write(headerBuf) + if err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return err2 + } + // Hardcoding header length is pretty gross, but + // fast. Should be safe, as this is a fixed part + // of the protocol. + if numH != 8 { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return io.ErrShortWrite + } + + numW, err2 := http.Write(buf[1:numR]) + if err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return err2 + } else if numW+1 != numR { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return io.ErrShortWrite + } + // We need to force the buffer to write immediately, so + // there isn't a delay on the terminal side. + if err2 := http.Flush(); err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s STDOUT: %v", cid, err) + } + return err2 + } + } + if err != nil { + if err == io.EOF { + return nil + } + + return err + } + } +} + +// GetLimits converts spec resource limits to cgroup consumable limits +func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) { + if resource == nil { + resource = &spec.LinuxResources{} + } + final := &runcconfig.Resources{} + devs := []*devices.Rule{} + + // Devices + for _, entry := range resource.Devices { + if entry.Major == nil || entry.Minor == nil { + continue + } + runeType := 'a' + switch entry.Type { + case "b": + runeType = 'b' + case "c": + runeType = 'c' + } + + devs = append(devs, &devices.Rule{ + Type: devices.Type(runeType), + Major: *entry.Major, + Minor: *entry.Minor, + Permissions: devices.Permissions(entry.Access), + Allow: entry.Allow, + }) + } + final.Devices = devs + + // HugepageLimits + pageLimits := []*runcconfig.HugepageLimit{} + for _, entry := range resource.HugepageLimits { + pageLimits = append(pageLimits, &runcconfig.HugepageLimit{ + Pagesize: entry.Pagesize, + Limit: entry.Limit, + }) + } + final.HugetlbLimit = pageLimits + + // Networking + netPriorities := []*runcconfig.IfPrioMap{} + if resource.Network != nil { + for _, entry := range resource.Network.Priorities { + netPriorities = append(netPriorities, &runcconfig.IfPrioMap{ + Interface: entry.Name, + Priority: int64(entry.Priority), + }) + } + } + final.NetPrioIfpriomap = netPriorities + rdma := make(map[string]runcconfig.LinuxRdma) + for name, entry := range resource.Rdma { + rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects} + } + final.Rdma = rdma + + // Memory + if resource.Memory != nil { + if resource.Memory.Limit != nil { + final.Memory = *resource.Memory.Limit + } + if resource.Memory.Reservation != nil { + final.MemoryReservation = *resource.Memory.Reservation + } + if resource.Memory.Swap != nil { + final.MemorySwap = *resource.Memory.Swap + } + if resource.Memory.Swappiness != nil { + final.MemorySwappiness = resource.Memory.Swappiness + } + } + + // CPU + if resource.CPU != nil { + if resource.CPU.Period != nil { + final.CpuPeriod = *resource.CPU.Period + } + if resource.CPU.Quota != nil { + final.CpuQuota = *resource.CPU.Quota + } + if resource.CPU.RealtimePeriod != nil { + final.CpuRtPeriod = *resource.CPU.RealtimePeriod + } + if resource.CPU.RealtimeRuntime != nil { + final.CpuRtRuntime = *resource.CPU.RealtimeRuntime + } + if resource.CPU.Shares != nil { + final.CpuShares = *resource.CPU.Shares + } + final.CpusetCpus = resource.CPU.Cpus + final.CpusetMems = resource.CPU.Mems + } + + // BlkIO + if resource.BlockIO != nil { + if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle) + } + } + if resource.BlockIO.LeafWeight != nil { + final.BlkioLeafWeight = *resource.BlockIO.LeafWeight + } + if resource.BlockIO.Weight != nil { + final.BlkioWeight = *resource.BlockIO.Weight + } + if len(resource.BlockIO.WeightDevice) > 0 { + for _, entry := range resource.BlockIO.WeightDevice { + weight := &runcconfig.WeightDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + if entry.Weight != nil { + weight.Weight = *entry.Weight + } + if entry.LeafWeight != nil { + weight.LeafWeight = *entry.LeafWeight + } + weight.BlockIODevice = *dev + final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight) + } + } + } + + // Pids + if resource.Pids != nil { + final.PidsLimit = resource.Pids.Limit + } + + // Networking + if resource.Network != nil { + if resource.Network.ClassID != nil { + final.NetClsClassid = *resource.Network.ClassID + } + } + + // Unified state + final.Unified = resource.Unified + + return *final, nil +} |