From bebf55c0f22d6723a27cd39561c0577aa557c5e1 Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 09:35:19 +0100 Subject: libpod: Move oci_conmon_linux.go to oci_conmon_common.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_common.go | 1929 +++++++++++++++++++++++++++++++++++++++++++ libpod/oci_conmon_linux.go | 1929 ------------------------------------------- 2 files changed, 1929 insertions(+), 1929 deletions(-) create mode 100644 libpod/oci_conmon_common.go delete mode 100644 libpod/oci_conmon_linux.go (limited to 'libpod') diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go new file mode 100644 index 000000000..1b654ed33 --- /dev/null +++ b/libpod/oci_conmon_common.go @@ -0,0 +1,1929 @@ +//go:build linux +// +build linux + +package libpod + +import ( + "bufio" + "bytes" + "context" + "errors" + "fmt" + "io" + "io/ioutil" + "net" + "net/http" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + "sync" + "syscall" + "text/template" + "time" + + runcconfig "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + + "github.com/containers/common/pkg/cgroups" + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/resize" + cutil "github.com/containers/common/pkg/util" + conmonConfig "github.com/containers/conmon/runner/config" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/libpod/logs" + "github.com/containers/podman/v4/pkg/checkpoint/crutils" + "github.com/containers/podman/v4/pkg/errorhandling" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/podman/v4/pkg/specgenutil" + "github.com/containers/podman/v4/pkg/util" + "github.com/containers/podman/v4/utils" + "github.com/containers/storage/pkg/homedir" + pmount "github.com/containers/storage/pkg/mount" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + // This is Conmon's STDIO_BUF_SIZE. I don't believe we have access to it + // directly from the Go code, so const it here + // Important: The conmon attach socket uses an extra byte at the beginning of each + // message to specify the STREAM so we have to increase the buffer size by one + bufferSize = conmonConfig.BufSize + 1 +) + +// ConmonOCIRuntime is an OCI runtime managed by Conmon. +// TODO: Make all calls to OCI runtime have a timeout. +type ConmonOCIRuntime struct { + name string + path string + conmonPath string + conmonEnv []string + tmpDir string + exitsDir string + logSizeMax int64 + noPivot bool + reservePorts bool + runtimeFlags []string + supportsJSON bool + supportsKVM bool + supportsNoCgroups bool + enableKeyring bool +} + +// Make a new Conmon-based OCI runtime with the given options. +// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or +// any runtime with a runc-compatible CLI. +// The first path that points to a valid executable will be used. +// Deliberately private. Someone should not be able to construct this outside of +// libpod. +func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeFlags []string, runtimeCfg *config.Config) (OCIRuntime, error) { + if name == "" { + return nil, fmt.Errorf("the OCI runtime must be provided a non-empty name: %w", define.ErrInvalidArg) + } + + // Make lookup tables for runtime support + supportsJSON := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsJSON)) + supportsNoCgroups := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsNoCgroups)) + supportsKVM := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsKVM)) + for _, r := range runtimeCfg.Engine.RuntimeSupportsJSON { + supportsJSON[r] = true + } + for _, r := range runtimeCfg.Engine.RuntimeSupportsNoCgroups { + supportsNoCgroups[r] = true + } + for _, r := range runtimeCfg.Engine.RuntimeSupportsKVM { + supportsKVM[r] = true + } + + runtime := new(ConmonOCIRuntime) + runtime.name = name + runtime.conmonPath = conmonPath + runtime.runtimeFlags = runtimeFlags + + runtime.conmonEnv = runtimeCfg.Engine.ConmonEnvVars + runtime.tmpDir = runtimeCfg.Engine.TmpDir + runtime.logSizeMax = runtimeCfg.Containers.LogSizeMax + runtime.noPivot = runtimeCfg.Engine.NoPivotRoot + runtime.reservePorts = runtimeCfg.Engine.EnablePortReservation + runtime.enableKeyring = runtimeCfg.Containers.EnableKeyring + + // TODO: probe OCI runtime for feature and enable automatically if + // available. + + base := filepath.Base(name) + runtime.supportsJSON = supportsJSON[base] + runtime.supportsNoCgroups = supportsNoCgroups[base] + runtime.supportsKVM = supportsKVM[base] + + foundPath := false + for _, path := range paths { + stat, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + continue + } + return nil, fmt.Errorf("cannot stat OCI runtime %s path: %w", name, err) + } + if !stat.Mode().IsRegular() { + continue + } + foundPath = true + logrus.Tracef("found runtime %q", path) + runtime.path = path + break + } + + // Search the $PATH as last fallback + if !foundPath { + if foundRuntime, err := exec.LookPath(name); err == nil { + foundPath = true + runtime.path = foundRuntime + logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime) + } + } + + if !foundPath { + return nil, fmt.Errorf("no valid executable found for OCI runtime %s: %w", name, define.ErrInvalidArg) + } + + runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits") + + // Create the exit files and attach sockets directories + if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil { + // The directory is allowed to exist + if !os.IsExist(err) { + return nil, fmt.Errorf("error creating OCI runtime exit files directory: %w", err) + } + } + return runtime, nil +} + +// Name returns the name of the runtime being wrapped by Conmon. +func (r *ConmonOCIRuntime) Name() string { + return r.name +} + +// Path returns the path of the OCI runtime being wrapped by Conmon. +func (r *ConmonOCIRuntime) Path() string { + return r.path +} + +// hasCurrentUserMapped checks whether the current user is mapped inside the container user namespace +func hasCurrentUserMapped(ctr *Container) bool { + if len(ctr.config.IDMappings.UIDMap) == 0 && len(ctr.config.IDMappings.GIDMap) == 0 { + return true + } + uid := os.Geteuid() + for _, m := range ctr.config.IDMappings.UIDMap { + if uid >= m.HostID && uid < m.HostID+m.Size { + return true + } + } + return false +} + +// CreateContainer creates a container. +func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + // always make the run dir accessible to the current user so that the PID files can be read without + // being in the rootless user namespace. + if err := makeAccessible(ctr.state.RunDir, 0, 0); err != nil { + return 0, err + } + if !hasCurrentUserMapped(ctr) { + for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.Engine.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.Engine.VolumePath} { + if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { + return 0, err + } + } + + // if we are running a non privileged container, be sure to umount some kernel paths so they are not + // bind mounted inside the container at all. + if !ctr.config.Privileged && !rootless.IsRootless() { + type result struct { + restoreDuration int64 + err error + } + ch := make(chan result) + go func() { + runtime.LockOSThread() + restoreDuration, err := func() (int64, error) { + fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) + if err != nil { + return 0, err + } + defer errorhandling.CloseQuiet(fd) + + // create a new mountns on the current thread + if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { + return 0, err + } + defer func() { + if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { + logrus.Errorf("Unable to clone new namespace: %q", err) + } + }() + + // don't spread our mounts around. We are setting only /sys to be slave + // so that the cleanup process is still able to umount the storage and the + // changes are propagated to the host. + err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") + if err != nil { + return 0, fmt.Errorf("cannot make /sys slave: %w", err) + } + + mounts, err := pmount.GetMounts() + if err != nil { + return 0, err + } + for _, m := range mounts { + if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { + continue + } + err = unix.Unmount(m.Mountpoint, 0) + if err != nil && !os.IsNotExist(err) { + return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) + } + } + return r.createOCIContainer(ctr, restoreOptions) + }() + ch <- result{ + restoreDuration: restoreDuration, + err: err, + } + }() + r := <-ch + return r.restoreDuration, r.err + } + } + return r.createOCIContainer(ctr, restoreOptions) +} + +// UpdateContainerStatus retrieves the current status of the container from the +// runtime. It updates the container's state but does not save it. +// If useRuntime is false, we will not directly hit runc to see the container's +// status, but will instead only check for the existence of the conmon exit file +// and update state to stopped if it exists. +func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + + // Store old state so we know if we were already stopped + oldState := ctr.state.State + + state := new(spec.State) + + cmd := exec.Command(r.path, "state", ctr.ID()) + cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) + + outPipe, err := cmd.StdoutPipe() + if err != nil { + return fmt.Errorf("getting stdout pipe: %w", err) + } + errPipe, err := cmd.StderrPipe() + if err != nil { + return fmt.Errorf("getting stderr pipe: %w", err) + } + + if err := cmd.Start(); err != nil { + out, err2 := ioutil.ReadAll(errPipe) + if err2 != nil { + return fmt.Errorf("error getting container %s state: %w", ctr.ID(), err) + } + if strings.Contains(string(out), "does not exist") || strings.Contains(string(out), "No such file") { + if err := ctr.removeConmonFiles(); err != nil { + logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) + } + ctr.state.ExitCode = -1 + ctr.state.FinishedTime = time.Now() + ctr.state.State = define.ContainerStateExited + return ctr.runtime.state.AddContainerExitCode(ctr.ID(), ctr.state.ExitCode) + } + return fmt.Errorf("error getting container %s state. stderr/out: %s: %w", ctr.ID(), out, err) + } + defer func() { + _ = cmd.Wait() + }() + + if err := errPipe.Close(); err != nil { + return err + } + out, err := ioutil.ReadAll(outPipe) + if err != nil { + return fmt.Errorf("error reading stdout: %s: %w", ctr.ID(), err) + } + if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil { + return fmt.Errorf("error decoding container status for container %s: %w", ctr.ID(), err) + } + ctr.state.PID = state.Pid + + switch state.Status { + case "created": + ctr.state.State = define.ContainerStateCreated + case "paused": + ctr.state.State = define.ContainerStatePaused + case "running": + ctr.state.State = define.ContainerStateRunning + case "stopped": + ctr.state.State = define.ContainerStateStopped + default: + return fmt.Errorf("unrecognized status returned by runtime for container %s: %s: %w", + ctr.ID(), state.Status, define.ErrInternal) + } + + // Only grab exit status if we were not already stopped + // If we were, it should already be in the database + if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { + if _, err := ctr.Wait(context.Background()); err != nil { + logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err) + } + return nil + } + + // Handle ContainerStateStopping - keep it unless the container + // transitioned to no longer running. + if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) { + ctr.state.State = define.ContainerStateStopping + } + + return nil +} + +// StartContainer starts the given container. +// Sets time the container was started, but does not save it. +func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { + // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers? + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + if path, ok := os.LookupEnv("PATH"); ok { + env = append(env, fmt.Sprintf("PATH=%s", path)) + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "start", ctr.ID())...); err != nil { + return err + } + + ctr.state.StartedTime = time.Now() + + return nil +} + +// KillContainer sends the given signal to the given container. +// If all is set, send to all PIDs in the container. +// All is only supported if the container created cgroups. +func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error { + logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + var args []string + args = append(args, r.runtimeFlags...) + if all { + args = append(args, "kill", "--all", ctr.ID(), fmt.Sprintf("%d", signal)) + } else { + args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal)) + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { + // Update container state - there's a chance we failed because + // the container exited in the meantime. + if err2 := r.UpdateContainerStatus(ctr); err2 != nil { + logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2) + } + if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { + return define.ErrCtrStateInvalid + } + return fmt.Errorf("error sending signal to container %s: %w", ctr.ID(), err) + } + + return nil +} + +// StopContainer stops a container, first using its given stop signal (or +// SIGTERM if no signal was specified), then using SIGKILL. +// Timeout is given in seconds. If timeout is 0, the container will be +// immediately kill with SIGKILL. +// Does not set finished time for container, assumes you will run updateStatus +// after to pull the exit code. +func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error { + logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) + + // Ping the container to see if it's alive + // If it's not, it's already stopped, return + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + stopSignal := ctr.config.StopSignal + if stopSignal == 0 { + stopSignal = uint(syscall.SIGTERM) + } + + if timeout > 0 { + if err := r.KillContainer(ctr, stopSignal, all); err != nil { + // Is the container gone? + // If so, it probably died between the first check and + // our sending the signal + // The container is stopped, so exit cleanly + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + return err + } + + if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { + logrus.Debugf("Timed out stopping container %s with %s, resorting to SIGKILL: %v", ctr.ID(), unix.SignalName(syscall.Signal(stopSignal)), err) + logrus.Warnf("StopSignal %s failed to stop container %s in %d seconds, resorting to SIGKILL", unix.SignalName(syscall.Signal(stopSignal)), ctr.Name(), timeout) + } else { + // No error, the container is dead + return nil + } + } + + if err := r.KillContainer(ctr, 9, all); err != nil { + // Again, check if the container is gone. If it is, exit cleanly. + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + return fmt.Errorf("error sending SIGKILL to container %s: %w", ctr.ID(), err) + } + + // Give runtime a few seconds to make it happen + if err := waitContainerStop(ctr, killContainerTimeout); err != nil { + return err + } + + return nil +} + +// DeleteContainer deletes a container from the OCI runtime. +func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "delete", "--force", ctr.ID())...) +} + +// PauseContainer pauses the given container. +func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "pause", ctr.ID())...) +} + +// UnpauseContainer unpauses the given container. +func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "resume", ctr.ID())...) +} + +// HTTPAttach performs an attach for the HTTP API. +// The caller must handle closing the HTTP connection after this returns. +// The cancel channel is not closed; it is up to the caller to do so after +// this function returns. +// If this is a container with a terminal, we will stream raw. If it is not, we +// will stream with an 8-byte header to multiplex STDOUT and STDERR. +// Returns any errors that occurred, and whether the connection was successfully +// hijacked before that error occurred. +func (r *ConmonOCIRuntime) HTTPAttach(ctr *Container, req *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, detachKeys *string, cancel <-chan bool, hijackDone chan<- bool, streamAttach, streamLogs bool) (deferredErr error) { + isTerminal := false + if ctr.config.Spec.Process != nil { + isTerminal = ctr.config.Spec.Process.Terminal + } + + if streams != nil { + if !streams.Stdin && !streams.Stdout && !streams.Stderr { + return fmt.Errorf("must specify at least one stream to attach to: %w", define.ErrInvalidArg) + } + } + + attachSock, err := r.AttachSocketPath(ctr) + if err != nil { + return err + } + + var conn *net.UnixConn + if streamAttach { + newConn, err := openUnixSocket(attachSock) + if err != nil { + return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err) + } + conn = newConn + defer func() { + if err := conn.Close(); err != nil { + logrus.Errorf("Unable to close container %s attach socket: %q", ctr.ID(), err) + } + }() + + logrus.Debugf("Successfully connected to container %s attach socket %s", ctr.ID(), attachSock) + } + + detachString := ctr.runtime.config.Engine.DetachKeys + if detachKeys != nil { + detachString = *detachKeys + } + detach, err := processDetachKeys(detachString) + if err != nil { + return err + } + + attachStdout := true + attachStderr := true + attachStdin := true + if streams != nil { + attachStdout = streams.Stdout + attachStderr = streams.Stderr + attachStdin = streams.Stdin + } + + logrus.Debugf("Going to hijack container %s attach connection", ctr.ID()) + + // Alright, let's hijack. + hijacker, ok := w.(http.Hijacker) + if !ok { + return fmt.Errorf("unable to hijack connection") + } + + httpCon, httpBuf, err := hijacker.Hijack() + if err != nil { + return fmt.Errorf("error hijacking connection: %w", err) + } + + hijackDone <- true + + writeHijackHeader(req, httpBuf) + + // Force a flush after the header is written. + if err := httpBuf.Flush(); err != nil { + return fmt.Errorf("error flushing HTTP hijack header: %w", err) + } + + defer func() { + hijackWriteErrorAndClose(deferredErr, ctr.ID(), isTerminal, httpCon, httpBuf) + }() + + logrus.Debugf("Hijack for container %s attach session done, ready to stream", ctr.ID()) + + // TODO: This is gross. Really, really gross. + // I want to say we should read all the logs into an array before + // calling this, in container_api.go, but that could take a lot of + // memory... + // On the whole, we need to figure out a better way of doing this, + // though. + logSize := 0 + if streamLogs { + logrus.Debugf("Will stream logs for container %s attach session", ctr.ID()) + + // Get all logs for the container + logChan := make(chan *logs.LogLine) + logOpts := new(logs.LogOptions) + logOpts.Tail = -1 + logOpts.WaitGroup = new(sync.WaitGroup) + errChan := make(chan error) + go func() { + var err error + // In non-terminal mode we need to prepend with the + // stream header. + logrus.Debugf("Writing logs for container %s to HTTP attach", ctr.ID()) + for logLine := range logChan { + if !isTerminal { + device := logLine.Device + var header []byte + headerLen := uint32(len(logLine.Msg)) + logSize += len(logLine.Msg) + switch strings.ToLower(device) { + case "stdin": + header = makeHTTPAttachHeader(0, headerLen) + case "stdout": + header = makeHTTPAttachHeader(1, headerLen) + case "stderr": + header = makeHTTPAttachHeader(2, headerLen) + default: + logrus.Errorf("Unknown device for log line: %s", device) + header = makeHTTPAttachHeader(1, headerLen) + } + _, err = httpBuf.Write(header) + if err != nil { + break + } + } + _, err = httpBuf.Write([]byte(logLine.Msg)) + if err != nil { + break + } + if !logLine.Partial() { + _, err = httpBuf.Write([]byte("\n")) + if err != nil { + break + } + } + err = httpBuf.Flush() + if err != nil { + break + } + } + errChan <- err + }() + if err := ctr.ReadLog(context.Background(), logOpts, logChan, 0); err != nil { + return err + } + go func() { + logOpts.WaitGroup.Wait() + close(logChan) + }() + logrus.Debugf("Done reading logs for container %s, %d bytes", ctr.ID(), logSize) + if err := <-errChan; err != nil { + return err + } + } + if !streamAttach { + logrus.Debugf("Done streaming logs for container %s attach, exiting as attach streaming not requested", ctr.ID()) + return nil + } + + logrus.Debugf("Forwarding attach output for container %s", ctr.ID()) + + stdoutChan := make(chan error) + stdinChan := make(chan error) + + // Handle STDOUT/STDERR + go func() { + var err error + if isTerminal { + // Hack: return immediately if attachStdout not set to + // emulate Docker. + // Basically, when terminal is set, STDERR goes nowhere. + // Everything does over STDOUT. + // Therefore, if not attaching STDOUT - we'll never copy + // anything from here. + logrus.Debugf("Performing terminal HTTP attach for container %s", ctr.ID()) + if attachStdout { + err = httpAttachTerminalCopy(conn, httpBuf, ctr.ID()) + } + } else { + logrus.Debugf("Performing non-terminal HTTP attach for container %s", ctr.ID()) + err = httpAttachNonTerminalCopy(conn, httpBuf, ctr.ID(), attachStdin, attachStdout, attachStderr) + } + stdoutChan <- err + logrus.Debugf("STDOUT/ERR copy completed") + }() + // Next, STDIN. Avoid entirely if attachStdin unset. + if attachStdin { + go func() { + _, err := cutil.CopyDetachable(conn, httpBuf, detach) + logrus.Debugf("STDIN copy completed") + stdinChan <- err + }() + } + + for { + select { + case err := <-stdoutChan: + if err != nil { + return err + } + + return nil + case err := <-stdinChan: + if err != nil { + return err + } + // copy stdin is done, close it + if connErr := conn.CloseWrite(); connErr != nil { + logrus.Errorf("Unable to close conn: %v", connErr) + } + case <-cancel: + return nil + } + } +} + +// isRetryable returns whether the error was caused by a blocked syscall or the +// specified operation on a non blocking file descriptor wasn't ready for completion. +func isRetryable(err error) bool { + var errno syscall.Errno + if errors.As(err, &errno) { + return errno == syscall.EINTR || errno == syscall.EAGAIN + } + return false +} + +// openControlFile opens the terminal control file. +func openControlFile(ctr *Container, parentDir string) (*os.File, error) { + controlPath := filepath.Join(parentDir, "ctl") + for i := 0; i < 600; i++ { + controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY|unix.O_NONBLOCK, 0) + if err == nil { + return controlFile, nil + } + if !isRetryable(err) { + return nil, fmt.Errorf("could not open ctl file for terminal resize for container %s: %w", ctr.ID(), err) + } + time.Sleep(time.Second / 10) + } + return nil, fmt.Errorf("timeout waiting for %q", controlPath) +} + +// AttachResize resizes the terminal used by the given container. +func (r *ConmonOCIRuntime) AttachResize(ctr *Container, newSize resize.TerminalSize) error { + controlFile, err := openControlFile(ctr, ctr.bundlePath()) + if err != nil { + return err + } + defer controlFile.Close() + + logrus.Debugf("Received a resize event for container %s: %+v", ctr.ID(), newSize) + if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil { + return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err) + } + + return nil +} + +// CheckpointContainer checkpoints the given container. +func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error) { + // imagePath is used by CRIU to store the actual checkpoint files + imagePath := ctr.CheckpointPath() + if options.PreCheckPoint { + imagePath = ctr.PreCheckPointPath() + } + // workPath will be used to store dump.log and stats-dump + workPath := ctr.bundlePath() + logrus.Debugf("Writing checkpoint to %s", imagePath) + logrus.Debugf("Writing checkpoint logs to %s", workPath) + logrus.Debugf("Pre-dump the container %t", options.PreCheckPoint) + args := []string{} + args = append(args, r.runtimeFlags...) + args = append(args, "checkpoint") + args = append(args, "--image-path") + args = append(args, imagePath) + args = append(args, "--work-path") + args = append(args, workPath) + if options.KeepRunning { + args = append(args, "--leave-running") + } + if options.TCPEstablished { + args = append(args, "--tcp-established") + } + if options.FileLocks { + args = append(args, "--file-locks") + } + if !options.PreCheckPoint && options.KeepRunning { + args = append(args, "--leave-running") + } + if options.PreCheckPoint { + args = append(args, "--pre-dump") + } + if !options.PreCheckPoint && options.WithPrevious { + args = append( + args, + "--parent-path", + filepath.Join("..", preCheckpointDir), + ) + } + + args = append(args, ctr.ID()) + logrus.Debugf("the args to checkpoint: %s %s", r.path, strings.Join(args, " ")) + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return 0, err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + if path, ok := os.LookupEnv("PATH"); ok { + env = append(env, fmt.Sprintf("PATH=%s", path)) + } + + runtime.LockOSThread() + if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { + return 0, err + } + + runtimeCheckpointStarted := time.Now() + err = utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...) + // Ignore error returned from SetSocketLabel("") call, + // can't recover. + if labelErr := label.SetSocketLabel(""); labelErr == nil { + // Unlock the thread only if the process label could be restored + // successfully. Otherwise leave the thread locked and the Go runtime + // will terminate it once it returns to the threads pool. + runtime.UnlockOSThread() + } else { + logrus.Errorf("Unable to reset socket label: %q", labelErr) + } + + runtimeCheckpointDuration := func() int64 { + if options.PrintStats { + return time.Since(runtimeCheckpointStarted).Microseconds() + } + return 0 + }() + + return runtimeCheckpointDuration, err +} + +func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) { + if ctr.state.ConmonPID == 0 { + // If the container is running or paused, assume Conmon is + // running. We didn't record Conmon PID on some old versions, so + // that is likely what's going on... + // Unusual enough that we should print a warning message though. + if ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) { + logrus.Warnf("Conmon PID is not set, but container is running!") + return true, nil + } + // Container's not running, so conmon PID being unset is + // expected. Conmon is not running. + return false, nil + } + + // We have a conmon PID. Ping it with signal 0. + if err := unix.Kill(ctr.state.ConmonPID, 0); err != nil { + if err == unix.ESRCH { + return false, nil + } + return false, fmt.Errorf("error pinging container %s conmon with signal 0: %w", ctr.ID(), err) + } + return true, nil +} + +// SupportsCheckpoint checks if the OCI runtime supports checkpointing +// containers. +func (r *ConmonOCIRuntime) SupportsCheckpoint() bool { + return crutils.CRRuntimeSupportsCheckpointRestore(r.path) +} + +// SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error +// messages. +func (r *ConmonOCIRuntime) SupportsJSONErrors() bool { + return r.supportsJSON +} + +// SupportsNoCgroups checks if the OCI runtime supports running containers +// without cgroups (the --cgroup-manager=disabled flag). +func (r *ConmonOCIRuntime) SupportsNoCgroups() bool { + return r.supportsNoCgroups +} + +// SupportsKVM checks if the OCI runtime supports running containers +// without KVM separation +func (r *ConmonOCIRuntime) SupportsKVM() bool { + return r.supportsKVM +} + +// AttachSocketPath is the path to a single container's attach socket. +func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) { + if ctr == nil { + return "", fmt.Errorf("must provide a valid container to get attach socket path: %w", define.ErrInvalidArg) + } + + return filepath.Join(ctr.bundlePath(), "attach"), nil +} + +// ExitFilePath is the path to a container's exit file. +func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) { + if ctr == nil { + return "", fmt.Errorf("must provide a valid container to get exit file path: %w", define.ErrInvalidArg) + } + return filepath.Join(r.exitsDir, ctr.ID()), nil +} + +// RuntimeInfo provides information on the runtime. +func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) { + runtimePackage := packageVersion(r.path) + conmonPackage := packageVersion(r.conmonPath) + runtimeVersion, err := r.getOCIRuntimeVersion() + if err != nil { + return nil, nil, fmt.Errorf("error getting version of OCI runtime %s: %w", r.name, err) + } + conmonVersion, err := r.getConmonVersion() + if err != nil { + return nil, nil, fmt.Errorf("error getting conmon version: %w", err) + } + + conmon := define.ConmonInfo{ + Package: conmonPackage, + Path: r.conmonPath, + Version: conmonVersion, + } + ocirt := define.OCIRuntimeInfo{ + Name: r.name, + Path: r.path, + Package: runtimePackage, + Version: runtimeVersion, + } + return &conmon, &ocirt, nil +} + +// makeAccessible changes the path permission and each parent directory to have --x--x--x +func makeAccessible(path string, uid, gid int) error { + for ; path != "/"; path = filepath.Dir(path) { + st, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { + continue + } + if st.Mode()&0111 != 0111 { + if err := os.Chmod(path, st.Mode()|0111); err != nil { + return err + } + } + } + return nil +} + +// Wait for a container which has been sent a signal to stop +func waitContainerStop(ctr *Container, timeout time.Duration) error { + return waitPidStop(ctr.state.PID, timeout) +} + +// Wait for a given PID to stop +func waitPidStop(pid int, timeout time.Duration) error { + done := make(chan struct{}) + chControl := make(chan struct{}) + go func() { + for { + select { + case <-chControl: + return + default: + if err := unix.Kill(pid, 0); err != nil { + if err == unix.ESRCH { + close(done) + return + } + logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err) + } + time.Sleep(100 * time.Millisecond) + } + } + }() + select { + case <-done: + return nil + case <-time.After(timeout): + close(chControl) + return fmt.Errorf("given PIDs did not die within timeout") + } +} + +func (r *ConmonOCIRuntime) getLogTag(ctr *Container) (string, error) { + logTag := ctr.LogTag() + if logTag == "" { + return "", nil + } + data, err := ctr.inspectLocked(false) + if err != nil { + // FIXME: this error should probably be returned + return "", nil //nolint: nilerr + } + tmpl, err := template.New("container").Parse(logTag) + if err != nil { + return "", fmt.Errorf("template parsing error %s: %w", logTag, err) + } + var b bytes.Buffer + err = tmpl.Execute(&b, data) + if err != nil { + return "", err + } + return b.String(), nil +} + +// createOCIContainer generates this container's main conmon instance and prepares it for starting +func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + var stderrBuf bytes.Buffer + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return 0, err + } + + parentSyncPipe, childSyncPipe, err := newPipe() + if err != nil { + return 0, fmt.Errorf("error creating socket pair: %w", err) + } + defer errorhandling.CloseQuiet(parentSyncPipe) + + childStartPipe, parentStartPipe, err := newPipe() + if err != nil { + return 0, fmt.Errorf("error creating socket pair for start pipe: %w", err) + } + + defer errorhandling.CloseQuiet(parentStartPipe) + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = filepath.Join(ctr.state.RunDir, "oci-log") + } + + logTag, err := r.getLogTag(ctr) + if err != nil { + return 0, err + } + + if ctr.config.CgroupsMode == cgroupSplit { + if err := utils.MoveUnderCgroupSubtree("runtime"); err != nil { + return 0, err + } + } + + pidfile := ctr.config.PidFile + if pidfile == "" { + pidfile = filepath.Join(ctr.state.RunDir, "pidfile") + } + + args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) + + if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.config.SdNotifySocket != "" { + args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.config.SdNotifySocket)) + } + + if ctr.config.Spec.Process.Terminal { + args = append(args, "-t") + } else if ctr.config.Stdin { + args = append(args, "-i") + } + + if ctr.config.Timeout > 0 { + args = append(args, fmt.Sprintf("--timeout=%d", ctr.config.Timeout)) + } + + if !r.enableKeyring { + args = append(args, "--no-new-keyring") + } + if ctr.config.ConmonPidFile != "" { + args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile) + } + + if r.noPivot { + args = append(args, "--no-pivot") + } + + exitCommand, err := specgenutil.CreateExitCommandArgs(ctr.runtime.storageConfig, ctr.runtime.config, logrus.IsLevelEnabled(logrus.DebugLevel), ctr.AutoRemove(), false) + if err != nil { + return 0, err + } + exitCommand = append(exitCommand, ctr.config.ID) + + args = append(args, "--exit-command", exitCommand[0]) + for _, arg := range exitCommand[1:] { + args = append(args, []string{"--exit-command-arg", arg}...) + } + + // Pass down the LISTEN_* environment (see #10443). + preserveFDs := ctr.config.PreserveFDs + if val := os.Getenv("LISTEN_FDS"); val != "" { + if ctr.config.PreserveFDs > 0 { + logrus.Warnf("Ignoring LISTEN_FDS to preserve custom user-specified FDs") + } else { + fds, err := strconv.Atoi(val) + if err != nil { + return 0, fmt.Errorf("converting LISTEN_FDS=%s: %w", val, err) + } + preserveFDs = uint(fds) + } + } + + if preserveFDs > 0 { + args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", preserveFDs))...) + } + + if restoreOptions != nil { + args = append(args, "--restore", ctr.CheckpointPath()) + if restoreOptions.TCPEstablished { + args = append(args, "--runtime-opt", "--tcp-established") + } + if restoreOptions.FileLocks { + args = append(args, "--runtime-opt", "--file-locks") + } + if restoreOptions.Pod != "" { + mountLabel := ctr.config.MountLabel + processLabel := ctr.config.ProcessLabel + if mountLabel != "" { + args = append( + args, + "--runtime-opt", + fmt.Sprintf( + "--lsm-mount-context=%s", + mountLabel, + ), + ) + } + if processLabel != "" { + args = append( + args, + "--runtime-opt", + fmt.Sprintf( + "--lsm-profile=selinux:%s", + processLabel, + ), + ) + } + } + } + + logrus.WithFields(logrus.Fields{ + "args": args, + }).Debugf("running conmon: %s", r.conmonPath) + + cmd := exec.Command(r.conmonPath, args...) + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + // TODO this is probably a really bad idea for some uses + // Make this configurable + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if ctr.config.Spec.Process.Terminal { + cmd.Stderr = &stderrBuf + } + + // 0, 1 and 2 are stdin, stdout and stderr + conmonEnv := r.configureConmonEnv(runtimeDir) + + var filesToClose []*os.File + if preserveFDs > 0 { + for fd := 3; fd < int(3+preserveFDs); fd++ { + f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)) + filesToClose = append(filesToClose, f) + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + } + } + + cmd.Env = r.conmonEnv + // we don't want to step on users fds they asked to preserve + // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 + cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4)) + cmd.Env = append(cmd.Env, conmonEnv...) + cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe) + + if r.reservePorts && !rootless.IsRootless() && !ctr.config.NetMode.IsSlirp4netns() { + ports, err := bindPorts(ctr.convertPortMappings()) + if err != nil { + return 0, err + } + filesToClose = append(filesToClose, ports...) + + // Leak the port we bound in the conmon process. These fd's won't be used + // by the container and conmon will keep the ports busy so that another + // process cannot use them. + cmd.ExtraFiles = append(cmd.ExtraFiles, ports...) + } + + if ctr.config.NetMode.IsSlirp4netns() || rootless.IsRootless() { + if ctr.config.PostConfigureNetNS { + havePortMapping := len(ctr.config.PortMappings) > 0 + if havePortMapping { + ctr.rootlessPortSyncR, ctr.rootlessPortSyncW, err = os.Pipe() + if err != nil { + return 0, fmt.Errorf("failed to create rootless port sync pipe: %w", err) + } + } + ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe() + if err != nil { + return 0, fmt.Errorf("failed to create rootless network sync pipe: %w", err) + } + } else { + if ctr.rootlessSlirpSyncR != nil { + defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR) + } + if ctr.rootlessSlirpSyncW != nil { + defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW) + } + } + // Leak one end in conmon, the other one will be leaked into slirp4netns + cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW) + + if ctr.rootlessPortSyncW != nil { + defer errorhandling.CloseQuiet(ctr.rootlessPortSyncW) + // Leak one end in conmon, the other one will be leaked into rootlessport + cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessPortSyncW) + } + } + var runtimeRestoreStarted time.Time + if restoreOptions != nil { + runtimeRestoreStarted = time.Now() + } + err = startCommand(cmd, ctr) + + // regardless of whether we errored or not, we no longer need the children pipes + childSyncPipe.Close() + childStartPipe.Close() + if err != nil { + return 0, err + } + if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe); err != nil { + return 0, err + } + /* Wait for initial setup and fork, and reap child */ + err = cmd.Wait() + if err != nil { + return 0, err + } + + pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog) + if err != nil { + if err2 := r.DeleteContainer(ctr); err2 != nil { + logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID()) + } + return 0, err + } + ctr.state.PID = pid + + conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile) + if err != nil { + logrus.Warnf("Error reading conmon pid file for container %s: %v", ctr.ID(), err) + } else if conmonPID > 0 { + // conmon not having a pid file is a valid state, so don't set it if we don't have it + logrus.Infof("Got Conmon PID as %d", conmonPID) + ctr.state.ConmonPID = conmonPID + } + + runtimeRestoreDuration := func() int64 { + if restoreOptions != nil && restoreOptions.PrintStats { + return time.Since(runtimeRestoreStarted).Microseconds() + } + return 0 + }() + + // These fds were passed down to the runtime. Close them + // and not interfere + for _, f := range filesToClose { + errorhandling.CloseQuiet(f) + } + + return runtimeRestoreDuration, nil +} + +// configureConmonEnv gets the environment values to add to conmon's exec struct +// TODO this may want to be less hardcoded/more configurable in the future +func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) []string { + var env []string + for _, e := range os.Environ() { + if strings.HasPrefix(e, "LC_") { + env = append(env, e) + } + } + conf, ok := os.LookupEnv("CONTAINERS_CONF") + if ok { + env = append(env, fmt.Sprintf("CONTAINERS_CONF=%s", conf)) + } + env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) + env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED"))) + env = append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID"))) + home := homedir.Get() + if home != "" { + env = append(env, fmt.Sprintf("HOME=%s", home)) + } + + return env +} + +// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI +func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath, logDriver, logTag string) []string { + // set the conmon API version to be able to use the correct sync struct keys + args := []string{ + "--api-version", "1", + "-c", ctr.ID(), + "-u", cuuid, + "-r", r.path, + "-b", bundlePath, + "-p", pidPath, + "-n", ctr.Name(), + "--exit-dir", exitDir, + "--full-attach", + } + if len(r.runtimeFlags) > 0 { + rFlags := []string{} + for _, arg := range r.runtimeFlags { + rFlags = append(rFlags, "--runtime-arg", arg) + } + args = append(args, rFlags...) + } + + if ctr.CgroupManager() == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit { + args = append(args, "-s") + } + + var logDriverArg string + switch logDriver { + case define.JournaldLogging: + logDriverArg = define.JournaldLogging + case define.NoLogging: + logDriverArg = define.NoLogging + case define.PassthroughLogging: + logDriverArg = define.PassthroughLogging + //lint:ignore ST1015 the default case has to be here + default: //nolint:stylecheck,gocritic + // No case here should happen except JSONLogging, but keep this here in case the options are extended + logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver()) + fallthrough + case "": + // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod + // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough + fallthrough + case define.JSONLogging: + fallthrough + case define.KubernetesLogging: + logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath) + } + + args = append(args, "-l", logDriverArg) + logLevel := logrus.GetLevel() + args = append(args, "--log-level", logLevel.String()) + + if logLevel == logrus.DebugLevel { + logrus.Debugf("%s messages will be logged to syslog", r.conmonPath) + args = append(args, "--syslog") + } + + size := r.logSizeMax + if ctr.config.LogSize > 0 { + size = ctr.config.LogSize + } + if size > 0 { + args = append(args, "--log-size-max", fmt.Sprintf("%v", size)) + } + + if ociLogPath != "" { + args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath)) + } + if logTag != "" { + args = append(args, "--log-tag", logTag) + } + if ctr.config.NoCgroups { + logrus.Debugf("Running with no Cgroups") + args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled") + } + return args +} + +func startCommand(cmd *exec.Cmd, ctr *Container) error { + // Make sure to unset the NOTIFY_SOCKET and reset it afterwards if needed. + switch ctr.config.SdNotifyMode { + case define.SdNotifyModeContainer, define.SdNotifyModeIgnore: + if prev := os.Getenv("NOTIFY_SOCKET"); prev != "" { + if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil { + logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err) + } + defer func() { + if err := os.Setenv("NOTIFY_SOCKET", prev); err != nil { + logrus.Errorf("Resetting NOTIFY_SOCKET=%s", prev) + } + }() + } + } + + return cmd.Start() +} + +// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup +// it then signals for conmon to start by sending nonce data down the start fd +func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { + mustCreateCgroup := true + + if ctr.config.NoCgroups { + mustCreateCgroup = false + } + + // If cgroup creation is disabled - just signal. + switch ctr.config.CgroupsMode { + case "disabled", "no-conmon", cgroupSplit: + mustCreateCgroup = false + } + + // $INVOCATION_ID is set by systemd when running as a service. + if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" { + mustCreateCgroup = false + } + + if mustCreateCgroup { + // Usually rootless users are not allowed to configure cgroupfs. + // There are cases though, where it is allowed, e.g. if the cgroup + // is manually configured and chowned). Avoid detecting all + // such cases and simply use a lower log level. + logLevel := logrus.WarnLevel + if rootless.IsRootless() { + logLevel = logrus.InfoLevel + } + // TODO: This should be a switch - we are not guaranteed that + // there are only 2 valid cgroup managers + cgroupParent := ctr.CgroupParent() + cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") + Resource := ctr.Spec().Linux.Resources + cgroupResources, err := GetLimits(Resource) + if err != nil { + logrus.StandardLogger().Log(logLevel, "Could not get ctr resources") + } + if ctr.CgroupManager() == config.SystemdCgroupsManager { + unitName := createUnitName("libpod-conmon", ctr.ID()) + realCgroupParent := cgroupParent + splitParent := strings.Split(cgroupParent, "/") + if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { + realCgroupParent = splitParent[len(splitParent)-1] + } + + logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) + if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err) + } + } else { + control, err := cgroups.New(cgroupPath, &cgroupResources) + if err != nil { + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } else if err := control.AddPid(cmd.Process.Pid); err != nil { + // we need to remove this defer and delete the cgroup once conmon exits + // maybe need a conmon monitor? + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } + } + } + + /* We set the cgroup, now the child can start creating children */ + if err := writeConmonPipeData(startFd); err != nil { + return err + } + return nil +} + +// newPipe creates a unix socket pair for communication. +// Returns two files - first is parent, second is child. +func newPipe() (*os.File, *os.File, error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} + +// readConmonPidFile attempts to read conmon's pid from its pid file +func readConmonPidFile(pidFile string) (int, error) { + // Let's try reading the Conmon pid at the same time. + if pidFile != "" { + contents, err := ioutil.ReadFile(pidFile) + if err != nil { + return -1, err + } + // Convert it to an int + conmonPID, err := strconv.Atoi(string(contents)) + if err != nil { + return -1, err + } + return conmonPID, nil + } + return 0, nil +} + +// readConmonPipeData attempts to read a syncInfo struct from the pipe +func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) { + // syncInfo is used to return data from monitor process to daemon + type syncInfo struct { + Data int `json:"data"` + Message string `json:"message,omitempty"` + } + + // Wait to get container pid from conmon + type syncStruct struct { + si *syncInfo + err error + } + ch := make(chan syncStruct) + go func() { + var si *syncInfo + rdr := bufio.NewReader(pipe) + b, err := rdr.ReadBytes('\n') + // ignore EOF here, error is returned even when data was read + // if it is no valid json unmarshal will fail below + if err != nil && !errors.Is(err, io.EOF) { + ch <- syncStruct{err: err} + } + if err := json.Unmarshal(b, &si); err != nil { + ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)} + return + } + ch <- syncStruct{si: si} + }() + + data := -1 //nolint: wastedassign + select { + case ss := <-ch: + if ss.err != nil { + if ociLog != "" { + ociLogData, err := ioutil.ReadFile(ociLog) + if err == nil { + var ociErr ociError + if err := json.Unmarshal(ociLogData, &ociErr); err == nil { + return -1, getOCIRuntimeError(runtimeName, ociErr.Msg) + } + } + } + return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err) + } + logrus.Debugf("Received: %d", ss.si.Data) + if ss.si.Data < 0 { + if ociLog != "" { + ociLogData, err := ioutil.ReadFile(ociLog) + if err == nil { + var ociErr ociError + if err := json.Unmarshal(ociLogData, &ociErr); err == nil { + return ss.si.Data, getOCIRuntimeError(runtimeName, ociErr.Msg) + } + } + } + // If we failed to parse the JSON errors, then print the output as it is + if ss.si.Message != "" { + return ss.si.Data, getOCIRuntimeError(runtimeName, ss.si.Message) + } + return ss.si.Data, fmt.Errorf("container create failed: %w", define.ErrInternal) + } + data = ss.si.Data + case <-time.After(define.ContainerCreateTimeout): + return -1, fmt.Errorf("container creation timeout: %w", define.ErrInternal) + } + return data, nil +} + +// writeConmonPipeData writes nonce data to a pipe +func writeConmonPipeData(pipe *os.File) error { + someData := []byte{0} + _, err := pipe.Write(someData) + return err +} + +// formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon +func formatRuntimeOpts(opts ...string) []string { + args := make([]string, 0, len(opts)*2) + for _, o := range opts { + args = append(args, "--runtime-opt", o) + } + return args +} + +// getConmonVersion returns a string representation of the conmon version. +func (r *ConmonOCIRuntime) getConmonVersion() (string, error) { + output, err := utils.ExecCmd(r.conmonPath, "--version") + if err != nil { + return "", err + } + return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil +} + +// getOCIRuntimeVersion returns a string representation of the OCI runtime's +// version. +func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) { + output, err := utils.ExecCmd(r.path, "--version") + if err != nil { + return "", err + } + return strings.TrimSuffix(output, "\n"), nil +} + +// Copy data from container to HTTP connection, for terminal attach. +// Container is the container's attach socket connection, http is a buffer for +// the HTTP connection. cid is the ID of the container the attach session is +// running for (used solely for error messages). +func httpAttachTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string) error { + buf := make([]byte, bufferSize) + for { + numR, err := container.Read(buf) + logrus.Debugf("Read fd(%d) %d/%d bytes for container %s", int(buf[0]), numR, len(buf), cid) + + if numR > 0 { + switch buf[0] { + case AttachPipeStdout: + // Do nothing + default: + logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) + continue + } + + numW, err2 := http.Write(buf[1:numR]) + if err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s STDOUT: %v", cid, err) + } + return err2 + } else if numW+1 != numR { + return io.ErrShortWrite + } + // We need to force the buffer to write immediately, so + // there isn't a delay on the terminal side. + if err2 := http.Flush(); err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s STDOUT: %v", cid, err) + } + return err2 + } + } + if err != nil { + if err == io.EOF { + return nil + } + return err + } + } +} + +// Copy data from a container to an HTTP connection, for non-terminal attach. +// Appends a header to multiplex input. +func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string, stdin, stdout, stderr bool) error { + buf := make([]byte, bufferSize) + for { + numR, err := container.Read(buf) + if numR > 0 { + var headerBuf []byte + + // Subtract 1 because we strip the first byte (used for + // multiplexing by Conmon). + headerLen := uint32(numR - 1) + // Practically speaking, we could make this buf[0] - 1, + // but we need to validate it anyway. + switch buf[0] { + case AttachPipeStdin: + headerBuf = makeHTTPAttachHeader(0, headerLen) + if !stdin { + continue + } + case AttachPipeStdout: + if !stdout { + continue + } + headerBuf = makeHTTPAttachHeader(1, headerLen) + case AttachPipeStderr: + if !stderr { + continue + } + headerBuf = makeHTTPAttachHeader(2, headerLen) + default: + logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) + continue + } + + numH, err2 := http.Write(headerBuf) + if err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return err2 + } + // Hardcoding header length is pretty gross, but + // fast. Should be safe, as this is a fixed part + // of the protocol. + if numH != 8 { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return io.ErrShortWrite + } + + numW, err2 := http.Write(buf[1:numR]) + if err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return err2 + } else if numW+1 != numR { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return io.ErrShortWrite + } + // We need to force the buffer to write immediately, so + // there isn't a delay on the terminal side. + if err2 := http.Flush(); err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s STDOUT: %v", cid, err) + } + return err2 + } + } + if err != nil { + if err == io.EOF { + return nil + } + + return err + } + } +} + +// GetLimits converts spec resource limits to cgroup consumable limits +func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) { + if resource == nil { + resource = &spec.LinuxResources{} + } + final := &runcconfig.Resources{} + devs := []*devices.Rule{} + + // Devices + for _, entry := range resource.Devices { + if entry.Major == nil || entry.Minor == nil { + continue + } + runeType := 'a' + switch entry.Type { + case "b": + runeType = 'b' + case "c": + runeType = 'c' + } + + devs = append(devs, &devices.Rule{ + Type: devices.Type(runeType), + Major: *entry.Major, + Minor: *entry.Minor, + Permissions: devices.Permissions(entry.Access), + Allow: entry.Allow, + }) + } + final.Devices = devs + + // HugepageLimits + pageLimits := []*runcconfig.HugepageLimit{} + for _, entry := range resource.HugepageLimits { + pageLimits = append(pageLimits, &runcconfig.HugepageLimit{ + Pagesize: entry.Pagesize, + Limit: entry.Limit, + }) + } + final.HugetlbLimit = pageLimits + + // Networking + netPriorities := []*runcconfig.IfPrioMap{} + if resource.Network != nil { + for _, entry := range resource.Network.Priorities { + netPriorities = append(netPriorities, &runcconfig.IfPrioMap{ + Interface: entry.Name, + Priority: int64(entry.Priority), + }) + } + } + final.NetPrioIfpriomap = netPriorities + rdma := make(map[string]runcconfig.LinuxRdma) + for name, entry := range resource.Rdma { + rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects} + } + final.Rdma = rdma + + // Memory + if resource.Memory != nil { + if resource.Memory.Limit != nil { + final.Memory = *resource.Memory.Limit + } + if resource.Memory.Reservation != nil { + final.MemoryReservation = *resource.Memory.Reservation + } + if resource.Memory.Swap != nil { + final.MemorySwap = *resource.Memory.Swap + } + if resource.Memory.Swappiness != nil { + final.MemorySwappiness = resource.Memory.Swappiness + } + } + + // CPU + if resource.CPU != nil { + if resource.CPU.Period != nil { + final.CpuPeriod = *resource.CPU.Period + } + if resource.CPU.Quota != nil { + final.CpuQuota = *resource.CPU.Quota + } + if resource.CPU.RealtimePeriod != nil { + final.CpuRtPeriod = *resource.CPU.RealtimePeriod + } + if resource.CPU.RealtimeRuntime != nil { + final.CpuRtRuntime = *resource.CPU.RealtimeRuntime + } + if resource.CPU.Shares != nil { + final.CpuShares = *resource.CPU.Shares + } + final.CpusetCpus = resource.CPU.Cpus + final.CpusetMems = resource.CPU.Mems + } + + // BlkIO + if resource.BlockIO != nil { + if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle) + } + } + if resource.BlockIO.LeafWeight != nil { + final.BlkioLeafWeight = *resource.BlockIO.LeafWeight + } + if resource.BlockIO.Weight != nil { + final.BlkioWeight = *resource.BlockIO.Weight + } + if len(resource.BlockIO.WeightDevice) > 0 { + for _, entry := range resource.BlockIO.WeightDevice { + weight := &runcconfig.WeightDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + if entry.Weight != nil { + weight.Weight = *entry.Weight + } + if entry.LeafWeight != nil { + weight.LeafWeight = *entry.LeafWeight + } + weight.BlockIODevice = *dev + final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight) + } + } + } + + // Pids + if resource.Pids != nil { + final.PidsLimit = resource.Pids.Limit + } + + // Networking + if resource.Network != nil { + if resource.Network.ClassID != nil { + final.NetClsClassid = *resource.Network.ClassID + } + } + + // Unified state + final.Unified = resource.Unified + + return *final, nil +} diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go deleted file mode 100644 index 1b654ed33..000000000 --- a/libpod/oci_conmon_linux.go +++ /dev/null @@ -1,1929 +0,0 @@ -//go:build linux -// +build linux - -package libpod - -import ( - "bufio" - "bytes" - "context" - "errors" - "fmt" - "io" - "io/ioutil" - "net" - "net/http" - "os" - "os/exec" - "path/filepath" - "runtime" - "strconv" - "strings" - "sync" - "syscall" - "text/template" - "time" - - runcconfig "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" - - "github.com/containers/common/pkg/cgroups" - "github.com/containers/common/pkg/config" - "github.com/containers/common/pkg/resize" - cutil "github.com/containers/common/pkg/util" - conmonConfig "github.com/containers/conmon/runner/config" - "github.com/containers/podman/v4/libpod/define" - "github.com/containers/podman/v4/libpod/logs" - "github.com/containers/podman/v4/pkg/checkpoint/crutils" - "github.com/containers/podman/v4/pkg/errorhandling" - "github.com/containers/podman/v4/pkg/rootless" - "github.com/containers/podman/v4/pkg/specgenutil" - "github.com/containers/podman/v4/pkg/util" - "github.com/containers/podman/v4/utils" - "github.com/containers/storage/pkg/homedir" - pmount "github.com/containers/storage/pkg/mount" - spec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux/label" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" -) - -const ( - // This is Conmon's STDIO_BUF_SIZE. I don't believe we have access to it - // directly from the Go code, so const it here - // Important: The conmon attach socket uses an extra byte at the beginning of each - // message to specify the STREAM so we have to increase the buffer size by one - bufferSize = conmonConfig.BufSize + 1 -) - -// ConmonOCIRuntime is an OCI runtime managed by Conmon. -// TODO: Make all calls to OCI runtime have a timeout. -type ConmonOCIRuntime struct { - name string - path string - conmonPath string - conmonEnv []string - tmpDir string - exitsDir string - logSizeMax int64 - noPivot bool - reservePorts bool - runtimeFlags []string - supportsJSON bool - supportsKVM bool - supportsNoCgroups bool - enableKeyring bool -} - -// Make a new Conmon-based OCI runtime with the given options. -// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or -// any runtime with a runc-compatible CLI. -// The first path that points to a valid executable will be used. -// Deliberately private. Someone should not be able to construct this outside of -// libpod. -func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeFlags []string, runtimeCfg *config.Config) (OCIRuntime, error) { - if name == "" { - return nil, fmt.Errorf("the OCI runtime must be provided a non-empty name: %w", define.ErrInvalidArg) - } - - // Make lookup tables for runtime support - supportsJSON := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsJSON)) - supportsNoCgroups := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsNoCgroups)) - supportsKVM := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsKVM)) - for _, r := range runtimeCfg.Engine.RuntimeSupportsJSON { - supportsJSON[r] = true - } - for _, r := range runtimeCfg.Engine.RuntimeSupportsNoCgroups { - supportsNoCgroups[r] = true - } - for _, r := range runtimeCfg.Engine.RuntimeSupportsKVM { - supportsKVM[r] = true - } - - runtime := new(ConmonOCIRuntime) - runtime.name = name - runtime.conmonPath = conmonPath - runtime.runtimeFlags = runtimeFlags - - runtime.conmonEnv = runtimeCfg.Engine.ConmonEnvVars - runtime.tmpDir = runtimeCfg.Engine.TmpDir - runtime.logSizeMax = runtimeCfg.Containers.LogSizeMax - runtime.noPivot = runtimeCfg.Engine.NoPivotRoot - runtime.reservePorts = runtimeCfg.Engine.EnablePortReservation - runtime.enableKeyring = runtimeCfg.Containers.EnableKeyring - - // TODO: probe OCI runtime for feature and enable automatically if - // available. - - base := filepath.Base(name) - runtime.supportsJSON = supportsJSON[base] - runtime.supportsNoCgroups = supportsNoCgroups[base] - runtime.supportsKVM = supportsKVM[base] - - foundPath := false - for _, path := range paths { - stat, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - continue - } - return nil, fmt.Errorf("cannot stat OCI runtime %s path: %w", name, err) - } - if !stat.Mode().IsRegular() { - continue - } - foundPath = true - logrus.Tracef("found runtime %q", path) - runtime.path = path - break - } - - // Search the $PATH as last fallback - if !foundPath { - if foundRuntime, err := exec.LookPath(name); err == nil { - foundPath = true - runtime.path = foundRuntime - logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime) - } - } - - if !foundPath { - return nil, fmt.Errorf("no valid executable found for OCI runtime %s: %w", name, define.ErrInvalidArg) - } - - runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits") - - // Create the exit files and attach sockets directories - if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil { - // The directory is allowed to exist - if !os.IsExist(err) { - return nil, fmt.Errorf("error creating OCI runtime exit files directory: %w", err) - } - } - return runtime, nil -} - -// Name returns the name of the runtime being wrapped by Conmon. -func (r *ConmonOCIRuntime) Name() string { - return r.name -} - -// Path returns the path of the OCI runtime being wrapped by Conmon. -func (r *ConmonOCIRuntime) Path() string { - return r.path -} - -// hasCurrentUserMapped checks whether the current user is mapped inside the container user namespace -func hasCurrentUserMapped(ctr *Container) bool { - if len(ctr.config.IDMappings.UIDMap) == 0 && len(ctr.config.IDMappings.GIDMap) == 0 { - return true - } - uid := os.Geteuid() - for _, m := range ctr.config.IDMappings.UIDMap { - if uid >= m.HostID && uid < m.HostID+m.Size { - return true - } - } - return false -} - -// CreateContainer creates a container. -func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { - // always make the run dir accessible to the current user so that the PID files can be read without - // being in the rootless user namespace. - if err := makeAccessible(ctr.state.RunDir, 0, 0); err != nil { - return 0, err - } - if !hasCurrentUserMapped(ctr) { - for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.Engine.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.Engine.VolumePath} { - if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { - return 0, err - } - } - - // if we are running a non privileged container, be sure to umount some kernel paths so they are not - // bind mounted inside the container at all. - if !ctr.config.Privileged && !rootless.IsRootless() { - type result struct { - restoreDuration int64 - err error - } - ch := make(chan result) - go func() { - runtime.LockOSThread() - restoreDuration, err := func() (int64, error) { - fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) - if err != nil { - return 0, err - } - defer errorhandling.CloseQuiet(fd) - - // create a new mountns on the current thread - if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { - return 0, err - } - defer func() { - if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { - logrus.Errorf("Unable to clone new namespace: %q", err) - } - }() - - // don't spread our mounts around. We are setting only /sys to be slave - // so that the cleanup process is still able to umount the storage and the - // changes are propagated to the host. - err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") - if err != nil { - return 0, fmt.Errorf("cannot make /sys slave: %w", err) - } - - mounts, err := pmount.GetMounts() - if err != nil { - return 0, err - } - for _, m := range mounts { - if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { - continue - } - err = unix.Unmount(m.Mountpoint, 0) - if err != nil && !os.IsNotExist(err) { - return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) - } - } - return r.createOCIContainer(ctr, restoreOptions) - }() - ch <- result{ - restoreDuration: restoreDuration, - err: err, - } - }() - r := <-ch - return r.restoreDuration, r.err - } - } - return r.createOCIContainer(ctr, restoreOptions) -} - -// UpdateContainerStatus retrieves the current status of the container from the -// runtime. It updates the container's state but does not save it. -// If useRuntime is false, we will not directly hit runc to see the container's -// status, but will instead only check for the existence of the conmon exit file -// and update state to stopped if it exists. -func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - - // Store old state so we know if we were already stopped - oldState := ctr.state.State - - state := new(spec.State) - - cmd := exec.Command(r.path, "state", ctr.ID()) - cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) - - outPipe, err := cmd.StdoutPipe() - if err != nil { - return fmt.Errorf("getting stdout pipe: %w", err) - } - errPipe, err := cmd.StderrPipe() - if err != nil { - return fmt.Errorf("getting stderr pipe: %w", err) - } - - if err := cmd.Start(); err != nil { - out, err2 := ioutil.ReadAll(errPipe) - if err2 != nil { - return fmt.Errorf("error getting container %s state: %w", ctr.ID(), err) - } - if strings.Contains(string(out), "does not exist") || strings.Contains(string(out), "No such file") { - if err := ctr.removeConmonFiles(); err != nil { - logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) - } - ctr.state.ExitCode = -1 - ctr.state.FinishedTime = time.Now() - ctr.state.State = define.ContainerStateExited - return ctr.runtime.state.AddContainerExitCode(ctr.ID(), ctr.state.ExitCode) - } - return fmt.Errorf("error getting container %s state. stderr/out: %s: %w", ctr.ID(), out, err) - } - defer func() { - _ = cmd.Wait() - }() - - if err := errPipe.Close(); err != nil { - return err - } - out, err := ioutil.ReadAll(outPipe) - if err != nil { - return fmt.Errorf("error reading stdout: %s: %w", ctr.ID(), err) - } - if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil { - return fmt.Errorf("error decoding container status for container %s: %w", ctr.ID(), err) - } - ctr.state.PID = state.Pid - - switch state.Status { - case "created": - ctr.state.State = define.ContainerStateCreated - case "paused": - ctr.state.State = define.ContainerStatePaused - case "running": - ctr.state.State = define.ContainerStateRunning - case "stopped": - ctr.state.State = define.ContainerStateStopped - default: - return fmt.Errorf("unrecognized status returned by runtime for container %s: %s: %w", - ctr.ID(), state.Status, define.ErrInternal) - } - - // Only grab exit status if we were not already stopped - // If we were, it should already be in the database - if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { - if _, err := ctr.Wait(context.Background()); err != nil { - logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err) - } - return nil - } - - // Handle ContainerStateStopping - keep it unless the container - // transitioned to no longer running. - if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) { - ctr.state.State = define.ContainerStateStopping - } - - return nil -} - -// StartContainer starts the given container. -// Sets time the container was started, but does not save it. -func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { - // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers? - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if path, ok := os.LookupEnv("PATH"); ok { - env = append(env, fmt.Sprintf("PATH=%s", path)) - } - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "start", ctr.ID())...); err != nil { - return err - } - - ctr.state.StartedTime = time.Now() - - return nil -} - -// KillContainer sends the given signal to the given container. -// If all is set, send to all PIDs in the container. -// All is only supported if the container created cgroups. -func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error { - logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - var args []string - args = append(args, r.runtimeFlags...) - if all { - args = append(args, "kill", "--all", ctr.ID(), fmt.Sprintf("%d", signal)) - } else { - args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal)) - } - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { - // Update container state - there's a chance we failed because - // the container exited in the meantime. - if err2 := r.UpdateContainerStatus(ctr); err2 != nil { - logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2) - } - if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { - return define.ErrCtrStateInvalid - } - return fmt.Errorf("error sending signal to container %s: %w", ctr.ID(), err) - } - - return nil -} - -// StopContainer stops a container, first using its given stop signal (or -// SIGTERM if no signal was specified), then using SIGKILL. -// Timeout is given in seconds. If timeout is 0, the container will be -// immediately kill with SIGKILL. -// Does not set finished time for container, assumes you will run updateStatus -// after to pull the exit code. -func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error { - logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) - - // Ping the container to see if it's alive - // If it's not, it's already stopped, return - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - stopSignal := ctr.config.StopSignal - if stopSignal == 0 { - stopSignal = uint(syscall.SIGTERM) - } - - if timeout > 0 { - if err := r.KillContainer(ctr, stopSignal, all); err != nil { - // Is the container gone? - // If so, it probably died between the first check and - // our sending the signal - // The container is stopped, so exit cleanly - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - return err - } - - if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { - logrus.Debugf("Timed out stopping container %s with %s, resorting to SIGKILL: %v", ctr.ID(), unix.SignalName(syscall.Signal(stopSignal)), err) - logrus.Warnf("StopSignal %s failed to stop container %s in %d seconds, resorting to SIGKILL", unix.SignalName(syscall.Signal(stopSignal)), ctr.Name(), timeout) - } else { - // No error, the container is dead - return nil - } - } - - if err := r.KillContainer(ctr, 9, all); err != nil { - // Again, check if the container is gone. If it is, exit cleanly. - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - return fmt.Errorf("error sending SIGKILL to container %s: %w", ctr.ID(), err) - } - - // Give runtime a few seconds to make it happen - if err := waitContainerStop(ctr, killContainerTimeout); err != nil { - return err - } - - return nil -} - -// DeleteContainer deletes a container from the OCI runtime. -func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "delete", "--force", ctr.ID())...) -} - -// PauseContainer pauses the given container. -func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "pause", ctr.ID())...) -} - -// UnpauseContainer unpauses the given container. -func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "resume", ctr.ID())...) -} - -// HTTPAttach performs an attach for the HTTP API. -// The caller must handle closing the HTTP connection after this returns. -// The cancel channel is not closed; it is up to the caller to do so after -// this function returns. -// If this is a container with a terminal, we will stream raw. If it is not, we -// will stream with an 8-byte header to multiplex STDOUT and STDERR. -// Returns any errors that occurred, and whether the connection was successfully -// hijacked before that error occurred. -func (r *ConmonOCIRuntime) HTTPAttach(ctr *Container, req *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, detachKeys *string, cancel <-chan bool, hijackDone chan<- bool, streamAttach, streamLogs bool) (deferredErr error) { - isTerminal := false - if ctr.config.Spec.Process != nil { - isTerminal = ctr.config.Spec.Process.Terminal - } - - if streams != nil { - if !streams.Stdin && !streams.Stdout && !streams.Stderr { - return fmt.Errorf("must specify at least one stream to attach to: %w", define.ErrInvalidArg) - } - } - - attachSock, err := r.AttachSocketPath(ctr) - if err != nil { - return err - } - - var conn *net.UnixConn - if streamAttach { - newConn, err := openUnixSocket(attachSock) - if err != nil { - return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err) - } - conn = newConn - defer func() { - if err := conn.Close(); err != nil { - logrus.Errorf("Unable to close container %s attach socket: %q", ctr.ID(), err) - } - }() - - logrus.Debugf("Successfully connected to container %s attach socket %s", ctr.ID(), attachSock) - } - - detachString := ctr.runtime.config.Engine.DetachKeys - if detachKeys != nil { - detachString = *detachKeys - } - detach, err := processDetachKeys(detachString) - if err != nil { - return err - } - - attachStdout := true - attachStderr := true - attachStdin := true - if streams != nil { - attachStdout = streams.Stdout - attachStderr = streams.Stderr - attachStdin = streams.Stdin - } - - logrus.Debugf("Going to hijack container %s attach connection", ctr.ID()) - - // Alright, let's hijack. - hijacker, ok := w.(http.Hijacker) - if !ok { - return fmt.Errorf("unable to hijack connection") - } - - httpCon, httpBuf, err := hijacker.Hijack() - if err != nil { - return fmt.Errorf("error hijacking connection: %w", err) - } - - hijackDone <- true - - writeHijackHeader(req, httpBuf) - - // Force a flush after the header is written. - if err := httpBuf.Flush(); err != nil { - return fmt.Errorf("error flushing HTTP hijack header: %w", err) - } - - defer func() { - hijackWriteErrorAndClose(deferredErr, ctr.ID(), isTerminal, httpCon, httpBuf) - }() - - logrus.Debugf("Hijack for container %s attach session done, ready to stream", ctr.ID()) - - // TODO: This is gross. Really, really gross. - // I want to say we should read all the logs into an array before - // calling this, in container_api.go, but that could take a lot of - // memory... - // On the whole, we need to figure out a better way of doing this, - // though. - logSize := 0 - if streamLogs { - logrus.Debugf("Will stream logs for container %s attach session", ctr.ID()) - - // Get all logs for the container - logChan := make(chan *logs.LogLine) - logOpts := new(logs.LogOptions) - logOpts.Tail = -1 - logOpts.WaitGroup = new(sync.WaitGroup) - errChan := make(chan error) - go func() { - var err error - // In non-terminal mode we need to prepend with the - // stream header. - logrus.Debugf("Writing logs for container %s to HTTP attach", ctr.ID()) - for logLine := range logChan { - if !isTerminal { - device := logLine.Device - var header []byte - headerLen := uint32(len(logLine.Msg)) - logSize += len(logLine.Msg) - switch strings.ToLower(device) { - case "stdin": - header = makeHTTPAttachHeader(0, headerLen) - case "stdout": - header = makeHTTPAttachHeader(1, headerLen) - case "stderr": - header = makeHTTPAttachHeader(2, headerLen) - default: - logrus.Errorf("Unknown device for log line: %s", device) - header = makeHTTPAttachHeader(1, headerLen) - } - _, err = httpBuf.Write(header) - if err != nil { - break - } - } - _, err = httpBuf.Write([]byte(logLine.Msg)) - if err != nil { - break - } - if !logLine.Partial() { - _, err = httpBuf.Write([]byte("\n")) - if err != nil { - break - } - } - err = httpBuf.Flush() - if err != nil { - break - } - } - errChan <- err - }() - if err := ctr.ReadLog(context.Background(), logOpts, logChan, 0); err != nil { - return err - } - go func() { - logOpts.WaitGroup.Wait() - close(logChan) - }() - logrus.Debugf("Done reading logs for container %s, %d bytes", ctr.ID(), logSize) - if err := <-errChan; err != nil { - return err - } - } - if !streamAttach { - logrus.Debugf("Done streaming logs for container %s attach, exiting as attach streaming not requested", ctr.ID()) - return nil - } - - logrus.Debugf("Forwarding attach output for container %s", ctr.ID()) - - stdoutChan := make(chan error) - stdinChan := make(chan error) - - // Handle STDOUT/STDERR - go func() { - var err error - if isTerminal { - // Hack: return immediately if attachStdout not set to - // emulate Docker. - // Basically, when terminal is set, STDERR goes nowhere. - // Everything does over STDOUT. - // Therefore, if not attaching STDOUT - we'll never copy - // anything from here. - logrus.Debugf("Performing terminal HTTP attach for container %s", ctr.ID()) - if attachStdout { - err = httpAttachTerminalCopy(conn, httpBuf, ctr.ID()) - } - } else { - logrus.Debugf("Performing non-terminal HTTP attach for container %s", ctr.ID()) - err = httpAttachNonTerminalCopy(conn, httpBuf, ctr.ID(), attachStdin, attachStdout, attachStderr) - } - stdoutChan <- err - logrus.Debugf("STDOUT/ERR copy completed") - }() - // Next, STDIN. Avoid entirely if attachStdin unset. - if attachStdin { - go func() { - _, err := cutil.CopyDetachable(conn, httpBuf, detach) - logrus.Debugf("STDIN copy completed") - stdinChan <- err - }() - } - - for { - select { - case err := <-stdoutChan: - if err != nil { - return err - } - - return nil - case err := <-stdinChan: - if err != nil { - return err - } - // copy stdin is done, close it - if connErr := conn.CloseWrite(); connErr != nil { - logrus.Errorf("Unable to close conn: %v", connErr) - } - case <-cancel: - return nil - } - } -} - -// isRetryable returns whether the error was caused by a blocked syscall or the -// specified operation on a non blocking file descriptor wasn't ready for completion. -func isRetryable(err error) bool { - var errno syscall.Errno - if errors.As(err, &errno) { - return errno == syscall.EINTR || errno == syscall.EAGAIN - } - return false -} - -// openControlFile opens the terminal control file. -func openControlFile(ctr *Container, parentDir string) (*os.File, error) { - controlPath := filepath.Join(parentDir, "ctl") - for i := 0; i < 600; i++ { - controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY|unix.O_NONBLOCK, 0) - if err == nil { - return controlFile, nil - } - if !isRetryable(err) { - return nil, fmt.Errorf("could not open ctl file for terminal resize for container %s: %w", ctr.ID(), err) - } - time.Sleep(time.Second / 10) - } - return nil, fmt.Errorf("timeout waiting for %q", controlPath) -} - -// AttachResize resizes the terminal used by the given container. -func (r *ConmonOCIRuntime) AttachResize(ctr *Container, newSize resize.TerminalSize) error { - controlFile, err := openControlFile(ctr, ctr.bundlePath()) - if err != nil { - return err - } - defer controlFile.Close() - - logrus.Debugf("Received a resize event for container %s: %+v", ctr.ID(), newSize) - if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil { - return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err) - } - - return nil -} - -// CheckpointContainer checkpoints the given container. -func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error) { - // imagePath is used by CRIU to store the actual checkpoint files - imagePath := ctr.CheckpointPath() - if options.PreCheckPoint { - imagePath = ctr.PreCheckPointPath() - } - // workPath will be used to store dump.log and stats-dump - workPath := ctr.bundlePath() - logrus.Debugf("Writing checkpoint to %s", imagePath) - logrus.Debugf("Writing checkpoint logs to %s", workPath) - logrus.Debugf("Pre-dump the container %t", options.PreCheckPoint) - args := []string{} - args = append(args, r.runtimeFlags...) - args = append(args, "checkpoint") - args = append(args, "--image-path") - args = append(args, imagePath) - args = append(args, "--work-path") - args = append(args, workPath) - if options.KeepRunning { - args = append(args, "--leave-running") - } - if options.TCPEstablished { - args = append(args, "--tcp-established") - } - if options.FileLocks { - args = append(args, "--file-locks") - } - if !options.PreCheckPoint && options.KeepRunning { - args = append(args, "--leave-running") - } - if options.PreCheckPoint { - args = append(args, "--pre-dump") - } - if !options.PreCheckPoint && options.WithPrevious { - args = append( - args, - "--parent-path", - filepath.Join("..", preCheckpointDir), - ) - } - - args = append(args, ctr.ID()) - logrus.Debugf("the args to checkpoint: %s %s", r.path, strings.Join(args, " ")) - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return 0, err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if path, ok := os.LookupEnv("PATH"); ok { - env = append(env, fmt.Sprintf("PATH=%s", path)) - } - - runtime.LockOSThread() - if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { - return 0, err - } - - runtimeCheckpointStarted := time.Now() - err = utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...) - // Ignore error returned from SetSocketLabel("") call, - // can't recover. - if labelErr := label.SetSocketLabel(""); labelErr == nil { - // Unlock the thread only if the process label could be restored - // successfully. Otherwise leave the thread locked and the Go runtime - // will terminate it once it returns to the threads pool. - runtime.UnlockOSThread() - } else { - logrus.Errorf("Unable to reset socket label: %q", labelErr) - } - - runtimeCheckpointDuration := func() int64 { - if options.PrintStats { - return time.Since(runtimeCheckpointStarted).Microseconds() - } - return 0 - }() - - return runtimeCheckpointDuration, err -} - -func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) { - if ctr.state.ConmonPID == 0 { - // If the container is running or paused, assume Conmon is - // running. We didn't record Conmon PID on some old versions, so - // that is likely what's going on... - // Unusual enough that we should print a warning message though. - if ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) { - logrus.Warnf("Conmon PID is not set, but container is running!") - return true, nil - } - // Container's not running, so conmon PID being unset is - // expected. Conmon is not running. - return false, nil - } - - // We have a conmon PID. Ping it with signal 0. - if err := unix.Kill(ctr.state.ConmonPID, 0); err != nil { - if err == unix.ESRCH { - return false, nil - } - return false, fmt.Errorf("error pinging container %s conmon with signal 0: %w", ctr.ID(), err) - } - return true, nil -} - -// SupportsCheckpoint checks if the OCI runtime supports checkpointing -// containers. -func (r *ConmonOCIRuntime) SupportsCheckpoint() bool { - return crutils.CRRuntimeSupportsCheckpointRestore(r.path) -} - -// SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error -// messages. -func (r *ConmonOCIRuntime) SupportsJSONErrors() bool { - return r.supportsJSON -} - -// SupportsNoCgroups checks if the OCI runtime supports running containers -// without cgroups (the --cgroup-manager=disabled flag). -func (r *ConmonOCIRuntime) SupportsNoCgroups() bool { - return r.supportsNoCgroups -} - -// SupportsKVM checks if the OCI runtime supports running containers -// without KVM separation -func (r *ConmonOCIRuntime) SupportsKVM() bool { - return r.supportsKVM -} - -// AttachSocketPath is the path to a single container's attach socket. -func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) { - if ctr == nil { - return "", fmt.Errorf("must provide a valid container to get attach socket path: %w", define.ErrInvalidArg) - } - - return filepath.Join(ctr.bundlePath(), "attach"), nil -} - -// ExitFilePath is the path to a container's exit file. -func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) { - if ctr == nil { - return "", fmt.Errorf("must provide a valid container to get exit file path: %w", define.ErrInvalidArg) - } - return filepath.Join(r.exitsDir, ctr.ID()), nil -} - -// RuntimeInfo provides information on the runtime. -func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) { - runtimePackage := packageVersion(r.path) - conmonPackage := packageVersion(r.conmonPath) - runtimeVersion, err := r.getOCIRuntimeVersion() - if err != nil { - return nil, nil, fmt.Errorf("error getting version of OCI runtime %s: %w", r.name, err) - } - conmonVersion, err := r.getConmonVersion() - if err != nil { - return nil, nil, fmt.Errorf("error getting conmon version: %w", err) - } - - conmon := define.ConmonInfo{ - Package: conmonPackage, - Path: r.conmonPath, - Version: conmonVersion, - } - ocirt := define.OCIRuntimeInfo{ - Name: r.name, - Path: r.path, - Package: runtimePackage, - Version: runtimeVersion, - } - return &conmon, &ocirt, nil -} - -// makeAccessible changes the path permission and each parent directory to have --x--x--x -func makeAccessible(path string, uid, gid int) error { - for ; path != "/"; path = filepath.Dir(path) { - st, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - return nil - } - return err - } - if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { - continue - } - if st.Mode()&0111 != 0111 { - if err := os.Chmod(path, st.Mode()|0111); err != nil { - return err - } - } - } - return nil -} - -// Wait for a container which has been sent a signal to stop -func waitContainerStop(ctr *Container, timeout time.Duration) error { - return waitPidStop(ctr.state.PID, timeout) -} - -// Wait for a given PID to stop -func waitPidStop(pid int, timeout time.Duration) error { - done := make(chan struct{}) - chControl := make(chan struct{}) - go func() { - for { - select { - case <-chControl: - return - default: - if err := unix.Kill(pid, 0); err != nil { - if err == unix.ESRCH { - close(done) - return - } - logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err) - } - time.Sleep(100 * time.Millisecond) - } - } - }() - select { - case <-done: - return nil - case <-time.After(timeout): - close(chControl) - return fmt.Errorf("given PIDs did not die within timeout") - } -} - -func (r *ConmonOCIRuntime) getLogTag(ctr *Container) (string, error) { - logTag := ctr.LogTag() - if logTag == "" { - return "", nil - } - data, err := ctr.inspectLocked(false) - if err != nil { - // FIXME: this error should probably be returned - return "", nil //nolint: nilerr - } - tmpl, err := template.New("container").Parse(logTag) - if err != nil { - return "", fmt.Errorf("template parsing error %s: %w", logTag, err) - } - var b bytes.Buffer - err = tmpl.Execute(&b, data) - if err != nil { - return "", err - } - return b.String(), nil -} - -// createOCIContainer generates this container's main conmon instance and prepares it for starting -func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { - var stderrBuf bytes.Buffer - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return 0, err - } - - parentSyncPipe, childSyncPipe, err := newPipe() - if err != nil { - return 0, fmt.Errorf("error creating socket pair: %w", err) - } - defer errorhandling.CloseQuiet(parentSyncPipe) - - childStartPipe, parentStartPipe, err := newPipe() - if err != nil { - return 0, fmt.Errorf("error creating socket pair for start pipe: %w", err) - } - - defer errorhandling.CloseQuiet(parentStartPipe) - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = filepath.Join(ctr.state.RunDir, "oci-log") - } - - logTag, err := r.getLogTag(ctr) - if err != nil { - return 0, err - } - - if ctr.config.CgroupsMode == cgroupSplit { - if err := utils.MoveUnderCgroupSubtree("runtime"); err != nil { - return 0, err - } - } - - pidfile := ctr.config.PidFile - if pidfile == "" { - pidfile = filepath.Join(ctr.state.RunDir, "pidfile") - } - - args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) - - if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.config.SdNotifySocket != "" { - args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.config.SdNotifySocket)) - } - - if ctr.config.Spec.Process.Terminal { - args = append(args, "-t") - } else if ctr.config.Stdin { - args = append(args, "-i") - } - - if ctr.config.Timeout > 0 { - args = append(args, fmt.Sprintf("--timeout=%d", ctr.config.Timeout)) - } - - if !r.enableKeyring { - args = append(args, "--no-new-keyring") - } - if ctr.config.ConmonPidFile != "" { - args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile) - } - - if r.noPivot { - args = append(args, "--no-pivot") - } - - exitCommand, err := specgenutil.CreateExitCommandArgs(ctr.runtime.storageConfig, ctr.runtime.config, logrus.IsLevelEnabled(logrus.DebugLevel), ctr.AutoRemove(), false) - if err != nil { - return 0, err - } - exitCommand = append(exitCommand, ctr.config.ID) - - args = append(args, "--exit-command", exitCommand[0]) - for _, arg := range exitCommand[1:] { - args = append(args, []string{"--exit-command-arg", arg}...) - } - - // Pass down the LISTEN_* environment (see #10443). - preserveFDs := ctr.config.PreserveFDs - if val := os.Getenv("LISTEN_FDS"); val != "" { - if ctr.config.PreserveFDs > 0 { - logrus.Warnf("Ignoring LISTEN_FDS to preserve custom user-specified FDs") - } else { - fds, err := strconv.Atoi(val) - if err != nil { - return 0, fmt.Errorf("converting LISTEN_FDS=%s: %w", val, err) - } - preserveFDs = uint(fds) - } - } - - if preserveFDs > 0 { - args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", preserveFDs))...) - } - - if restoreOptions != nil { - args = append(args, "--restore", ctr.CheckpointPath()) - if restoreOptions.TCPEstablished { - args = append(args, "--runtime-opt", "--tcp-established") - } - if restoreOptions.FileLocks { - args = append(args, "--runtime-opt", "--file-locks") - } - if restoreOptions.Pod != "" { - mountLabel := ctr.config.MountLabel - processLabel := ctr.config.ProcessLabel - if mountLabel != "" { - args = append( - args, - "--runtime-opt", - fmt.Sprintf( - "--lsm-mount-context=%s", - mountLabel, - ), - ) - } - if processLabel != "" { - args = append( - args, - "--runtime-opt", - fmt.Sprintf( - "--lsm-profile=selinux:%s", - processLabel, - ), - ) - } - } - } - - logrus.WithFields(logrus.Fields{ - "args": args, - }).Debugf("running conmon: %s", r.conmonPath) - - cmd := exec.Command(r.conmonPath, args...) - cmd.SysProcAttr = &syscall.SysProcAttr{ - Setpgid: true, - } - // TODO this is probably a really bad idea for some uses - // Make this configurable - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if ctr.config.Spec.Process.Terminal { - cmd.Stderr = &stderrBuf - } - - // 0, 1 and 2 are stdin, stdout and stderr - conmonEnv := r.configureConmonEnv(runtimeDir) - - var filesToClose []*os.File - if preserveFDs > 0 { - for fd := 3; fd < int(3+preserveFDs); fd++ { - f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)) - filesToClose = append(filesToClose, f) - cmd.ExtraFiles = append(cmd.ExtraFiles, f) - } - } - - cmd.Env = r.conmonEnv - // we don't want to step on users fds they asked to preserve - // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 - cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4)) - cmd.Env = append(cmd.Env, conmonEnv...) - cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe) - - if r.reservePorts && !rootless.IsRootless() && !ctr.config.NetMode.IsSlirp4netns() { - ports, err := bindPorts(ctr.convertPortMappings()) - if err != nil { - return 0, err - } - filesToClose = append(filesToClose, ports...) - - // Leak the port we bound in the conmon process. These fd's won't be used - // by the container and conmon will keep the ports busy so that another - // process cannot use them. - cmd.ExtraFiles = append(cmd.ExtraFiles, ports...) - } - - if ctr.config.NetMode.IsSlirp4netns() || rootless.IsRootless() { - if ctr.config.PostConfigureNetNS { - havePortMapping := len(ctr.config.PortMappings) > 0 - if havePortMapping { - ctr.rootlessPortSyncR, ctr.rootlessPortSyncW, err = os.Pipe() - if err != nil { - return 0, fmt.Errorf("failed to create rootless port sync pipe: %w", err) - } - } - ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe() - if err != nil { - return 0, fmt.Errorf("failed to create rootless network sync pipe: %w", err) - } - } else { - if ctr.rootlessSlirpSyncR != nil { - defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR) - } - if ctr.rootlessSlirpSyncW != nil { - defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW) - } - } - // Leak one end in conmon, the other one will be leaked into slirp4netns - cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW) - - if ctr.rootlessPortSyncW != nil { - defer errorhandling.CloseQuiet(ctr.rootlessPortSyncW) - // Leak one end in conmon, the other one will be leaked into rootlessport - cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessPortSyncW) - } - } - var runtimeRestoreStarted time.Time - if restoreOptions != nil { - runtimeRestoreStarted = time.Now() - } - err = startCommand(cmd, ctr) - - // regardless of whether we errored or not, we no longer need the children pipes - childSyncPipe.Close() - childStartPipe.Close() - if err != nil { - return 0, err - } - if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe); err != nil { - return 0, err - } - /* Wait for initial setup and fork, and reap child */ - err = cmd.Wait() - if err != nil { - return 0, err - } - - pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog) - if err != nil { - if err2 := r.DeleteContainer(ctr); err2 != nil { - logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID()) - } - return 0, err - } - ctr.state.PID = pid - - conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile) - if err != nil { - logrus.Warnf("Error reading conmon pid file for container %s: %v", ctr.ID(), err) - } else if conmonPID > 0 { - // conmon not having a pid file is a valid state, so don't set it if we don't have it - logrus.Infof("Got Conmon PID as %d", conmonPID) - ctr.state.ConmonPID = conmonPID - } - - runtimeRestoreDuration := func() int64 { - if restoreOptions != nil && restoreOptions.PrintStats { - return time.Since(runtimeRestoreStarted).Microseconds() - } - return 0 - }() - - // These fds were passed down to the runtime. Close them - // and not interfere - for _, f := range filesToClose { - errorhandling.CloseQuiet(f) - } - - return runtimeRestoreDuration, nil -} - -// configureConmonEnv gets the environment values to add to conmon's exec struct -// TODO this may want to be less hardcoded/more configurable in the future -func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) []string { - var env []string - for _, e := range os.Environ() { - if strings.HasPrefix(e, "LC_") { - env = append(env, e) - } - } - conf, ok := os.LookupEnv("CONTAINERS_CONF") - if ok { - env = append(env, fmt.Sprintf("CONTAINERS_CONF=%s", conf)) - } - env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) - env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED"))) - env = append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID"))) - home := homedir.Get() - if home != "" { - env = append(env, fmt.Sprintf("HOME=%s", home)) - } - - return env -} - -// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI -func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath, logDriver, logTag string) []string { - // set the conmon API version to be able to use the correct sync struct keys - args := []string{ - "--api-version", "1", - "-c", ctr.ID(), - "-u", cuuid, - "-r", r.path, - "-b", bundlePath, - "-p", pidPath, - "-n", ctr.Name(), - "--exit-dir", exitDir, - "--full-attach", - } - if len(r.runtimeFlags) > 0 { - rFlags := []string{} - for _, arg := range r.runtimeFlags { - rFlags = append(rFlags, "--runtime-arg", arg) - } - args = append(args, rFlags...) - } - - if ctr.CgroupManager() == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit { - args = append(args, "-s") - } - - var logDriverArg string - switch logDriver { - case define.JournaldLogging: - logDriverArg = define.JournaldLogging - case define.NoLogging: - logDriverArg = define.NoLogging - case define.PassthroughLogging: - logDriverArg = define.PassthroughLogging - //lint:ignore ST1015 the default case has to be here - default: //nolint:stylecheck,gocritic - // No case here should happen except JSONLogging, but keep this here in case the options are extended - logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver()) - fallthrough - case "": - // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod - // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough - fallthrough - case define.JSONLogging: - fallthrough - case define.KubernetesLogging: - logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath) - } - - args = append(args, "-l", logDriverArg) - logLevel := logrus.GetLevel() - args = append(args, "--log-level", logLevel.String()) - - if logLevel == logrus.DebugLevel { - logrus.Debugf("%s messages will be logged to syslog", r.conmonPath) - args = append(args, "--syslog") - } - - size := r.logSizeMax - if ctr.config.LogSize > 0 { - size = ctr.config.LogSize - } - if size > 0 { - args = append(args, "--log-size-max", fmt.Sprintf("%v", size)) - } - - if ociLogPath != "" { - args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath)) - } - if logTag != "" { - args = append(args, "--log-tag", logTag) - } - if ctr.config.NoCgroups { - logrus.Debugf("Running with no Cgroups") - args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled") - } - return args -} - -func startCommand(cmd *exec.Cmd, ctr *Container) error { - // Make sure to unset the NOTIFY_SOCKET and reset it afterwards if needed. - switch ctr.config.SdNotifyMode { - case define.SdNotifyModeContainer, define.SdNotifyModeIgnore: - if prev := os.Getenv("NOTIFY_SOCKET"); prev != "" { - if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil { - logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err) - } - defer func() { - if err := os.Setenv("NOTIFY_SOCKET", prev); err != nil { - logrus.Errorf("Resetting NOTIFY_SOCKET=%s", prev) - } - }() - } - } - - return cmd.Start() -} - -// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup -// it then signals for conmon to start by sending nonce data down the start fd -func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { - mustCreateCgroup := true - - if ctr.config.NoCgroups { - mustCreateCgroup = false - } - - // If cgroup creation is disabled - just signal. - switch ctr.config.CgroupsMode { - case "disabled", "no-conmon", cgroupSplit: - mustCreateCgroup = false - } - - // $INVOCATION_ID is set by systemd when running as a service. - if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" { - mustCreateCgroup = false - } - - if mustCreateCgroup { - // Usually rootless users are not allowed to configure cgroupfs. - // There are cases though, where it is allowed, e.g. if the cgroup - // is manually configured and chowned). Avoid detecting all - // such cases and simply use a lower log level. - logLevel := logrus.WarnLevel - if rootless.IsRootless() { - logLevel = logrus.InfoLevel - } - // TODO: This should be a switch - we are not guaranteed that - // there are only 2 valid cgroup managers - cgroupParent := ctr.CgroupParent() - cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") - Resource := ctr.Spec().Linux.Resources - cgroupResources, err := GetLimits(Resource) - if err != nil { - logrus.StandardLogger().Log(logLevel, "Could not get ctr resources") - } - if ctr.CgroupManager() == config.SystemdCgroupsManager { - unitName := createUnitName("libpod-conmon", ctr.ID()) - realCgroupParent := cgroupParent - splitParent := strings.Split(cgroupParent, "/") - if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { - realCgroupParent = splitParent[len(splitParent)-1] - } - - logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) - if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { - logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err) - } - } else { - control, err := cgroups.New(cgroupPath, &cgroupResources) - if err != nil { - logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) - } else if err := control.AddPid(cmd.Process.Pid); err != nil { - // we need to remove this defer and delete the cgroup once conmon exits - // maybe need a conmon monitor? - logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) - } - } - } - - /* We set the cgroup, now the child can start creating children */ - if err := writeConmonPipeData(startFd); err != nil { - return err - } - return nil -} - -// newPipe creates a unix socket pair for communication. -// Returns two files - first is parent, second is child. -func newPipe() (*os.File, *os.File, error) { - fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) - if err != nil { - return nil, nil, err - } - return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil -} - -// readConmonPidFile attempts to read conmon's pid from its pid file -func readConmonPidFile(pidFile string) (int, error) { - // Let's try reading the Conmon pid at the same time. - if pidFile != "" { - contents, err := ioutil.ReadFile(pidFile) - if err != nil { - return -1, err - } - // Convert it to an int - conmonPID, err := strconv.Atoi(string(contents)) - if err != nil { - return -1, err - } - return conmonPID, nil - } - return 0, nil -} - -// readConmonPipeData attempts to read a syncInfo struct from the pipe -func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) { - // syncInfo is used to return data from monitor process to daemon - type syncInfo struct { - Data int `json:"data"` - Message string `json:"message,omitempty"` - } - - // Wait to get container pid from conmon - type syncStruct struct { - si *syncInfo - err error - } - ch := make(chan syncStruct) - go func() { - var si *syncInfo - rdr := bufio.NewReader(pipe) - b, err := rdr.ReadBytes('\n') - // ignore EOF here, error is returned even when data was read - // if it is no valid json unmarshal will fail below - if err != nil && !errors.Is(err, io.EOF) { - ch <- syncStruct{err: err} - } - if err := json.Unmarshal(b, &si); err != nil { - ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)} - return - } - ch <- syncStruct{si: si} - }() - - data := -1 //nolint: wastedassign - select { - case ss := <-ch: - if ss.err != nil { - if ociLog != "" { - ociLogData, err := ioutil.ReadFile(ociLog) - if err == nil { - var ociErr ociError - if err := json.Unmarshal(ociLogData, &ociErr); err == nil { - return -1, getOCIRuntimeError(runtimeName, ociErr.Msg) - } - } - } - return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err) - } - logrus.Debugf("Received: %d", ss.si.Data) - if ss.si.Data < 0 { - if ociLog != "" { - ociLogData, err := ioutil.ReadFile(ociLog) - if err == nil { - var ociErr ociError - if err := json.Unmarshal(ociLogData, &ociErr); err == nil { - return ss.si.Data, getOCIRuntimeError(runtimeName, ociErr.Msg) - } - } - } - // If we failed to parse the JSON errors, then print the output as it is - if ss.si.Message != "" { - return ss.si.Data, getOCIRuntimeError(runtimeName, ss.si.Message) - } - return ss.si.Data, fmt.Errorf("container create failed: %w", define.ErrInternal) - } - data = ss.si.Data - case <-time.After(define.ContainerCreateTimeout): - return -1, fmt.Errorf("container creation timeout: %w", define.ErrInternal) - } - return data, nil -} - -// writeConmonPipeData writes nonce data to a pipe -func writeConmonPipeData(pipe *os.File) error { - someData := []byte{0} - _, err := pipe.Write(someData) - return err -} - -// formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon -func formatRuntimeOpts(opts ...string) []string { - args := make([]string, 0, len(opts)*2) - for _, o := range opts { - args = append(args, "--runtime-opt", o) - } - return args -} - -// getConmonVersion returns a string representation of the conmon version. -func (r *ConmonOCIRuntime) getConmonVersion() (string, error) { - output, err := utils.ExecCmd(r.conmonPath, "--version") - if err != nil { - return "", err - } - return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil -} - -// getOCIRuntimeVersion returns a string representation of the OCI runtime's -// version. -func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) { - output, err := utils.ExecCmd(r.path, "--version") - if err != nil { - return "", err - } - return strings.TrimSuffix(output, "\n"), nil -} - -// Copy data from container to HTTP connection, for terminal attach. -// Container is the container's attach socket connection, http is a buffer for -// the HTTP connection. cid is the ID of the container the attach session is -// running for (used solely for error messages). -func httpAttachTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string) error { - buf := make([]byte, bufferSize) - for { - numR, err := container.Read(buf) - logrus.Debugf("Read fd(%d) %d/%d bytes for container %s", int(buf[0]), numR, len(buf), cid) - - if numR > 0 { - switch buf[0] { - case AttachPipeStdout: - // Do nothing - default: - logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) - continue - } - - numW, err2 := http.Write(buf[1:numR]) - if err2 != nil { - if err != nil { - logrus.Errorf("Reading container %s STDOUT: %v", cid, err) - } - return err2 - } else if numW+1 != numR { - return io.ErrShortWrite - } - // We need to force the buffer to write immediately, so - // there isn't a delay on the terminal side. - if err2 := http.Flush(); err2 != nil { - if err != nil { - logrus.Errorf("Reading container %s STDOUT: %v", cid, err) - } - return err2 - } - } - if err != nil { - if err == io.EOF { - return nil - } - return err - } - } -} - -// Copy data from a container to an HTTP connection, for non-terminal attach. -// Appends a header to multiplex input. -func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string, stdin, stdout, stderr bool) error { - buf := make([]byte, bufferSize) - for { - numR, err := container.Read(buf) - if numR > 0 { - var headerBuf []byte - - // Subtract 1 because we strip the first byte (used for - // multiplexing by Conmon). - headerLen := uint32(numR - 1) - // Practically speaking, we could make this buf[0] - 1, - // but we need to validate it anyway. - switch buf[0] { - case AttachPipeStdin: - headerBuf = makeHTTPAttachHeader(0, headerLen) - if !stdin { - continue - } - case AttachPipeStdout: - if !stdout { - continue - } - headerBuf = makeHTTPAttachHeader(1, headerLen) - case AttachPipeStderr: - if !stderr { - continue - } - headerBuf = makeHTTPAttachHeader(2, headerLen) - default: - logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) - continue - } - - numH, err2 := http.Write(headerBuf) - if err2 != nil { - if err != nil { - logrus.Errorf("Reading container %s standard streams: %v", cid, err) - } - - return err2 - } - // Hardcoding header length is pretty gross, but - // fast. Should be safe, as this is a fixed part - // of the protocol. - if numH != 8 { - if err != nil { - logrus.Errorf("Reading container %s standard streams: %v", cid, err) - } - - return io.ErrShortWrite - } - - numW, err2 := http.Write(buf[1:numR]) - if err2 != nil { - if err != nil { - logrus.Errorf("Reading container %s standard streams: %v", cid, err) - } - - return err2 - } else if numW+1 != numR { - if err != nil { - logrus.Errorf("Reading container %s standard streams: %v", cid, err) - } - - return io.ErrShortWrite - } - // We need to force the buffer to write immediately, so - // there isn't a delay on the terminal side. - if err2 := http.Flush(); err2 != nil { - if err != nil { - logrus.Errorf("Reading container %s STDOUT: %v", cid, err) - } - return err2 - } - } - if err != nil { - if err == io.EOF { - return nil - } - - return err - } - } -} - -// GetLimits converts spec resource limits to cgroup consumable limits -func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) { - if resource == nil { - resource = &spec.LinuxResources{} - } - final := &runcconfig.Resources{} - devs := []*devices.Rule{} - - // Devices - for _, entry := range resource.Devices { - if entry.Major == nil || entry.Minor == nil { - continue - } - runeType := 'a' - switch entry.Type { - case "b": - runeType = 'b' - case "c": - runeType = 'c' - } - - devs = append(devs, &devices.Rule{ - Type: devices.Type(runeType), - Major: *entry.Major, - Minor: *entry.Minor, - Permissions: devices.Permissions(entry.Access), - Allow: entry.Allow, - }) - } - final.Devices = devs - - // HugepageLimits - pageLimits := []*runcconfig.HugepageLimit{} - for _, entry := range resource.HugepageLimits { - pageLimits = append(pageLimits, &runcconfig.HugepageLimit{ - Pagesize: entry.Pagesize, - Limit: entry.Limit, - }) - } - final.HugetlbLimit = pageLimits - - // Networking - netPriorities := []*runcconfig.IfPrioMap{} - if resource.Network != nil { - for _, entry := range resource.Network.Priorities { - netPriorities = append(netPriorities, &runcconfig.IfPrioMap{ - Interface: entry.Name, - Priority: int64(entry.Priority), - }) - } - } - final.NetPrioIfpriomap = netPriorities - rdma := make(map[string]runcconfig.LinuxRdma) - for name, entry := range resource.Rdma { - rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects} - } - final.Rdma = rdma - - // Memory - if resource.Memory != nil { - if resource.Memory.Limit != nil { - final.Memory = *resource.Memory.Limit - } - if resource.Memory.Reservation != nil { - final.MemoryReservation = *resource.Memory.Reservation - } - if resource.Memory.Swap != nil { - final.MemorySwap = *resource.Memory.Swap - } - if resource.Memory.Swappiness != nil { - final.MemorySwappiness = resource.Memory.Swappiness - } - } - - // CPU - if resource.CPU != nil { - if resource.CPU.Period != nil { - final.CpuPeriod = *resource.CPU.Period - } - if resource.CPU.Quota != nil { - final.CpuQuota = *resource.CPU.Quota - } - if resource.CPU.RealtimePeriod != nil { - final.CpuRtPeriod = *resource.CPU.RealtimePeriod - } - if resource.CPU.RealtimeRuntime != nil { - final.CpuRtRuntime = *resource.CPU.RealtimeRuntime - } - if resource.CPU.Shares != nil { - final.CpuShares = *resource.CPU.Shares - } - final.CpusetCpus = resource.CPU.Cpus - final.CpusetMems = resource.CPU.Mems - } - - // BlkIO - if resource.BlockIO != nil { - if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleReadBpsDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle) - } - } - if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle) - } - } - if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle) - } - } - if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle) - } - } - if resource.BlockIO.LeafWeight != nil { - final.BlkioLeafWeight = *resource.BlockIO.LeafWeight - } - if resource.BlockIO.Weight != nil { - final.BlkioWeight = *resource.BlockIO.Weight - } - if len(resource.BlockIO.WeightDevice) > 0 { - for _, entry := range resource.BlockIO.WeightDevice { - weight := &runcconfig.WeightDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - if entry.Weight != nil { - weight.Weight = *entry.Weight - } - if entry.LeafWeight != nil { - weight.LeafWeight = *entry.LeafWeight - } - weight.BlockIODevice = *dev - final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight) - } - } - } - - // Pids - if resource.Pids != nil { - final.PidsLimit = resource.Pids.Limit - } - - // Networking - if resource.Network != nil { - if resource.Network.ClassID != nil { - final.NetClsClassid = *resource.Network.ClassID - } - } - - // Unified state - final.Unified = resource.Unified - - return *final, nil -} -- cgit v1.2.3-54-g00ecf From 8d229c6cdc9ab7325bf1f246e1bab6af79e75afe Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 09:36:46 +0100 Subject: libpod: Move oci_conmon_attach_linux.go to oci_conmon_attach_common.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_attach_common.go | 314 +++++++++++++++++++++++++++++++++++++ libpod/oci_conmon_attach_linux.go | 314 ------------------------------------- 2 files changed, 314 insertions(+), 314 deletions(-) create mode 100644 libpod/oci_conmon_attach_common.go delete mode 100644 libpod/oci_conmon_attach_linux.go (limited to 'libpod') diff --git a/libpod/oci_conmon_attach_common.go b/libpod/oci_conmon_attach_common.go new file mode 100644 index 000000000..aa55aa6f5 --- /dev/null +++ b/libpod/oci_conmon_attach_common.go @@ -0,0 +1,314 @@ +//go:build linux +// +build linux + +package libpod + +import ( + "errors" + "fmt" + "io" + "net" + "os" + "path/filepath" + "syscall" + + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/resize" + "github.com/containers/common/pkg/util" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/pkg/errorhandling" + "github.com/moby/term" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +/* Sync with stdpipe_t in conmon.c */ +const ( + AttachPipeStdin = 1 + AttachPipeStdout = 2 + AttachPipeStderr = 3 +) + +func openUnixSocket(path string) (*net.UnixConn, error) { + fd, err := unix.Open(path, unix.O_PATH, 0) + if err != nil { + return nil, err + } + defer unix.Close(fd) + return net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: fmt.Sprintf("/proc/self/fd/%d", fd), Net: "unixpacket"}) +} + +// Attach to the given container. +// Does not check if state is appropriate. +// started is only required if startContainer is true. +func (r *ConmonOCIRuntime) Attach(c *Container, params *AttachOptions) error { + passthrough := c.LogDriver() == define.PassthroughLogging + + if params == nil || params.Streams == nil { + return fmt.Errorf("must provide parameters to Attach: %w", define.ErrInternal) + } + + if !params.Streams.AttachOutput && !params.Streams.AttachError && !params.Streams.AttachInput && !passthrough { + return fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) + } + if params.Start && params.Started == nil { + return fmt.Errorf("started chan not passed when startContainer set: %w", define.ErrInternal) + } + + keys := config.DefaultDetachKeys + if params.DetachKeys != nil { + keys = *params.DetachKeys + } + + detachKeys, err := processDetachKeys(keys) + if err != nil { + return err + } + + var conn *net.UnixConn + if !passthrough { + logrus.Debugf("Attaching to container %s", c.ID()) + + // If we have a resize, do it. + if params.InitialSize != nil { + if err := r.AttachResize(c, *params.InitialSize); err != nil { + return err + } + } + + attachSock, err := c.AttachSocketPath() + if err != nil { + return err + } + + conn, err = openUnixSocket(attachSock) + if err != nil { + return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err) + } + defer func() { + if err := conn.Close(); err != nil { + logrus.Errorf("unable to close socket: %q", err) + } + }() + } + + // If starting was requested, start the container and notify when that's + // done. + if params.Start { + if err := c.start(); err != nil { + return err + } + params.Started <- true + } + + if passthrough { + return nil + } + + receiveStdoutError, stdinDone := setupStdioChannels(params.Streams, conn, detachKeys) + if params.AttachReady != nil { + params.AttachReady <- true + } + return readStdio(conn, params.Streams, receiveStdoutError, stdinDone) +} + +// Attach to the given container's exec session +// attachFd and startFd must be open file descriptors +// attachFd must be the output side of the fd. attachFd is used for two things: +// conmon will first send a nonce value across the pipe indicating it has set up its side of the console socket +// this ensures attachToExec gets all of the output of the called process +// conmon will then send the exit code of the exec process, or an error in the exec session +// startFd must be the input side of the fd. +// newSize resizes the tty to this size before the process is started, must be nil if the exec session has no tty +// conmon will wait to start the exec session until the parent process has set up the console socket. +// Once attachToExec successfully attaches to the console socket, the child conmon process responsible for calling runtime exec +// will read from the output side of start fd, thus learning to start the child process. +// Thus, the order goes as follow: +// 1. conmon parent process sets up its console socket. sends on attachFd +// 2. attachToExec attaches to the console socket after reading on attachFd and resizes the tty +// 3. child waits on startFd for attachToExec to attach to said console socket +// 4. attachToExec sends on startFd, signalling it has attached to the socket and child is ready to go +// 5. child receives on startFd, runs the runtime exec command +// attachToExec is responsible for closing startFd and attachFd +func (c *Container) attachToExec(streams *define.AttachStreams, keys *string, sessionID string, startFd, attachFd *os.File, newSize *resize.TerminalSize) error { + if !streams.AttachOutput && !streams.AttachError && !streams.AttachInput { + return fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) + } + if startFd == nil || attachFd == nil { + return fmt.Errorf("start sync pipe and attach sync pipe must be defined for exec attach: %w", define.ErrInvalidArg) + } + + defer errorhandling.CloseQuiet(startFd) + defer errorhandling.CloseQuiet(attachFd) + + detachString := config.DefaultDetachKeys + if keys != nil { + detachString = *keys + } + detachKeys, err := processDetachKeys(detachString) + if err != nil { + return err + } + + logrus.Debugf("Attaching to container %s exec session %s", c.ID(), sessionID) + + // set up the socket path, such that it is the correct length and location for exec + sockPath, err := c.execAttachSocketPath(sessionID) + if err != nil { + return err + } + + // 2: read from attachFd that the parent process has set up the console socket + if _, err := readConmonPipeData(c.ociRuntime.Name(), attachFd, ""); err != nil { + return err + } + + // resize before we start the container process + if newSize != nil { + err = c.ociRuntime.ExecAttachResize(c, sessionID, *newSize) + if err != nil { + logrus.Warnf("Resize failed: %v", err) + } + } + + // 2: then attach + conn, err := openUnixSocket(sockPath) + if err != nil { + return fmt.Errorf("failed to connect to container's attach socket: %v: %w", sockPath, err) + } + defer func() { + if err := conn.Close(); err != nil { + logrus.Errorf("Unable to close socket: %q", err) + } + }() + + // start listening on stdio of the process + receiveStdoutError, stdinDone := setupStdioChannels(streams, conn, detachKeys) + + // 4: send start message to child + if err := writeConmonPipeData(startFd); err != nil { + return err + } + + return readStdio(conn, streams, receiveStdoutError, stdinDone) +} + +func processDetachKeys(keys string) ([]byte, error) { + // Check the validity of the provided keys first + if len(keys) == 0 { + return []byte{}, nil + } + detachKeys, err := term.ToBytes(keys) + if err != nil { + return nil, fmt.Errorf("invalid detach keys: %w", err) + } + return detachKeys, nil +} + +func registerResizeFunc(r <-chan resize.TerminalSize, bundlePath string) { + resize.HandleResizing(r, func(size resize.TerminalSize) { + controlPath := filepath.Join(bundlePath, "ctl") + controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY, 0) + if err != nil { + logrus.Debugf("Could not open ctl file: %v", err) + return + } + defer controlFile.Close() + + logrus.Debugf("Received a resize event: %+v", size) + if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, size.Height, size.Width); err != nil { + logrus.Warnf("Failed to write to control file to resize terminal: %v", err) + } + }) +} + +func setupStdioChannels(streams *define.AttachStreams, conn *net.UnixConn, detachKeys []byte) (chan error, chan error) { + receiveStdoutError := make(chan error) + go func() { + receiveStdoutError <- redirectResponseToOutputStreams(streams.OutputStream, streams.ErrorStream, streams.AttachOutput, streams.AttachError, conn) + }() + + stdinDone := make(chan error) + go func() { + var err error + if streams.AttachInput { + _, err = util.CopyDetachable(conn, streams.InputStream, detachKeys) + } + stdinDone <- err + }() + + return receiveStdoutError, stdinDone +} + +func redirectResponseToOutputStreams(outputStream, errorStream io.Writer, writeOutput, writeError bool, conn io.Reader) error { + var err error + buf := make([]byte, 8192+1) /* Sync with conmon STDIO_BUF_SIZE */ + for { + nr, er := conn.Read(buf) + if nr > 0 { + var dst io.Writer + var doWrite bool + switch buf[0] { + case AttachPipeStdout: + dst = outputStream + doWrite = writeOutput + case AttachPipeStderr: + dst = errorStream + doWrite = writeError + default: + logrus.Infof("Received unexpected attach type %+d", buf[0]) + } + if dst == nil { + return errors.New("output destination cannot be nil") + } + + if doWrite { + nw, ew := dst.Write(buf[1:nr]) + if ew != nil { + err = ew + break + } + if nr != nw+1 { + err = io.ErrShortWrite + break + } + } + } + if errors.Is(er, io.EOF) || errors.Is(er, syscall.ECONNRESET) { + break + } + if er != nil { + err = er + break + } + } + return err +} + +func readStdio(conn *net.UnixConn, streams *define.AttachStreams, receiveStdoutError, stdinDone chan error) error { + var err error + select { + case err = <-receiveStdoutError: + if err := conn.CloseWrite(); err != nil { + logrus.Errorf("Failed to close stdin: %v", err) + } + return err + case err = <-stdinDone: + if err == define.ErrDetach { + if err := conn.CloseWrite(); err != nil { + logrus.Errorf("Failed to close stdin: %v", err) + } + return err + } + if err == nil { + // copy stdin is done, close it + if connErr := conn.CloseWrite(); connErr != nil { + logrus.Errorf("Unable to close conn: %v", connErr) + } + } + if streams.AttachOutput || streams.AttachError { + return <-receiveStdoutError + } + } + return nil +} diff --git a/libpod/oci_conmon_attach_linux.go b/libpod/oci_conmon_attach_linux.go deleted file mode 100644 index aa55aa6f5..000000000 --- a/libpod/oci_conmon_attach_linux.go +++ /dev/null @@ -1,314 +0,0 @@ -//go:build linux -// +build linux - -package libpod - -import ( - "errors" - "fmt" - "io" - "net" - "os" - "path/filepath" - "syscall" - - "github.com/containers/common/pkg/config" - "github.com/containers/common/pkg/resize" - "github.com/containers/common/pkg/util" - "github.com/containers/podman/v4/libpod/define" - "github.com/containers/podman/v4/pkg/errorhandling" - "github.com/moby/term" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" -) - -/* Sync with stdpipe_t in conmon.c */ -const ( - AttachPipeStdin = 1 - AttachPipeStdout = 2 - AttachPipeStderr = 3 -) - -func openUnixSocket(path string) (*net.UnixConn, error) { - fd, err := unix.Open(path, unix.O_PATH, 0) - if err != nil { - return nil, err - } - defer unix.Close(fd) - return net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: fmt.Sprintf("/proc/self/fd/%d", fd), Net: "unixpacket"}) -} - -// Attach to the given container. -// Does not check if state is appropriate. -// started is only required if startContainer is true. -func (r *ConmonOCIRuntime) Attach(c *Container, params *AttachOptions) error { - passthrough := c.LogDriver() == define.PassthroughLogging - - if params == nil || params.Streams == nil { - return fmt.Errorf("must provide parameters to Attach: %w", define.ErrInternal) - } - - if !params.Streams.AttachOutput && !params.Streams.AttachError && !params.Streams.AttachInput && !passthrough { - return fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) - } - if params.Start && params.Started == nil { - return fmt.Errorf("started chan not passed when startContainer set: %w", define.ErrInternal) - } - - keys := config.DefaultDetachKeys - if params.DetachKeys != nil { - keys = *params.DetachKeys - } - - detachKeys, err := processDetachKeys(keys) - if err != nil { - return err - } - - var conn *net.UnixConn - if !passthrough { - logrus.Debugf("Attaching to container %s", c.ID()) - - // If we have a resize, do it. - if params.InitialSize != nil { - if err := r.AttachResize(c, *params.InitialSize); err != nil { - return err - } - } - - attachSock, err := c.AttachSocketPath() - if err != nil { - return err - } - - conn, err = openUnixSocket(attachSock) - if err != nil { - return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err) - } - defer func() { - if err := conn.Close(); err != nil { - logrus.Errorf("unable to close socket: %q", err) - } - }() - } - - // If starting was requested, start the container and notify when that's - // done. - if params.Start { - if err := c.start(); err != nil { - return err - } - params.Started <- true - } - - if passthrough { - return nil - } - - receiveStdoutError, stdinDone := setupStdioChannels(params.Streams, conn, detachKeys) - if params.AttachReady != nil { - params.AttachReady <- true - } - return readStdio(conn, params.Streams, receiveStdoutError, stdinDone) -} - -// Attach to the given container's exec session -// attachFd and startFd must be open file descriptors -// attachFd must be the output side of the fd. attachFd is used for two things: -// conmon will first send a nonce value across the pipe indicating it has set up its side of the console socket -// this ensures attachToExec gets all of the output of the called process -// conmon will then send the exit code of the exec process, or an error in the exec session -// startFd must be the input side of the fd. -// newSize resizes the tty to this size before the process is started, must be nil if the exec session has no tty -// conmon will wait to start the exec session until the parent process has set up the console socket. -// Once attachToExec successfully attaches to the console socket, the child conmon process responsible for calling runtime exec -// will read from the output side of start fd, thus learning to start the child process. -// Thus, the order goes as follow: -// 1. conmon parent process sets up its console socket. sends on attachFd -// 2. attachToExec attaches to the console socket after reading on attachFd and resizes the tty -// 3. child waits on startFd for attachToExec to attach to said console socket -// 4. attachToExec sends on startFd, signalling it has attached to the socket and child is ready to go -// 5. child receives on startFd, runs the runtime exec command -// attachToExec is responsible for closing startFd and attachFd -func (c *Container) attachToExec(streams *define.AttachStreams, keys *string, sessionID string, startFd, attachFd *os.File, newSize *resize.TerminalSize) error { - if !streams.AttachOutput && !streams.AttachError && !streams.AttachInput { - return fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) - } - if startFd == nil || attachFd == nil { - return fmt.Errorf("start sync pipe and attach sync pipe must be defined for exec attach: %w", define.ErrInvalidArg) - } - - defer errorhandling.CloseQuiet(startFd) - defer errorhandling.CloseQuiet(attachFd) - - detachString := config.DefaultDetachKeys - if keys != nil { - detachString = *keys - } - detachKeys, err := processDetachKeys(detachString) - if err != nil { - return err - } - - logrus.Debugf("Attaching to container %s exec session %s", c.ID(), sessionID) - - // set up the socket path, such that it is the correct length and location for exec - sockPath, err := c.execAttachSocketPath(sessionID) - if err != nil { - return err - } - - // 2: read from attachFd that the parent process has set up the console socket - if _, err := readConmonPipeData(c.ociRuntime.Name(), attachFd, ""); err != nil { - return err - } - - // resize before we start the container process - if newSize != nil { - err = c.ociRuntime.ExecAttachResize(c, sessionID, *newSize) - if err != nil { - logrus.Warnf("Resize failed: %v", err) - } - } - - // 2: then attach - conn, err := openUnixSocket(sockPath) - if err != nil { - return fmt.Errorf("failed to connect to container's attach socket: %v: %w", sockPath, err) - } - defer func() { - if err := conn.Close(); err != nil { - logrus.Errorf("Unable to close socket: %q", err) - } - }() - - // start listening on stdio of the process - receiveStdoutError, stdinDone := setupStdioChannels(streams, conn, detachKeys) - - // 4: send start message to child - if err := writeConmonPipeData(startFd); err != nil { - return err - } - - return readStdio(conn, streams, receiveStdoutError, stdinDone) -} - -func processDetachKeys(keys string) ([]byte, error) { - // Check the validity of the provided keys first - if len(keys) == 0 { - return []byte{}, nil - } - detachKeys, err := term.ToBytes(keys) - if err != nil { - return nil, fmt.Errorf("invalid detach keys: %w", err) - } - return detachKeys, nil -} - -func registerResizeFunc(r <-chan resize.TerminalSize, bundlePath string) { - resize.HandleResizing(r, func(size resize.TerminalSize) { - controlPath := filepath.Join(bundlePath, "ctl") - controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY, 0) - if err != nil { - logrus.Debugf("Could not open ctl file: %v", err) - return - } - defer controlFile.Close() - - logrus.Debugf("Received a resize event: %+v", size) - if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, size.Height, size.Width); err != nil { - logrus.Warnf("Failed to write to control file to resize terminal: %v", err) - } - }) -} - -func setupStdioChannels(streams *define.AttachStreams, conn *net.UnixConn, detachKeys []byte) (chan error, chan error) { - receiveStdoutError := make(chan error) - go func() { - receiveStdoutError <- redirectResponseToOutputStreams(streams.OutputStream, streams.ErrorStream, streams.AttachOutput, streams.AttachError, conn) - }() - - stdinDone := make(chan error) - go func() { - var err error - if streams.AttachInput { - _, err = util.CopyDetachable(conn, streams.InputStream, detachKeys) - } - stdinDone <- err - }() - - return receiveStdoutError, stdinDone -} - -func redirectResponseToOutputStreams(outputStream, errorStream io.Writer, writeOutput, writeError bool, conn io.Reader) error { - var err error - buf := make([]byte, 8192+1) /* Sync with conmon STDIO_BUF_SIZE */ - for { - nr, er := conn.Read(buf) - if nr > 0 { - var dst io.Writer - var doWrite bool - switch buf[0] { - case AttachPipeStdout: - dst = outputStream - doWrite = writeOutput - case AttachPipeStderr: - dst = errorStream - doWrite = writeError - default: - logrus.Infof("Received unexpected attach type %+d", buf[0]) - } - if dst == nil { - return errors.New("output destination cannot be nil") - } - - if doWrite { - nw, ew := dst.Write(buf[1:nr]) - if ew != nil { - err = ew - break - } - if nr != nw+1 { - err = io.ErrShortWrite - break - } - } - } - if errors.Is(er, io.EOF) || errors.Is(er, syscall.ECONNRESET) { - break - } - if er != nil { - err = er - break - } - } - return err -} - -func readStdio(conn *net.UnixConn, streams *define.AttachStreams, receiveStdoutError, stdinDone chan error) error { - var err error - select { - case err = <-receiveStdoutError: - if err := conn.CloseWrite(); err != nil { - logrus.Errorf("Failed to close stdin: %v", err) - } - return err - case err = <-stdinDone: - if err == define.ErrDetach { - if err := conn.CloseWrite(); err != nil { - logrus.Errorf("Failed to close stdin: %v", err) - } - return err - } - if err == nil { - // copy stdin is done, close it - if connErr := conn.CloseWrite(); connErr != nil { - logrus.Errorf("Unable to close conn: %v", connErr) - } - } - if streams.AttachOutput || streams.AttachError { - return <-receiveStdoutError - } - } - return nil -} -- cgit v1.2.3-54-g00ecf From 68b2450d3de0344b2a4cfacdcabed8d1c854cb68 Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 09:37:07 +0100 Subject: libpod: Move oci_conmon_exec_linux.go to oci_conmon_exec_common.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_exec_common.go | 797 +++++++++++++++++++++++++++++++++++++++ libpod/oci_conmon_exec_linux.go | 797 --------------------------------------- 2 files changed, 797 insertions(+), 797 deletions(-) create mode 100644 libpod/oci_conmon_exec_common.go delete mode 100644 libpod/oci_conmon_exec_linux.go (limited to 'libpod') diff --git a/libpod/oci_conmon_exec_common.go b/libpod/oci_conmon_exec_common.go new file mode 100644 index 000000000..16cd7ef9f --- /dev/null +++ b/libpod/oci_conmon_exec_common.go @@ -0,0 +1,797 @@ +package libpod + +import ( + "errors" + "fmt" + "io/ioutil" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/containers/common/pkg/capabilities" + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/resize" + cutil "github.com/containers/common/pkg/util" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/pkg/errorhandling" + "github.com/containers/podman/v4/pkg/lookup" + "github.com/containers/podman/v4/pkg/util" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// ExecContainer executes a command in a running container +func (r *ConmonOCIRuntime) ExecContainer(c *Container, sessionID string, options *ExecOptions, streams *define.AttachStreams, newSize *resize.TerminalSize) (int, chan error, error) { + if options == nil { + return -1, nil, fmt.Errorf("must provide an ExecOptions struct to ExecContainer: %w", define.ErrInvalidArg) + } + if len(options.Cmd) == 0 { + return -1, nil, fmt.Errorf("must provide a command to execute: %w", define.ErrInvalidArg) + } + + if sessionID == "" { + return -1, nil, fmt.Errorf("must provide a session ID for exec: %w", define.ErrEmptyID) + } + + // TODO: Should we default this to false? + // Or maybe make streams mandatory? + attachStdin := true + if streams != nil { + attachStdin = streams.AttachInput + } + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = c.execOCILog(sessionID) + } + + execCmd, pipes, err := r.startExec(c, sessionID, options, attachStdin, ociLog) + if err != nil { + return -1, nil, err + } + + // Only close sync pipe. Start and attach are consumed in the attach + // goroutine. + defer func() { + if pipes.syncPipe != nil && !pipes.syncClosed { + errorhandling.CloseQuiet(pipes.syncPipe) + pipes.syncClosed = true + } + }() + + // TODO Only create if !detach + // Attach to the container before starting it + attachChan := make(chan error) + go func() { + // attachToExec is responsible for closing pipes + attachChan <- c.attachToExec(streams, options.DetachKeys, sessionID, pipes.startPipe, pipes.attachPipe, newSize) + close(attachChan) + }() + + if err := execCmd.Wait(); err != nil { + return -1, nil, fmt.Errorf("cannot run conmon: %w", err) + } + + pid, err := readConmonPipeData(r.name, pipes.syncPipe, ociLog) + + return pid, attachChan, err +} + +// ExecContainerHTTP executes a new command in an existing container and +// forwards its standard streams over an attach +func (r *ConmonOCIRuntime) ExecContainerHTTP(ctr *Container, sessionID string, options *ExecOptions, req *http.Request, w http.ResponseWriter, + streams *HTTPAttachStreams, cancel <-chan bool, hijackDone chan<- bool, holdConnOpen <-chan bool, newSize *resize.TerminalSize) (int, chan error, error) { + if streams != nil { + if !streams.Stdin && !streams.Stdout && !streams.Stderr { + return -1, nil, fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) + } + } + + if options == nil { + return -1, nil, fmt.Errorf("must provide exec options to ExecContainerHTTP: %w", define.ErrInvalidArg) + } + + detachString := config.DefaultDetachKeys + if options.DetachKeys != nil { + detachString = *options.DetachKeys + } + detachKeys, err := processDetachKeys(detachString) + if err != nil { + return -1, nil, err + } + + // TODO: Should we default this to false? + // Or maybe make streams mandatory? + attachStdin := true + if streams != nil { + attachStdin = streams.Stdin + } + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = ctr.execOCILog(sessionID) + } + + execCmd, pipes, err := r.startExec(ctr, sessionID, options, attachStdin, ociLog) + if err != nil { + return -1, nil, err + } + + // Only close sync pipe. Start and attach are consumed in the attach + // goroutine. + defer func() { + if pipes.syncPipe != nil && !pipes.syncClosed { + errorhandling.CloseQuiet(pipes.syncPipe) + pipes.syncClosed = true + } + }() + + attachChan := make(chan error) + conmonPipeDataChan := make(chan conmonPipeData) + go func() { + // attachToExec is responsible for closing pipes + attachChan <- attachExecHTTP(ctr, sessionID, req, w, streams, pipes, detachKeys, options.Terminal, cancel, hijackDone, holdConnOpen, execCmd, conmonPipeDataChan, ociLog, newSize, r.name) + close(attachChan) + }() + + // NOTE: the channel is needed to communicate conmon's data. In case + // of an error, the error will be written on the hijacked http + // connection such that remote clients will receive the error. + pipeData := <-conmonPipeDataChan + + return pipeData.pid, attachChan, pipeData.err +} + +// conmonPipeData contains the data when reading from conmon's pipe. +type conmonPipeData struct { + pid int + err error +} + +// ExecContainerDetached executes a command in a running container, but does +// not attach to it. +func (r *ConmonOCIRuntime) ExecContainerDetached(ctr *Container, sessionID string, options *ExecOptions, stdin bool) (int, error) { + if options == nil { + return -1, fmt.Errorf("must provide exec options to ExecContainerHTTP: %w", define.ErrInvalidArg) + } + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = ctr.execOCILog(sessionID) + } + + execCmd, pipes, err := r.startExec(ctr, sessionID, options, stdin, ociLog) + if err != nil { + return -1, err + } + + defer func() { + pipes.cleanup() + }() + + // Wait for Conmon to tell us we're ready to attach. + // We aren't actually *going* to attach, but this means that we're good + // to proceed. + if _, err := readConmonPipeData(r.name, pipes.attachPipe, ""); err != nil { + return -1, err + } + + // Start the exec session + if err := writeConmonPipeData(pipes.startPipe); err != nil { + return -1, err + } + + // Wait for conmon to succeed, when return. + if err := execCmd.Wait(); err != nil { + return -1, fmt.Errorf("cannot run conmon: %w", err) + } + + pid, err := readConmonPipeData(r.name, pipes.syncPipe, ociLog) + + return pid, err +} + +// ExecAttachResize resizes the TTY of the given exec session. +func (r *ConmonOCIRuntime) ExecAttachResize(ctr *Container, sessionID string, newSize resize.TerminalSize) error { + controlFile, err := openControlFile(ctr, ctr.execBundlePath(sessionID)) + if err != nil { + return err + } + defer controlFile.Close() + + if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil { + return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err) + } + + return nil +} + +// ExecStopContainer stops a given exec session in a running container. +func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, timeout uint) error { + pid, err := ctr.getExecSessionPID(sessionID) + if err != nil { + return err + } + + logrus.Debugf("Going to stop container %s exec session %s", ctr.ID(), sessionID) + + // Is the session dead? + // Ping the PID with signal 0 to see if it still exists. + if err := unix.Kill(pid, 0); err != nil { + if err == unix.ESRCH { + return nil + } + return fmt.Errorf("error pinging container %s exec session %s PID %d with signal 0: %w", ctr.ID(), sessionID, pid, err) + } + + if timeout > 0 { + // Use SIGTERM by default, then SIGSTOP after timeout. + logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGTERM", sessionID, pid, ctr.ID()) + if err := unix.Kill(pid, unix.SIGTERM); err != nil { + if err == unix.ESRCH { + return nil + } + return fmt.Errorf("error killing container %s exec session %s PID %d with SIGTERM: %w", ctr.ID(), sessionID, pid, err) + } + + // Wait for the PID to stop + if err := waitPidStop(pid, time.Duration(timeout)*time.Second); err != nil { + logrus.Infof("Timed out waiting for container %s exec session %s to stop, resorting to SIGKILL: %v", ctr.ID(), sessionID, err) + } else { + // No error, container is dead + return nil + } + } + + // SIGTERM did not work. On to SIGKILL. + logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID()) + if err := unix.Kill(pid, unix.SIGTERM); err != nil { + if err == unix.ESRCH { + return nil + } + return fmt.Errorf("error killing container %s exec session %s PID %d with SIGKILL: %w", ctr.ID(), sessionID, pid, err) + } + + // Wait for the PID to stop + if err := waitPidStop(pid, killContainerTimeout); err != nil { + return fmt.Errorf("timed out waiting for container %s exec session %s PID %d to stop after SIGKILL: %w", ctr.ID(), sessionID, pid, err) + } + + return nil +} + +// ExecUpdateStatus checks if the given exec session is still running. +func (r *ConmonOCIRuntime) ExecUpdateStatus(ctr *Container, sessionID string) (bool, error) { + pid, err := ctr.getExecSessionPID(sessionID) + if err != nil { + return false, err + } + + logrus.Debugf("Checking status of container %s exec session %s", ctr.ID(), sessionID) + + // Is the session dead? + // Ping the PID with signal 0 to see if it still exists. + if err := unix.Kill(pid, 0); err != nil { + if err == unix.ESRCH { + return false, nil + } + return false, fmt.Errorf("error pinging container %s exec session %s PID %d with signal 0: %w", ctr.ID(), sessionID, pid, err) + } + + return true, nil +} + +// ExecAttachSocketPath is the path to a container's exec session attach socket. +func (r *ConmonOCIRuntime) ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) { + // We don't even use container, so don't validity check it + if sessionID == "" { + return "", fmt.Errorf("must provide a valid session ID to get attach socket path: %w", define.ErrInvalidArg) + } + + return filepath.Join(ctr.execBundlePath(sessionID), "attach"), nil +} + +// This contains pipes used by the exec API. +type execPipes struct { + syncPipe *os.File + syncClosed bool + startPipe *os.File + startClosed bool + attachPipe *os.File + attachClosed bool +} + +func (p *execPipes) cleanup() { + if p.syncPipe != nil && !p.syncClosed { + errorhandling.CloseQuiet(p.syncPipe) + p.syncClosed = true + } + if p.startPipe != nil && !p.startClosed { + errorhandling.CloseQuiet(p.startPipe) + p.startClosed = true + } + if p.attachPipe != nil && !p.attachClosed { + errorhandling.CloseQuiet(p.attachPipe) + p.attachClosed = true + } +} + +// Start an exec session's conmon parent from the given options. +func (r *ConmonOCIRuntime) startExec(c *Container, sessionID string, options *ExecOptions, attachStdin bool, ociLog string) (_ *exec.Cmd, _ *execPipes, deferredErr error) { + pipes := new(execPipes) + + if options == nil { + return nil, nil, fmt.Errorf("must provide an ExecOptions struct to ExecContainer: %w", define.ErrInvalidArg) + } + if len(options.Cmd) == 0 { + return nil, nil, fmt.Errorf("must provide a command to execute: %w", define.ErrInvalidArg) + } + + if sessionID == "" { + return nil, nil, fmt.Errorf("must provide a session ID for exec: %w", define.ErrEmptyID) + } + + // create sync pipe to receive the pid + parentSyncPipe, childSyncPipe, err := newPipe() + if err != nil { + return nil, nil, fmt.Errorf("error creating socket pair: %w", err) + } + pipes.syncPipe = parentSyncPipe + + defer func() { + if deferredErr != nil { + pipes.cleanup() + } + }() + + // create start pipe to set the cgroup before running + // attachToExec is responsible for closing parentStartPipe + childStartPipe, parentStartPipe, err := newPipe() + if err != nil { + return nil, nil, fmt.Errorf("error creating socket pair: %w", err) + } + pipes.startPipe = parentStartPipe + + // create the attach pipe to allow attach socket to be created before + // $RUNTIME exec starts running. This is to make sure we can capture all output + // from the process through that socket, rather than half reading the log, half attaching to the socket + // attachToExec is responsible for closing parentAttachPipe + parentAttachPipe, childAttachPipe, err := newPipe() + if err != nil { + return nil, nil, fmt.Errorf("error creating socket pair: %w", err) + } + pipes.attachPipe = parentAttachPipe + + childrenClosed := false + defer func() { + if !childrenClosed { + errorhandling.CloseQuiet(childSyncPipe) + errorhandling.CloseQuiet(childAttachPipe) + errorhandling.CloseQuiet(childStartPipe) + } + }() + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return nil, nil, err + } + + finalEnv := make([]string, 0, len(options.Env)) + for k, v := range options.Env { + finalEnv = append(finalEnv, fmt.Sprintf("%s=%s", k, v)) + } + + processFile, err := prepareProcessExec(c, options, finalEnv, sessionID) + if err != nil { + return nil, nil, err + } + defer processFile.Close() + + args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog, define.NoLogging, c.config.LogTag) + + if options.PreserveFDs > 0 { + args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", options.PreserveFDs))...) + } + + if options.Terminal { + args = append(args, "-t") + } + + if attachStdin { + args = append(args, "-i") + } + + // Append container ID and command + args = append(args, "-e") + // TODO make this optional when we can detach + args = append(args, "--exec-attach") + args = append(args, "--exec-process-spec", processFile.Name()) + + if len(options.ExitCommand) > 0 { + args = append(args, "--exit-command", options.ExitCommand[0]) + for _, arg := range options.ExitCommand[1:] { + args = append(args, []string{"--exit-command-arg", arg}...) + } + if options.ExitCommandDelay > 0 { + args = append(args, []string{"--exit-delay", fmt.Sprintf("%d", options.ExitCommandDelay)}...) + } + } + + logrus.WithFields(logrus.Fields{ + "args": args, + }).Debugf("running conmon: %s", r.conmonPath) + execCmd := exec.Command(r.conmonPath, args...) + + // TODO: This is commented because it doesn't make much sense in HTTP + // attach, and I'm not certain it does for non-HTTP attach as well. + // if streams != nil { + // // Don't add the InputStream to the execCmd. Instead, the data should be passed + // // through CopyDetachable + // if streams.AttachOutput { + // execCmd.Stdout = options.Streams.OutputStream + // } + // if streams.AttachError { + // execCmd.Stderr = options.Streams.ErrorStream + // } + // } + + conmonEnv := r.configureConmonEnv(runtimeDir) + + var filesToClose []*os.File + if options.PreserveFDs > 0 { + for fd := 3; fd < int(3+options.PreserveFDs); fd++ { + f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)) + filesToClose = append(filesToClose, f) + execCmd.ExtraFiles = append(execCmd.ExtraFiles, f) + } + } + + // we don't want to step on users fds they asked to preserve + // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 + execCmd.Env = r.conmonEnv + execCmd.Env = append(execCmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", options.PreserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", options.PreserveFDs+4), fmt.Sprintf("_OCI_ATTACHPIPE=%d", options.PreserveFDs+5)) + execCmd.Env = append(execCmd.Env, conmonEnv...) + + execCmd.ExtraFiles = append(execCmd.ExtraFiles, childSyncPipe, childStartPipe, childAttachPipe) + execCmd.Dir = c.execBundlePath(sessionID) + execCmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + + err = startCommand(execCmd, c) + + // We don't need children pipes on the parent side + errorhandling.CloseQuiet(childSyncPipe) + errorhandling.CloseQuiet(childAttachPipe) + errorhandling.CloseQuiet(childStartPipe) + childrenClosed = true + + if err != nil { + return nil, nil, fmt.Errorf("cannot start container %s: %w", c.ID(), err) + } + if err := r.moveConmonToCgroupAndSignal(c, execCmd, parentStartPipe); err != nil { + return nil, nil, err + } + + // These fds were passed down to the runtime. Close them + // and not interfere + for _, f := range filesToClose { + errorhandling.CloseQuiet(f) + } + + return execCmd, pipes, nil +} + +// Attach to a container over HTTP +func attachExecHTTP(c *Container, sessionID string, r *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, pipes *execPipes, detachKeys []byte, isTerminal bool, cancel <-chan bool, hijackDone chan<- bool, holdConnOpen <-chan bool, execCmd *exec.Cmd, conmonPipeDataChan chan<- conmonPipeData, ociLog string, newSize *resize.TerminalSize, runtimeName string) (deferredErr error) { + // NOTE: As you may notice, the attach code is quite complex. + // Many things happen concurrently and yet are interdependent. + // If you ever change this function, make sure to write to the + // conmonPipeDataChan in case of an error. + + if pipes == nil || pipes.startPipe == nil || pipes.attachPipe == nil { + err := fmt.Errorf("must provide a start and attach pipe to finish an exec attach: %w", define.ErrInvalidArg) + conmonPipeDataChan <- conmonPipeData{-1, err} + return err + } + + defer func() { + if !pipes.startClosed { + errorhandling.CloseQuiet(pipes.startPipe) + pipes.startClosed = true + } + if !pipes.attachClosed { + errorhandling.CloseQuiet(pipes.attachPipe) + pipes.attachClosed = true + } + }() + + logrus.Debugf("Attaching to container %s exec session %s", c.ID(), sessionID) + + // set up the socket path, such that it is the correct length and location for exec + sockPath, err := c.execAttachSocketPath(sessionID) + if err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + return err + } + + // 2: read from attachFd that the parent process has set up the console socket + if _, err := readConmonPipeData(runtimeName, pipes.attachPipe, ""); err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + return err + } + + // resize before we start the container process + if newSize != nil { + err = c.ociRuntime.ExecAttachResize(c, sessionID, *newSize) + if err != nil { + logrus.Warnf("Resize failed: %v", err) + } + } + + // 2: then attach + conn, err := openUnixSocket(sockPath) + if err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + return fmt.Errorf("failed to connect to container's attach socket: %v: %w", sockPath, err) + } + defer func() { + if err := conn.Close(); err != nil { + logrus.Errorf("Unable to close socket: %q", err) + } + }() + + attachStdout := true + attachStderr := true + attachStdin := true + if streams != nil { + attachStdout = streams.Stdout + attachStderr = streams.Stderr + attachStdin = streams.Stdin + } + + // Perform hijack + hijacker, ok := w.(http.Hijacker) + if !ok { + conmonPipeDataChan <- conmonPipeData{-1, err} + return errors.New("unable to hijack connection") + } + + httpCon, httpBuf, err := hijacker.Hijack() + if err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + return fmt.Errorf("error hijacking connection: %w", err) + } + + hijackDone <- true + + // Write a header to let the client know what happened + writeHijackHeader(r, httpBuf) + + // Force a flush after the header is written. + if err := httpBuf.Flush(); err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + return fmt.Errorf("error flushing HTTP hijack header: %w", err) + } + + go func() { + // Wait for conmon to succeed, when return. + if err := execCmd.Wait(); err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + } else { + pid, err := readConmonPipeData(runtimeName, pipes.syncPipe, ociLog) + if err != nil { + hijackWriteError(err, c.ID(), isTerminal, httpBuf) + conmonPipeDataChan <- conmonPipeData{pid, err} + } else { + conmonPipeDataChan <- conmonPipeData{pid, err} + } + } + // We need to hold the connection open until the complete exec + // function has finished. This channel will be closed in a defer + // in that function, so we can wait for it here. + // Can't be a defer, because this would block the function from + // returning. + <-holdConnOpen + hijackWriteErrorAndClose(deferredErr, c.ID(), isTerminal, httpCon, httpBuf) + }() + + stdoutChan := make(chan error) + stdinChan := make(chan error) + + // Next, STDIN. Avoid entirely if attachStdin unset. + if attachStdin { + go func() { + logrus.Debugf("Beginning STDIN copy") + _, err := cutil.CopyDetachable(conn, httpBuf, detachKeys) + logrus.Debugf("STDIN copy completed") + stdinChan <- err + }() + } + + // 4: send start message to child + if err := writeConmonPipeData(pipes.startPipe); err != nil { + return err + } + + // Handle STDOUT/STDERR *after* start message is sent + go func() { + var err error + if isTerminal { + // Hack: return immediately if attachStdout not set to + // emulate Docker. + // Basically, when terminal is set, STDERR goes nowhere. + // Everything does over STDOUT. + // Therefore, if not attaching STDOUT - we'll never copy + // anything from here. + logrus.Debugf("Performing terminal HTTP attach for container %s", c.ID()) + if attachStdout { + err = httpAttachTerminalCopy(conn, httpBuf, c.ID()) + } + } else { + logrus.Debugf("Performing non-terminal HTTP attach for container %s", c.ID()) + err = httpAttachNonTerminalCopy(conn, httpBuf, c.ID(), attachStdin, attachStdout, attachStderr) + } + stdoutChan <- err + logrus.Debugf("STDOUT/ERR copy completed") + }() + + for { + select { + case err := <-stdoutChan: + if err != nil { + return err + } + + return nil + case err := <-stdinChan: + if err != nil { + return err + } + // copy stdin is done, close it + if connErr := conn.CloseWrite(); connErr != nil { + logrus.Errorf("Unable to close conn: %v", connErr) + } + case <-cancel: + return nil + } + } +} + +// prepareProcessExec returns the path of the process.json used in runc exec -p +// caller is responsible to close the returned *os.File if needed. +func prepareProcessExec(c *Container, options *ExecOptions, env []string, sessionID string) (*os.File, error) { + f, err := ioutil.TempFile(c.execBundlePath(sessionID), "exec-process-") + if err != nil { + return nil, err + } + pspec := new(spec.Process) + if err := JSONDeepCopy(c.config.Spec.Process, pspec); err != nil { + return nil, err + } + pspec.SelinuxLabel = c.config.ProcessLabel + pspec.Args = options.Cmd + + // We need to default this to false else it will inherit terminal as true + // from the container. + pspec.Terminal = false + if options.Terminal { + pspec.Terminal = true + } + if len(env) > 0 { + pspec.Env = append(pspec.Env, env...) + } + + // Add secret envs if they exist + manager, err := c.runtime.SecretsManager() + if err != nil { + return nil, err + } + for name, secr := range c.config.EnvSecrets { + _, data, err := manager.LookupSecretData(secr.Name) + if err != nil { + return nil, err + } + pspec.Env = append(pspec.Env, fmt.Sprintf("%s=%s", name, string(data))) + } + + if options.Cwd != "" { + pspec.Cwd = options.Cwd + } + + var addGroups []string + var sgids []uint32 + + // if the user is empty, we should inherit the user that the container is currently running with + user := options.User + if user == "" { + logrus.Debugf("Set user to %s", c.config.User) + user = c.config.User + addGroups = c.config.Groups + } + + overrides := c.getUserOverrides() + execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, user, overrides) + if err != nil { + return nil, err + } + + if len(addGroups) > 0 { + sgids, err = lookup.GetContainerGroups(addGroups, c.state.Mountpoint, overrides) + if err != nil { + return nil, fmt.Errorf("error looking up supplemental groups for container %s exec session %s: %w", c.ID(), sessionID, err) + } + } + + // If user was set, look it up in the container to get a UID to use on + // the host + if user != "" || len(sgids) > 0 { + if user != "" { + for _, sgid := range execUser.Sgids { + sgids = append(sgids, uint32(sgid)) + } + } + processUser := spec.User{ + UID: uint32(execUser.Uid), + GID: uint32(execUser.Gid), + AdditionalGids: sgids, + } + + pspec.User = processUser + } + + ctrSpec, err := c.specFromState() + if err != nil { + return nil, err + } + + allCaps, err := capabilities.BoundingSet() + if err != nil { + return nil, err + } + if options.Privileged { + pspec.Capabilities.Bounding = allCaps + } else { + pspec.Capabilities.Bounding = ctrSpec.Process.Capabilities.Bounding + } + + // Always unset the inheritable capabilities similarly to what the Linux kernel does + // They are used only when using capabilities with uid != 0. + pspec.Capabilities.Inheritable = []string{} + + if execUser.Uid == 0 { + pspec.Capabilities.Effective = pspec.Capabilities.Bounding + pspec.Capabilities.Permitted = pspec.Capabilities.Bounding + } else if user == c.config.User { + pspec.Capabilities.Effective = ctrSpec.Process.Capabilities.Effective + pspec.Capabilities.Inheritable = ctrSpec.Process.Capabilities.Effective + pspec.Capabilities.Permitted = ctrSpec.Process.Capabilities.Effective + pspec.Capabilities.Ambient = ctrSpec.Process.Capabilities.Effective + } + + hasHomeSet := false + for _, s := range pspec.Env { + if strings.HasPrefix(s, "HOME=") { + hasHomeSet = true + break + } + } + if !hasHomeSet { + pspec.Env = append(pspec.Env, fmt.Sprintf("HOME=%s", execUser.Home)) + } + + processJSON, err := json.Marshal(pspec) + if err != nil { + return nil, err + } + + if err := ioutil.WriteFile(f.Name(), processJSON, 0644); err != nil { + return nil, err + } + return f, nil +} diff --git a/libpod/oci_conmon_exec_linux.go b/libpod/oci_conmon_exec_linux.go deleted file mode 100644 index 16cd7ef9f..000000000 --- a/libpod/oci_conmon_exec_linux.go +++ /dev/null @@ -1,797 +0,0 @@ -package libpod - -import ( - "errors" - "fmt" - "io/ioutil" - "net/http" - "os" - "os/exec" - "path/filepath" - "strings" - "syscall" - "time" - - "github.com/containers/common/pkg/capabilities" - "github.com/containers/common/pkg/config" - "github.com/containers/common/pkg/resize" - cutil "github.com/containers/common/pkg/util" - "github.com/containers/podman/v4/libpod/define" - "github.com/containers/podman/v4/pkg/errorhandling" - "github.com/containers/podman/v4/pkg/lookup" - "github.com/containers/podman/v4/pkg/util" - spec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" -) - -// ExecContainer executes a command in a running container -func (r *ConmonOCIRuntime) ExecContainer(c *Container, sessionID string, options *ExecOptions, streams *define.AttachStreams, newSize *resize.TerminalSize) (int, chan error, error) { - if options == nil { - return -1, nil, fmt.Errorf("must provide an ExecOptions struct to ExecContainer: %w", define.ErrInvalidArg) - } - if len(options.Cmd) == 0 { - return -1, nil, fmt.Errorf("must provide a command to execute: %w", define.ErrInvalidArg) - } - - if sessionID == "" { - return -1, nil, fmt.Errorf("must provide a session ID for exec: %w", define.ErrEmptyID) - } - - // TODO: Should we default this to false? - // Or maybe make streams mandatory? - attachStdin := true - if streams != nil { - attachStdin = streams.AttachInput - } - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = c.execOCILog(sessionID) - } - - execCmd, pipes, err := r.startExec(c, sessionID, options, attachStdin, ociLog) - if err != nil { - return -1, nil, err - } - - // Only close sync pipe. Start and attach are consumed in the attach - // goroutine. - defer func() { - if pipes.syncPipe != nil && !pipes.syncClosed { - errorhandling.CloseQuiet(pipes.syncPipe) - pipes.syncClosed = true - } - }() - - // TODO Only create if !detach - // Attach to the container before starting it - attachChan := make(chan error) - go func() { - // attachToExec is responsible for closing pipes - attachChan <- c.attachToExec(streams, options.DetachKeys, sessionID, pipes.startPipe, pipes.attachPipe, newSize) - close(attachChan) - }() - - if err := execCmd.Wait(); err != nil { - return -1, nil, fmt.Errorf("cannot run conmon: %w", err) - } - - pid, err := readConmonPipeData(r.name, pipes.syncPipe, ociLog) - - return pid, attachChan, err -} - -// ExecContainerHTTP executes a new command in an existing container and -// forwards its standard streams over an attach -func (r *ConmonOCIRuntime) ExecContainerHTTP(ctr *Container, sessionID string, options *ExecOptions, req *http.Request, w http.ResponseWriter, - streams *HTTPAttachStreams, cancel <-chan bool, hijackDone chan<- bool, holdConnOpen <-chan bool, newSize *resize.TerminalSize) (int, chan error, error) { - if streams != nil { - if !streams.Stdin && !streams.Stdout && !streams.Stderr { - return -1, nil, fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) - } - } - - if options == nil { - return -1, nil, fmt.Errorf("must provide exec options to ExecContainerHTTP: %w", define.ErrInvalidArg) - } - - detachString := config.DefaultDetachKeys - if options.DetachKeys != nil { - detachString = *options.DetachKeys - } - detachKeys, err := processDetachKeys(detachString) - if err != nil { - return -1, nil, err - } - - // TODO: Should we default this to false? - // Or maybe make streams mandatory? - attachStdin := true - if streams != nil { - attachStdin = streams.Stdin - } - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = ctr.execOCILog(sessionID) - } - - execCmd, pipes, err := r.startExec(ctr, sessionID, options, attachStdin, ociLog) - if err != nil { - return -1, nil, err - } - - // Only close sync pipe. Start and attach are consumed in the attach - // goroutine. - defer func() { - if pipes.syncPipe != nil && !pipes.syncClosed { - errorhandling.CloseQuiet(pipes.syncPipe) - pipes.syncClosed = true - } - }() - - attachChan := make(chan error) - conmonPipeDataChan := make(chan conmonPipeData) - go func() { - // attachToExec is responsible for closing pipes - attachChan <- attachExecHTTP(ctr, sessionID, req, w, streams, pipes, detachKeys, options.Terminal, cancel, hijackDone, holdConnOpen, execCmd, conmonPipeDataChan, ociLog, newSize, r.name) - close(attachChan) - }() - - // NOTE: the channel is needed to communicate conmon's data. In case - // of an error, the error will be written on the hijacked http - // connection such that remote clients will receive the error. - pipeData := <-conmonPipeDataChan - - return pipeData.pid, attachChan, pipeData.err -} - -// conmonPipeData contains the data when reading from conmon's pipe. -type conmonPipeData struct { - pid int - err error -} - -// ExecContainerDetached executes a command in a running container, but does -// not attach to it. -func (r *ConmonOCIRuntime) ExecContainerDetached(ctr *Container, sessionID string, options *ExecOptions, stdin bool) (int, error) { - if options == nil { - return -1, fmt.Errorf("must provide exec options to ExecContainerHTTP: %w", define.ErrInvalidArg) - } - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = ctr.execOCILog(sessionID) - } - - execCmd, pipes, err := r.startExec(ctr, sessionID, options, stdin, ociLog) - if err != nil { - return -1, err - } - - defer func() { - pipes.cleanup() - }() - - // Wait for Conmon to tell us we're ready to attach. - // We aren't actually *going* to attach, but this means that we're good - // to proceed. - if _, err := readConmonPipeData(r.name, pipes.attachPipe, ""); err != nil { - return -1, err - } - - // Start the exec session - if err := writeConmonPipeData(pipes.startPipe); err != nil { - return -1, err - } - - // Wait for conmon to succeed, when return. - if err := execCmd.Wait(); err != nil { - return -1, fmt.Errorf("cannot run conmon: %w", err) - } - - pid, err := readConmonPipeData(r.name, pipes.syncPipe, ociLog) - - return pid, err -} - -// ExecAttachResize resizes the TTY of the given exec session. -func (r *ConmonOCIRuntime) ExecAttachResize(ctr *Container, sessionID string, newSize resize.TerminalSize) error { - controlFile, err := openControlFile(ctr, ctr.execBundlePath(sessionID)) - if err != nil { - return err - } - defer controlFile.Close() - - if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil { - return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err) - } - - return nil -} - -// ExecStopContainer stops a given exec session in a running container. -func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, timeout uint) error { - pid, err := ctr.getExecSessionPID(sessionID) - if err != nil { - return err - } - - logrus.Debugf("Going to stop container %s exec session %s", ctr.ID(), sessionID) - - // Is the session dead? - // Ping the PID with signal 0 to see if it still exists. - if err := unix.Kill(pid, 0); err != nil { - if err == unix.ESRCH { - return nil - } - return fmt.Errorf("error pinging container %s exec session %s PID %d with signal 0: %w", ctr.ID(), sessionID, pid, err) - } - - if timeout > 0 { - // Use SIGTERM by default, then SIGSTOP after timeout. - logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGTERM", sessionID, pid, ctr.ID()) - if err := unix.Kill(pid, unix.SIGTERM); err != nil { - if err == unix.ESRCH { - return nil - } - return fmt.Errorf("error killing container %s exec session %s PID %d with SIGTERM: %w", ctr.ID(), sessionID, pid, err) - } - - // Wait for the PID to stop - if err := waitPidStop(pid, time.Duration(timeout)*time.Second); err != nil { - logrus.Infof("Timed out waiting for container %s exec session %s to stop, resorting to SIGKILL: %v", ctr.ID(), sessionID, err) - } else { - // No error, container is dead - return nil - } - } - - // SIGTERM did not work. On to SIGKILL. - logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID()) - if err := unix.Kill(pid, unix.SIGTERM); err != nil { - if err == unix.ESRCH { - return nil - } - return fmt.Errorf("error killing container %s exec session %s PID %d with SIGKILL: %w", ctr.ID(), sessionID, pid, err) - } - - // Wait for the PID to stop - if err := waitPidStop(pid, killContainerTimeout); err != nil { - return fmt.Errorf("timed out waiting for container %s exec session %s PID %d to stop after SIGKILL: %w", ctr.ID(), sessionID, pid, err) - } - - return nil -} - -// ExecUpdateStatus checks if the given exec session is still running. -func (r *ConmonOCIRuntime) ExecUpdateStatus(ctr *Container, sessionID string) (bool, error) { - pid, err := ctr.getExecSessionPID(sessionID) - if err != nil { - return false, err - } - - logrus.Debugf("Checking status of container %s exec session %s", ctr.ID(), sessionID) - - // Is the session dead? - // Ping the PID with signal 0 to see if it still exists. - if err := unix.Kill(pid, 0); err != nil { - if err == unix.ESRCH { - return false, nil - } - return false, fmt.Errorf("error pinging container %s exec session %s PID %d with signal 0: %w", ctr.ID(), sessionID, pid, err) - } - - return true, nil -} - -// ExecAttachSocketPath is the path to a container's exec session attach socket. -func (r *ConmonOCIRuntime) ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) { - // We don't even use container, so don't validity check it - if sessionID == "" { - return "", fmt.Errorf("must provide a valid session ID to get attach socket path: %w", define.ErrInvalidArg) - } - - return filepath.Join(ctr.execBundlePath(sessionID), "attach"), nil -} - -// This contains pipes used by the exec API. -type execPipes struct { - syncPipe *os.File - syncClosed bool - startPipe *os.File - startClosed bool - attachPipe *os.File - attachClosed bool -} - -func (p *execPipes) cleanup() { - if p.syncPipe != nil && !p.syncClosed { - errorhandling.CloseQuiet(p.syncPipe) - p.syncClosed = true - } - if p.startPipe != nil && !p.startClosed { - errorhandling.CloseQuiet(p.startPipe) - p.startClosed = true - } - if p.attachPipe != nil && !p.attachClosed { - errorhandling.CloseQuiet(p.attachPipe) - p.attachClosed = true - } -} - -// Start an exec session's conmon parent from the given options. -func (r *ConmonOCIRuntime) startExec(c *Container, sessionID string, options *ExecOptions, attachStdin bool, ociLog string) (_ *exec.Cmd, _ *execPipes, deferredErr error) { - pipes := new(execPipes) - - if options == nil { - return nil, nil, fmt.Errorf("must provide an ExecOptions struct to ExecContainer: %w", define.ErrInvalidArg) - } - if len(options.Cmd) == 0 { - return nil, nil, fmt.Errorf("must provide a command to execute: %w", define.ErrInvalidArg) - } - - if sessionID == "" { - return nil, nil, fmt.Errorf("must provide a session ID for exec: %w", define.ErrEmptyID) - } - - // create sync pipe to receive the pid - parentSyncPipe, childSyncPipe, err := newPipe() - if err != nil { - return nil, nil, fmt.Errorf("error creating socket pair: %w", err) - } - pipes.syncPipe = parentSyncPipe - - defer func() { - if deferredErr != nil { - pipes.cleanup() - } - }() - - // create start pipe to set the cgroup before running - // attachToExec is responsible for closing parentStartPipe - childStartPipe, parentStartPipe, err := newPipe() - if err != nil { - return nil, nil, fmt.Errorf("error creating socket pair: %w", err) - } - pipes.startPipe = parentStartPipe - - // create the attach pipe to allow attach socket to be created before - // $RUNTIME exec starts running. This is to make sure we can capture all output - // from the process through that socket, rather than half reading the log, half attaching to the socket - // attachToExec is responsible for closing parentAttachPipe - parentAttachPipe, childAttachPipe, err := newPipe() - if err != nil { - return nil, nil, fmt.Errorf("error creating socket pair: %w", err) - } - pipes.attachPipe = parentAttachPipe - - childrenClosed := false - defer func() { - if !childrenClosed { - errorhandling.CloseQuiet(childSyncPipe) - errorhandling.CloseQuiet(childAttachPipe) - errorhandling.CloseQuiet(childStartPipe) - } - }() - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return nil, nil, err - } - - finalEnv := make([]string, 0, len(options.Env)) - for k, v := range options.Env { - finalEnv = append(finalEnv, fmt.Sprintf("%s=%s", k, v)) - } - - processFile, err := prepareProcessExec(c, options, finalEnv, sessionID) - if err != nil { - return nil, nil, err - } - defer processFile.Close() - - args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog, define.NoLogging, c.config.LogTag) - - if options.PreserveFDs > 0 { - args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", options.PreserveFDs))...) - } - - if options.Terminal { - args = append(args, "-t") - } - - if attachStdin { - args = append(args, "-i") - } - - // Append container ID and command - args = append(args, "-e") - // TODO make this optional when we can detach - args = append(args, "--exec-attach") - args = append(args, "--exec-process-spec", processFile.Name()) - - if len(options.ExitCommand) > 0 { - args = append(args, "--exit-command", options.ExitCommand[0]) - for _, arg := range options.ExitCommand[1:] { - args = append(args, []string{"--exit-command-arg", arg}...) - } - if options.ExitCommandDelay > 0 { - args = append(args, []string{"--exit-delay", fmt.Sprintf("%d", options.ExitCommandDelay)}...) - } - } - - logrus.WithFields(logrus.Fields{ - "args": args, - }).Debugf("running conmon: %s", r.conmonPath) - execCmd := exec.Command(r.conmonPath, args...) - - // TODO: This is commented because it doesn't make much sense in HTTP - // attach, and I'm not certain it does for non-HTTP attach as well. - // if streams != nil { - // // Don't add the InputStream to the execCmd. Instead, the data should be passed - // // through CopyDetachable - // if streams.AttachOutput { - // execCmd.Stdout = options.Streams.OutputStream - // } - // if streams.AttachError { - // execCmd.Stderr = options.Streams.ErrorStream - // } - // } - - conmonEnv := r.configureConmonEnv(runtimeDir) - - var filesToClose []*os.File - if options.PreserveFDs > 0 { - for fd := 3; fd < int(3+options.PreserveFDs); fd++ { - f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)) - filesToClose = append(filesToClose, f) - execCmd.ExtraFiles = append(execCmd.ExtraFiles, f) - } - } - - // we don't want to step on users fds they asked to preserve - // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 - execCmd.Env = r.conmonEnv - execCmd.Env = append(execCmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", options.PreserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", options.PreserveFDs+4), fmt.Sprintf("_OCI_ATTACHPIPE=%d", options.PreserveFDs+5)) - execCmd.Env = append(execCmd.Env, conmonEnv...) - - execCmd.ExtraFiles = append(execCmd.ExtraFiles, childSyncPipe, childStartPipe, childAttachPipe) - execCmd.Dir = c.execBundlePath(sessionID) - execCmd.SysProcAttr = &syscall.SysProcAttr{ - Setpgid: true, - } - - err = startCommand(execCmd, c) - - // We don't need children pipes on the parent side - errorhandling.CloseQuiet(childSyncPipe) - errorhandling.CloseQuiet(childAttachPipe) - errorhandling.CloseQuiet(childStartPipe) - childrenClosed = true - - if err != nil { - return nil, nil, fmt.Errorf("cannot start container %s: %w", c.ID(), err) - } - if err := r.moveConmonToCgroupAndSignal(c, execCmd, parentStartPipe); err != nil { - return nil, nil, err - } - - // These fds were passed down to the runtime. Close them - // and not interfere - for _, f := range filesToClose { - errorhandling.CloseQuiet(f) - } - - return execCmd, pipes, nil -} - -// Attach to a container over HTTP -func attachExecHTTP(c *Container, sessionID string, r *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, pipes *execPipes, detachKeys []byte, isTerminal bool, cancel <-chan bool, hijackDone chan<- bool, holdConnOpen <-chan bool, execCmd *exec.Cmd, conmonPipeDataChan chan<- conmonPipeData, ociLog string, newSize *resize.TerminalSize, runtimeName string) (deferredErr error) { - // NOTE: As you may notice, the attach code is quite complex. - // Many things happen concurrently and yet are interdependent. - // If you ever change this function, make sure to write to the - // conmonPipeDataChan in case of an error. - - if pipes == nil || pipes.startPipe == nil || pipes.attachPipe == nil { - err := fmt.Errorf("must provide a start and attach pipe to finish an exec attach: %w", define.ErrInvalidArg) - conmonPipeDataChan <- conmonPipeData{-1, err} - return err - } - - defer func() { - if !pipes.startClosed { - errorhandling.CloseQuiet(pipes.startPipe) - pipes.startClosed = true - } - if !pipes.attachClosed { - errorhandling.CloseQuiet(pipes.attachPipe) - pipes.attachClosed = true - } - }() - - logrus.Debugf("Attaching to container %s exec session %s", c.ID(), sessionID) - - // set up the socket path, such that it is the correct length and location for exec - sockPath, err := c.execAttachSocketPath(sessionID) - if err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - return err - } - - // 2: read from attachFd that the parent process has set up the console socket - if _, err := readConmonPipeData(runtimeName, pipes.attachPipe, ""); err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - return err - } - - // resize before we start the container process - if newSize != nil { - err = c.ociRuntime.ExecAttachResize(c, sessionID, *newSize) - if err != nil { - logrus.Warnf("Resize failed: %v", err) - } - } - - // 2: then attach - conn, err := openUnixSocket(sockPath) - if err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - return fmt.Errorf("failed to connect to container's attach socket: %v: %w", sockPath, err) - } - defer func() { - if err := conn.Close(); err != nil { - logrus.Errorf("Unable to close socket: %q", err) - } - }() - - attachStdout := true - attachStderr := true - attachStdin := true - if streams != nil { - attachStdout = streams.Stdout - attachStderr = streams.Stderr - attachStdin = streams.Stdin - } - - // Perform hijack - hijacker, ok := w.(http.Hijacker) - if !ok { - conmonPipeDataChan <- conmonPipeData{-1, err} - return errors.New("unable to hijack connection") - } - - httpCon, httpBuf, err := hijacker.Hijack() - if err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - return fmt.Errorf("error hijacking connection: %w", err) - } - - hijackDone <- true - - // Write a header to let the client know what happened - writeHijackHeader(r, httpBuf) - - // Force a flush after the header is written. - if err := httpBuf.Flush(); err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - return fmt.Errorf("error flushing HTTP hijack header: %w", err) - } - - go func() { - // Wait for conmon to succeed, when return. - if err := execCmd.Wait(); err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - } else { - pid, err := readConmonPipeData(runtimeName, pipes.syncPipe, ociLog) - if err != nil { - hijackWriteError(err, c.ID(), isTerminal, httpBuf) - conmonPipeDataChan <- conmonPipeData{pid, err} - } else { - conmonPipeDataChan <- conmonPipeData{pid, err} - } - } - // We need to hold the connection open until the complete exec - // function has finished. This channel will be closed in a defer - // in that function, so we can wait for it here. - // Can't be a defer, because this would block the function from - // returning. - <-holdConnOpen - hijackWriteErrorAndClose(deferredErr, c.ID(), isTerminal, httpCon, httpBuf) - }() - - stdoutChan := make(chan error) - stdinChan := make(chan error) - - // Next, STDIN. Avoid entirely if attachStdin unset. - if attachStdin { - go func() { - logrus.Debugf("Beginning STDIN copy") - _, err := cutil.CopyDetachable(conn, httpBuf, detachKeys) - logrus.Debugf("STDIN copy completed") - stdinChan <- err - }() - } - - // 4: send start message to child - if err := writeConmonPipeData(pipes.startPipe); err != nil { - return err - } - - // Handle STDOUT/STDERR *after* start message is sent - go func() { - var err error - if isTerminal { - // Hack: return immediately if attachStdout not set to - // emulate Docker. - // Basically, when terminal is set, STDERR goes nowhere. - // Everything does over STDOUT. - // Therefore, if not attaching STDOUT - we'll never copy - // anything from here. - logrus.Debugf("Performing terminal HTTP attach for container %s", c.ID()) - if attachStdout { - err = httpAttachTerminalCopy(conn, httpBuf, c.ID()) - } - } else { - logrus.Debugf("Performing non-terminal HTTP attach for container %s", c.ID()) - err = httpAttachNonTerminalCopy(conn, httpBuf, c.ID(), attachStdin, attachStdout, attachStderr) - } - stdoutChan <- err - logrus.Debugf("STDOUT/ERR copy completed") - }() - - for { - select { - case err := <-stdoutChan: - if err != nil { - return err - } - - return nil - case err := <-stdinChan: - if err != nil { - return err - } - // copy stdin is done, close it - if connErr := conn.CloseWrite(); connErr != nil { - logrus.Errorf("Unable to close conn: %v", connErr) - } - case <-cancel: - return nil - } - } -} - -// prepareProcessExec returns the path of the process.json used in runc exec -p -// caller is responsible to close the returned *os.File if needed. -func prepareProcessExec(c *Container, options *ExecOptions, env []string, sessionID string) (*os.File, error) { - f, err := ioutil.TempFile(c.execBundlePath(sessionID), "exec-process-") - if err != nil { - return nil, err - } - pspec := new(spec.Process) - if err := JSONDeepCopy(c.config.Spec.Process, pspec); err != nil { - return nil, err - } - pspec.SelinuxLabel = c.config.ProcessLabel - pspec.Args = options.Cmd - - // We need to default this to false else it will inherit terminal as true - // from the container. - pspec.Terminal = false - if options.Terminal { - pspec.Terminal = true - } - if len(env) > 0 { - pspec.Env = append(pspec.Env, env...) - } - - // Add secret envs if they exist - manager, err := c.runtime.SecretsManager() - if err != nil { - return nil, err - } - for name, secr := range c.config.EnvSecrets { - _, data, err := manager.LookupSecretData(secr.Name) - if err != nil { - return nil, err - } - pspec.Env = append(pspec.Env, fmt.Sprintf("%s=%s", name, string(data))) - } - - if options.Cwd != "" { - pspec.Cwd = options.Cwd - } - - var addGroups []string - var sgids []uint32 - - // if the user is empty, we should inherit the user that the container is currently running with - user := options.User - if user == "" { - logrus.Debugf("Set user to %s", c.config.User) - user = c.config.User - addGroups = c.config.Groups - } - - overrides := c.getUserOverrides() - execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, user, overrides) - if err != nil { - return nil, err - } - - if len(addGroups) > 0 { - sgids, err = lookup.GetContainerGroups(addGroups, c.state.Mountpoint, overrides) - if err != nil { - return nil, fmt.Errorf("error looking up supplemental groups for container %s exec session %s: %w", c.ID(), sessionID, err) - } - } - - // If user was set, look it up in the container to get a UID to use on - // the host - if user != "" || len(sgids) > 0 { - if user != "" { - for _, sgid := range execUser.Sgids { - sgids = append(sgids, uint32(sgid)) - } - } - processUser := spec.User{ - UID: uint32(execUser.Uid), - GID: uint32(execUser.Gid), - AdditionalGids: sgids, - } - - pspec.User = processUser - } - - ctrSpec, err := c.specFromState() - if err != nil { - return nil, err - } - - allCaps, err := capabilities.BoundingSet() - if err != nil { - return nil, err - } - if options.Privileged { - pspec.Capabilities.Bounding = allCaps - } else { - pspec.Capabilities.Bounding = ctrSpec.Process.Capabilities.Bounding - } - - // Always unset the inheritable capabilities similarly to what the Linux kernel does - // They are used only when using capabilities with uid != 0. - pspec.Capabilities.Inheritable = []string{} - - if execUser.Uid == 0 { - pspec.Capabilities.Effective = pspec.Capabilities.Bounding - pspec.Capabilities.Permitted = pspec.Capabilities.Bounding - } else if user == c.config.User { - pspec.Capabilities.Effective = ctrSpec.Process.Capabilities.Effective - pspec.Capabilities.Inheritable = ctrSpec.Process.Capabilities.Effective - pspec.Capabilities.Permitted = ctrSpec.Process.Capabilities.Effective - pspec.Capabilities.Ambient = ctrSpec.Process.Capabilities.Effective - } - - hasHomeSet := false - for _, s := range pspec.Env { - if strings.HasPrefix(s, "HOME=") { - hasHomeSet = true - break - } - } - if !hasHomeSet { - pspec.Env = append(pspec.Env, fmt.Sprintf("HOME=%s", execUser.Home)) - } - - processJSON, err := json.Marshal(pspec) - if err != nil { - return nil, err - } - - if err := ioutil.WriteFile(f.Name(), processJSON, 0644); err != nil { - return nil, err - } - return f, nil -} -- cgit v1.2.3-54-g00ecf From 6791cdbdf153a0b3103810679995cc09ea8db340 Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 10:29:40 +0100 Subject: libpod: Move rootless handling from oci_conmon_common.go to oci_conmon_linux.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_common.go | 56 +----------------------------------- libpod/oci_conmon_linux.go | 70 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 55 deletions(-) create mode 100644 libpod/oci_conmon_linux.go (limited to 'libpod') diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 1b654ed33..4ca2d6e34 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -41,7 +41,6 @@ import ( "github.com/containers/podman/v4/pkg/util" "github.com/containers/podman/v4/utils" "github.com/containers/storage/pkg/homedir" - pmount "github.com/containers/storage/pkg/mount" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" @@ -204,60 +203,7 @@ func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *Conta // if we are running a non privileged container, be sure to umount some kernel paths so they are not // bind mounted inside the container at all. if !ctr.config.Privileged && !rootless.IsRootless() { - type result struct { - restoreDuration int64 - err error - } - ch := make(chan result) - go func() { - runtime.LockOSThread() - restoreDuration, err := func() (int64, error) { - fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) - if err != nil { - return 0, err - } - defer errorhandling.CloseQuiet(fd) - - // create a new mountns on the current thread - if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { - return 0, err - } - defer func() { - if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { - logrus.Errorf("Unable to clone new namespace: %q", err) - } - }() - - // don't spread our mounts around. We are setting only /sys to be slave - // so that the cleanup process is still able to umount the storage and the - // changes are propagated to the host. - err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") - if err != nil { - return 0, fmt.Errorf("cannot make /sys slave: %w", err) - } - - mounts, err := pmount.GetMounts() - if err != nil { - return 0, err - } - for _, m := range mounts { - if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { - continue - } - err = unix.Unmount(m.Mountpoint, 0) - if err != nil && !os.IsNotExist(err) { - return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) - } - } - return r.createOCIContainer(ctr, restoreOptions) - }() - ch <- result{ - restoreDuration: restoreDuration, - err: err, - } - }() - r := <-ch - return r.restoreDuration, r.err + return r.createRootlessContainer(ctr, restoreOptions) } } return r.createOCIContainer(ctr, restoreOptions) diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go new file mode 100644 index 000000000..4e8bbafd6 --- /dev/null +++ b/libpod/oci_conmon_linux.go @@ -0,0 +1,70 @@ +package libpod + +import ( + "fmt" + "os" + "runtime" + "strings" + + "github.com/containers/podman/v4/pkg/errorhandling" + pmount "github.com/containers/storage/pkg/mount" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + type result struct { + restoreDuration int64 + err error + } + ch := make(chan result) + go func() { + runtime.LockOSThread() + restoreDuration, err := func() (int64, error) { + fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) + if err != nil { + return 0, err + } + defer errorhandling.CloseQuiet(fd) + + // create a new mountns on the current thread + if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { + return 0, err + } + defer func() { + if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { + logrus.Errorf("Unable to clone new namespace: %q", err) + } + }() + + // don't spread our mounts around. We are setting only /sys to be slave + // so that the cleanup process is still able to umount the storage and the + // changes are propagated to the host. + err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") + if err != nil { + return 0, fmt.Errorf("cannot make /sys slave: %w", err) + } + + mounts, err := pmount.GetMounts() + if err != nil { + return 0, err + } + for _, m := range mounts { + if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { + continue + } + err = unix.Unmount(m.Mountpoint, 0) + if err != nil && !os.IsNotExist(err) { + return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) + } + } + return r.createOCIContainer(ctr, restoreOptions) + }() + ch <- result{ + restoreDuration: restoreDuration, + err: err, + } + }() + res := <-ch + return res.restoreDuration, res.err +} -- cgit v1.2.3-54-g00ecf From 93bad904864aa71c45b6b72d217a752c05eb254b Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 10:30:30 +0100 Subject: libpod: Move socket label handling from oci_conmon_common.go to oci_conmon_linux.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_common.go | 24 +++++------------------- libpod/oci_conmon_linux.go | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 19 deletions(-) (limited to 'libpod') diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 4ca2d6e34..aee0c36c8 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -16,7 +16,6 @@ import ( "os" "os/exec" "path/filepath" - "runtime" "strconv" "strings" "sync" @@ -42,7 +41,6 @@ import ( "github.com/containers/podman/v4/utils" "github.com/containers/storage/pkg/homedir" spec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -763,23 +761,11 @@ func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options Container env = append(env, fmt.Sprintf("PATH=%s", path)) } - runtime.LockOSThread() - if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { - return 0, err - } - - runtimeCheckpointStarted := time.Now() - err = utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...) - // Ignore error returned from SetSocketLabel("") call, - // can't recover. - if labelErr := label.SetSocketLabel(""); labelErr == nil { - // Unlock the thread only if the process label could be restored - // successfully. Otherwise leave the thread locked and the Go runtime - // will terminate it once it returns to the threads pool. - runtime.UnlockOSThread() - } else { - logrus.Errorf("Unable to reset socket label: %q", labelErr) - } + var runtimeCheckpointStarted time.Time + err = r.withContainerSocketLabel(ctr, func() error { + runtimeCheckpointStarted = time.Now() + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...) + }) runtimeCheckpointDuration := func() int64 { if options.PrintStats { diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index 4e8bbafd6..ce6eaf32a 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -8,6 +8,7 @@ import ( "github.com/containers/podman/v4/pkg/errorhandling" pmount "github.com/containers/storage/pkg/mount" + "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -68,3 +69,23 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption res := <-ch return res.restoreDuration, res.err } + +// Run the closure with the container's socket label set +func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func() error) error { + runtime.LockOSThread() + if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { + return err + } + err := closure() + // Ignore error returned from SetSocketLabel("") call, + // can't recover. + if labelErr := label.SetSocketLabel(""); labelErr == nil { + // Unlock the thread only if the process label could be restored + // successfully. Otherwise leave the thread locked and the Go runtime + // will terminate it once it returns to the threads pool. + runtime.UnlockOSThread() + } else { + logrus.Errorf("Unable to reset socket label: %q", labelErr) + } + return err +} -- cgit v1.2.3-54-g00ecf From d43fac20f3025096cdfe45ae32f41886b39e4659 Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 11:15:37 +0100 Subject: libpod: Move moveConmonToCgroupAndSignal and GetLimits to oci_conmon_linux.go [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_common.go | 261 ------------------------------------------- libpod/oci_conmon_linux.go | 267 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 267 insertions(+), 261 deletions(-) (limited to 'libpod') diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index aee0c36c8..222fec9ca 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -23,10 +23,6 @@ import ( "text/template" "time" - runcconfig "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" - - "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/config" "github.com/containers/common/pkg/resize" cutil "github.com/containers/common/pkg/util" @@ -1338,75 +1334,6 @@ func startCommand(cmd *exec.Cmd, ctr *Container) error { return cmd.Start() } -// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup -// it then signals for conmon to start by sending nonce data down the start fd -func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { - mustCreateCgroup := true - - if ctr.config.NoCgroups { - mustCreateCgroup = false - } - - // If cgroup creation is disabled - just signal. - switch ctr.config.CgroupsMode { - case "disabled", "no-conmon", cgroupSplit: - mustCreateCgroup = false - } - - // $INVOCATION_ID is set by systemd when running as a service. - if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" { - mustCreateCgroup = false - } - - if mustCreateCgroup { - // Usually rootless users are not allowed to configure cgroupfs. - // There are cases though, where it is allowed, e.g. if the cgroup - // is manually configured and chowned). Avoid detecting all - // such cases and simply use a lower log level. - logLevel := logrus.WarnLevel - if rootless.IsRootless() { - logLevel = logrus.InfoLevel - } - // TODO: This should be a switch - we are not guaranteed that - // there are only 2 valid cgroup managers - cgroupParent := ctr.CgroupParent() - cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") - Resource := ctr.Spec().Linux.Resources - cgroupResources, err := GetLimits(Resource) - if err != nil { - logrus.StandardLogger().Log(logLevel, "Could not get ctr resources") - } - if ctr.CgroupManager() == config.SystemdCgroupsManager { - unitName := createUnitName("libpod-conmon", ctr.ID()) - realCgroupParent := cgroupParent - splitParent := strings.Split(cgroupParent, "/") - if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { - realCgroupParent = splitParent[len(splitParent)-1] - } - - logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) - if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { - logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err) - } - } else { - control, err := cgroups.New(cgroupPath, &cgroupResources) - if err != nil { - logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) - } else if err := control.AddPid(cmd.Process.Pid); err != nil { - // we need to remove this defer and delete the cgroup once conmon exits - // maybe need a conmon monitor? - logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) - } - } - } - - /* We set the cgroup, now the child can start creating children */ - if err := writeConmonPipeData(startFd); err != nil { - return err - } - return nil -} - // newPipe creates a unix socket pair for communication. // Returns two files - first is parent, second is child. func newPipe() (*os.File, *os.File, error) { @@ -1671,191 +1598,3 @@ func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, } } } - -// GetLimits converts spec resource limits to cgroup consumable limits -func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) { - if resource == nil { - resource = &spec.LinuxResources{} - } - final := &runcconfig.Resources{} - devs := []*devices.Rule{} - - // Devices - for _, entry := range resource.Devices { - if entry.Major == nil || entry.Minor == nil { - continue - } - runeType := 'a' - switch entry.Type { - case "b": - runeType = 'b' - case "c": - runeType = 'c' - } - - devs = append(devs, &devices.Rule{ - Type: devices.Type(runeType), - Major: *entry.Major, - Minor: *entry.Minor, - Permissions: devices.Permissions(entry.Access), - Allow: entry.Allow, - }) - } - final.Devices = devs - - // HugepageLimits - pageLimits := []*runcconfig.HugepageLimit{} - for _, entry := range resource.HugepageLimits { - pageLimits = append(pageLimits, &runcconfig.HugepageLimit{ - Pagesize: entry.Pagesize, - Limit: entry.Limit, - }) - } - final.HugetlbLimit = pageLimits - - // Networking - netPriorities := []*runcconfig.IfPrioMap{} - if resource.Network != nil { - for _, entry := range resource.Network.Priorities { - netPriorities = append(netPriorities, &runcconfig.IfPrioMap{ - Interface: entry.Name, - Priority: int64(entry.Priority), - }) - } - } - final.NetPrioIfpriomap = netPriorities - rdma := make(map[string]runcconfig.LinuxRdma) - for name, entry := range resource.Rdma { - rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects} - } - final.Rdma = rdma - - // Memory - if resource.Memory != nil { - if resource.Memory.Limit != nil { - final.Memory = *resource.Memory.Limit - } - if resource.Memory.Reservation != nil { - final.MemoryReservation = *resource.Memory.Reservation - } - if resource.Memory.Swap != nil { - final.MemorySwap = *resource.Memory.Swap - } - if resource.Memory.Swappiness != nil { - final.MemorySwappiness = resource.Memory.Swappiness - } - } - - // CPU - if resource.CPU != nil { - if resource.CPU.Period != nil { - final.CpuPeriod = *resource.CPU.Period - } - if resource.CPU.Quota != nil { - final.CpuQuota = *resource.CPU.Quota - } - if resource.CPU.RealtimePeriod != nil { - final.CpuRtPeriod = *resource.CPU.RealtimePeriod - } - if resource.CPU.RealtimeRuntime != nil { - final.CpuRtRuntime = *resource.CPU.RealtimeRuntime - } - if resource.CPU.Shares != nil { - final.CpuShares = *resource.CPU.Shares - } - final.CpusetCpus = resource.CPU.Cpus - final.CpusetMems = resource.CPU.Mems - } - - // BlkIO - if resource.BlockIO != nil { - if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleReadBpsDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle) - } - } - if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle) - } - } - if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle) - } - } - if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 { - for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice { - throttle := &runcconfig.ThrottleDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - throttle.BlockIODevice = *dev - throttle.Rate = entry.Rate - final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle) - } - } - if resource.BlockIO.LeafWeight != nil { - final.BlkioLeafWeight = *resource.BlockIO.LeafWeight - } - if resource.BlockIO.Weight != nil { - final.BlkioWeight = *resource.BlockIO.Weight - } - if len(resource.BlockIO.WeightDevice) > 0 { - for _, entry := range resource.BlockIO.WeightDevice { - weight := &runcconfig.WeightDevice{} - dev := &runcconfig.BlockIODevice{ - Major: entry.Major, - Minor: entry.Minor, - } - if entry.Weight != nil { - weight.Weight = *entry.Weight - } - if entry.LeafWeight != nil { - weight.LeafWeight = *entry.LeafWeight - } - weight.BlockIODevice = *dev - final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight) - } - } - } - - // Pids - if resource.Pids != nil { - final.PidsLimit = resource.Pids.Limit - } - - // Networking - if resource.Network != nil { - if resource.Network.ClassID != nil { - final.NetClsClassid = *resource.Network.ClassID - } - } - - // Unified state - final.Unified = resource.Unified - - return *final, nil -} diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index ce6eaf32a..0964d4ea3 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -3,11 +3,21 @@ package libpod import ( "fmt" "os" + "os/exec" + "path/filepath" "runtime" "strings" + runcconfig "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + + "github.com/containers/common/pkg/cgroups" + "github.com/containers/common/pkg/config" "github.com/containers/podman/v4/pkg/errorhandling" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/podman/v4/utils" pmount "github.com/containers/storage/pkg/mount" + spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" @@ -89,3 +99,260 @@ func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func } return err } + +// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup +// it then signals for conmon to start by sending nonce data down the start fd +func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { + mustCreateCgroup := true + + if ctr.config.NoCgroups { + mustCreateCgroup = false + } + + // If cgroup creation is disabled - just signal. + switch ctr.config.CgroupsMode { + case "disabled", "no-conmon", cgroupSplit: + mustCreateCgroup = false + } + + // $INVOCATION_ID is set by systemd when running as a service. + if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" { + mustCreateCgroup = false + } + + if mustCreateCgroup { + // Usually rootless users are not allowed to configure cgroupfs. + // There are cases though, where it is allowed, e.g. if the cgroup + // is manually configured and chowned). Avoid detecting all + // such cases and simply use a lower log level. + logLevel := logrus.WarnLevel + if rootless.IsRootless() { + logLevel = logrus.InfoLevel + } + // TODO: This should be a switch - we are not guaranteed that + // there are only 2 valid cgroup managers + cgroupParent := ctr.CgroupParent() + cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") + Resource := ctr.Spec().Linux.Resources + cgroupResources, err := GetLimits(Resource) + if err != nil { + logrus.StandardLogger().Log(logLevel, "Could not get ctr resources") + } + if ctr.CgroupManager() == config.SystemdCgroupsManager { + unitName := createUnitName("libpod-conmon", ctr.ID()) + realCgroupParent := cgroupParent + splitParent := strings.Split(cgroupParent, "/") + if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { + realCgroupParent = splitParent[len(splitParent)-1] + } + + logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) + if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err) + } + } else { + control, err := cgroups.New(cgroupPath, &cgroupResources) + if err != nil { + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } else if err := control.AddPid(cmd.Process.Pid); err != nil { + // we need to remove this defer and delete the cgroup once conmon exits + // maybe need a conmon monitor? + logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } + } + } + + /* We set the cgroup, now the child can start creating children */ + if err := writeConmonPipeData(startFd); err != nil { + return err + } + return nil +} + +// GetLimits converts spec resource limits to cgroup consumable limits +func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) { + if resource == nil { + resource = &spec.LinuxResources{} + } + final := &runcconfig.Resources{} + devs := []*devices.Rule{} + + // Devices + for _, entry := range resource.Devices { + if entry.Major == nil || entry.Minor == nil { + continue + } + runeType := 'a' + switch entry.Type { + case "b": + runeType = 'b' + case "c": + runeType = 'c' + } + + devs = append(devs, &devices.Rule{ + Type: devices.Type(runeType), + Major: *entry.Major, + Minor: *entry.Minor, + Permissions: devices.Permissions(entry.Access), + Allow: entry.Allow, + }) + } + final.Devices = devs + + // HugepageLimits + pageLimits := []*runcconfig.HugepageLimit{} + for _, entry := range resource.HugepageLimits { + pageLimits = append(pageLimits, &runcconfig.HugepageLimit{ + Pagesize: entry.Pagesize, + Limit: entry.Limit, + }) + } + final.HugetlbLimit = pageLimits + + // Networking + netPriorities := []*runcconfig.IfPrioMap{} + if resource.Network != nil { + for _, entry := range resource.Network.Priorities { + netPriorities = append(netPriorities, &runcconfig.IfPrioMap{ + Interface: entry.Name, + Priority: int64(entry.Priority), + }) + } + } + final.NetPrioIfpriomap = netPriorities + rdma := make(map[string]runcconfig.LinuxRdma) + for name, entry := range resource.Rdma { + rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects} + } + final.Rdma = rdma + + // Memory + if resource.Memory != nil { + if resource.Memory.Limit != nil { + final.Memory = *resource.Memory.Limit + } + if resource.Memory.Reservation != nil { + final.MemoryReservation = *resource.Memory.Reservation + } + if resource.Memory.Swap != nil { + final.MemorySwap = *resource.Memory.Swap + } + if resource.Memory.Swappiness != nil { + final.MemorySwappiness = resource.Memory.Swappiness + } + } + + // CPU + if resource.CPU != nil { + if resource.CPU.Period != nil { + final.CpuPeriod = *resource.CPU.Period + } + if resource.CPU.Quota != nil { + final.CpuQuota = *resource.CPU.Quota + } + if resource.CPU.RealtimePeriod != nil { + final.CpuRtPeriod = *resource.CPU.RealtimePeriod + } + if resource.CPU.RealtimeRuntime != nil { + final.CpuRtRuntime = *resource.CPU.RealtimeRuntime + } + if resource.CPU.Shares != nil { + final.CpuShares = *resource.CPU.Shares + } + final.CpusetCpus = resource.CPU.Cpus + final.CpusetMems = resource.CPU.Mems + } + + // BlkIO + if resource.BlockIO != nil { + if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle) + } + } + if resource.BlockIO.LeafWeight != nil { + final.BlkioLeafWeight = *resource.BlockIO.LeafWeight + } + if resource.BlockIO.Weight != nil { + final.BlkioWeight = *resource.BlockIO.Weight + } + if len(resource.BlockIO.WeightDevice) > 0 { + for _, entry := range resource.BlockIO.WeightDevice { + weight := &runcconfig.WeightDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + if entry.Weight != nil { + weight.Weight = *entry.Weight + } + if entry.LeafWeight != nil { + weight.LeafWeight = *entry.LeafWeight + } + weight.BlockIODevice = *dev + final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight) + } + } + } + + // Pids + if resource.Pids != nil { + final.PidsLimit = resource.Pids.Limit + } + + // Networking + if resource.Network != nil { + if resource.Network.ClassID != nil { + final.NetClsClassid = *resource.Network.ClassID + } + } + + // Unified state + final.Unified = resource.Unified + + return *final, nil +} -- cgit v1.2.3-54-g00ecf From cb4158889e7a115b4d8bb77c76cc99032d5e8363 Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 11:18:15 +0100 Subject: libpod: Move openUnixSocket to oci_conmon_attach_linux.go This function depends on linux-specific functionality in /proc/fd to allow connecting to local domain sockets with pathnames too long for sockaddr_un. [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/oci_conmon_attach_common.go | 9 --------- libpod/oci_conmon_attach_linux.go | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) create mode 100644 libpod/oci_conmon_attach_linux.go (limited to 'libpod') diff --git a/libpod/oci_conmon_attach_common.go b/libpod/oci_conmon_attach_common.go index aa55aa6f5..adc374503 100644 --- a/libpod/oci_conmon_attach_common.go +++ b/libpod/oci_conmon_attach_common.go @@ -29,15 +29,6 @@ const ( AttachPipeStderr = 3 ) -func openUnixSocket(path string) (*net.UnixConn, error) { - fd, err := unix.Open(path, unix.O_PATH, 0) - if err != nil { - return nil, err - } - defer unix.Close(fd) - return net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: fmt.Sprintf("/proc/self/fd/%d", fd), Net: "unixpacket"}) -} - // Attach to the given container. // Does not check if state is appropriate. // started is only required if startContainer is true. diff --git a/libpod/oci_conmon_attach_linux.go b/libpod/oci_conmon_attach_linux.go new file mode 100644 index 000000000..f1aa89d3e --- /dev/null +++ b/libpod/oci_conmon_attach_linux.go @@ -0,0 +1,17 @@ +package libpod + +import ( + "fmt" + "net" + + "golang.org/x/sys/unix" +) + +func openUnixSocket(path string) (*net.UnixConn, error) { + fd, err := unix.Open(path, unix.O_PATH, 0) + if err != nil { + return nil, err + } + defer unix.Close(fd) + return net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: fmt.Sprintf("/proc/self/fd/%d", fd), Net: "unixpacket"}) +} -- cgit v1.2.3-54-g00ecf From 054d64710736250c4d238e159884c1588eb7218a Mon Sep 17 00:00:00 2001 From: Doug Rabson Date: Wed, 17 Aug 2022 09:43:43 +0100 Subject: libpod: Build oci_conmon_common.go and oci_conmon_attach_common on FreeBSD This also adds FreeBSD equivalents to the functions moved to oci_conmon*_linux.go. For openUnixSocket, we create a temporary symlink to shorten the path to something that fits into sockaddr_un. [NO NEW TESTS NEEDED] Signed-off-by: Doug Rabson --- libpod/networking_unsupported.go | 7 +++++++ libpod/oci_conmon_attach_common.go | 4 ++-- libpod/oci_conmon_attach_freebsd.go | 21 +++++++++++++++++++++ libpod/oci_conmon_common.go | 4 ++-- libpod/oci_conmon_freebsd.go | 24 ++++++++++++++++++++++++ libpod/oci_conmon_unsupported.go | 4 ++-- 6 files changed, 58 insertions(+), 6 deletions(-) create mode 100644 libpod/oci_conmon_attach_freebsd.go create mode 100644 libpod/oci_conmon_freebsd.go (limited to 'libpod') diff --git a/libpod/networking_unsupported.go b/libpod/networking_unsupported.go index 227b512cd..76ffabb5e 100644 --- a/libpod/networking_unsupported.go +++ b/libpod/networking_unsupported.go @@ -77,3 +77,10 @@ func (r *RootlessNetNS) Cleanup(runtime *Runtime) error { func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { return nil, errors.New("not implemented (*Runtime) GetRootlessNetNs") } + +// convertPortMappings will remove the HostIP part from the ports when running inside podman machine. +// This is need because a HostIP of 127.0.0.1 would now allow the gvproxy forwarder to reach to open ports. +// For machine the HostIP must only be used by gvproxy and never in the VM. +func (c *Container) convertPortMappings() []types.PortMapping { + return []types.PortMapping{} +} diff --git a/libpod/oci_conmon_attach_common.go b/libpod/oci_conmon_attach_common.go index adc374503..a9e9b2bb5 100644 --- a/libpod/oci_conmon_attach_common.go +++ b/libpod/oci_conmon_attach_common.go @@ -1,5 +1,5 @@ -//go:build linux -// +build linux +//go:build linux || freebsd +// +build linux freebsd package libpod diff --git a/libpod/oci_conmon_attach_freebsd.go b/libpod/oci_conmon_attach_freebsd.go new file mode 100644 index 000000000..de0054381 --- /dev/null +++ b/libpod/oci_conmon_attach_freebsd.go @@ -0,0 +1,21 @@ +package libpod + +import ( + "net" + "os" + "path/filepath" +) + +func openUnixSocket(path string) (*net.UnixConn, error) { + // socket paths can be too long to fit into a sockaddr_un so we create a shorter symlink. + tmpdir, err := os.MkdirTemp("", "podman") + if err != nil { + return nil, err + } + defer os.RemoveAll(tmpdir) + tmpsockpath := filepath.Join(tmpdir, "sock") + if err := os.Symlink(path, tmpsockpath); err != nil { + return nil, err + } + return net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: tmpsockpath, Net: "unixpacket"}) +} diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 222fec9ca..c3725cdb4 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -1,5 +1,5 @@ -//go:build linux -// +build linux +//go:build linux || freebsd +// +build linux freebsd package libpod diff --git a/libpod/oci_conmon_freebsd.go b/libpod/oci_conmon_freebsd.go new file mode 100644 index 000000000..6f7ac7fc6 --- /dev/null +++ b/libpod/oci_conmon_freebsd.go @@ -0,0 +1,24 @@ +package libpod + +import ( + "errors" + "os" + "os/exec" +) + +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + return -1, errors.New("unsupported (*ConmonOCIRuntime) createRootlessContainer") +} + +// Run the closure with the container's socket label set +func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func() error) error { + // No label support yet + return closure() +} + +// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup +// it then signals for conmon to start by sending nonce data down the start fd +func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { + // No equivalent on FreeBSD + return nil +} diff --git a/libpod/oci_conmon_unsupported.go b/libpod/oci_conmon_unsupported.go index c72dc0f0d..cc6d68e89 100644 --- a/libpod/oci_conmon_unsupported.go +++ b/libpod/oci_conmon_unsupported.go @@ -1,5 +1,5 @@ -//go:build !linux -// +build !linux +//go:build !linux && !freebsd +// +build !linux,!freebsd package libpod -- cgit v1.2.3-54-g00ecf