//go:build linux || freebsd // +build linux freebsd package libpod import ( "bufio" "bytes" "context" "errors" "fmt" "io" "io/ioutil" "net" "net/http" "os" "os/exec" "path/filepath" "strconv" "strings" "sync" "syscall" "text/template" "time" "github.com/containers/common/pkg/config" "github.com/containers/common/pkg/resize" cutil "github.com/containers/common/pkg/util" conmonConfig "github.com/containers/conmon/runner/config" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/libpod/logs" "github.com/containers/podman/v4/pkg/checkpoint/crutils" "github.com/containers/podman/v4/pkg/errorhandling" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/specgenutil" "github.com/containers/podman/v4/pkg/util" "github.com/containers/podman/v4/utils" "github.com/containers/storage/pkg/homedir" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) const ( // This is Conmon's STDIO_BUF_SIZE. I don't believe we have access to it // directly from the Go code, so const it here // Important: The conmon attach socket uses an extra byte at the beginning of each // message to specify the STREAM so we have to increase the buffer size by one bufferSize = conmonConfig.BufSize + 1 ) // ConmonOCIRuntime is an OCI runtime managed by Conmon. // TODO: Make all calls to OCI runtime have a timeout. type ConmonOCIRuntime struct { name string path string conmonPath string conmonEnv []string tmpDir string exitsDir string logSizeMax int64 noPivot bool reservePorts bool runtimeFlags []string supportsJSON bool supportsKVM bool supportsNoCgroups bool enableKeyring bool } // Make a new Conmon-based OCI runtime with the given options. // Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or // any runtime with a runc-compatible CLI. // The first path that points to a valid executable will be used. // Deliberately private. Someone should not be able to construct this outside of // libpod. func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeFlags []string, runtimeCfg *config.Config) (OCIRuntime, error) { if name == "" { return nil, fmt.Errorf("the OCI runtime must be provided a non-empty name: %w", define.ErrInvalidArg) } // Make lookup tables for runtime support supportsJSON := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsJSON)) supportsNoCgroups := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsNoCgroups)) supportsKVM := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsKVM)) for _, r := range runtimeCfg.Engine.RuntimeSupportsJSON { supportsJSON[r] = true } for _, r := range runtimeCfg.Engine.RuntimeSupportsNoCgroups { supportsNoCgroups[r] = true } for _, r := range runtimeCfg.Engine.RuntimeSupportsKVM { supportsKVM[r] = true } runtime := new(ConmonOCIRuntime) runtime.name = name runtime.conmonPath = conmonPath runtime.runtimeFlags = runtimeFlags runtime.conmonEnv = runtimeCfg.Engine.ConmonEnvVars runtime.tmpDir = runtimeCfg.Engine.TmpDir runtime.logSizeMax = runtimeCfg.Containers.LogSizeMax runtime.noPivot = runtimeCfg.Engine.NoPivotRoot runtime.reservePorts = runtimeCfg.Engine.EnablePortReservation runtime.enableKeyring = runtimeCfg.Containers.EnableKeyring // TODO: probe OCI runtime for feature and enable automatically if // available. base := filepath.Base(name) runtime.supportsJSON = supportsJSON[base] runtime.supportsNoCgroups = supportsNoCgroups[base] runtime.supportsKVM = supportsKVM[base] foundPath := false for _, path := range paths { stat, err := os.Stat(path) if err != nil { if os.IsNotExist(err) { continue } return nil, fmt.Errorf("cannot stat OCI runtime %s path: %w", name, err) } if !stat.Mode().IsRegular() { continue } foundPath = true logrus.Tracef("found runtime %q", path) runtime.path = path break } // Search the $PATH as last fallback if !foundPath { if foundRuntime, err := exec.LookPath(name); err == nil { foundPath = true runtime.path = foundRuntime logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime) } } if !foundPath { return nil, fmt.Errorf("no valid executable found for OCI runtime %s: %w", name, define.ErrInvalidArg) } runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits") // Create the exit files and attach sockets directories if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil { // The directory is allowed to exist if !os.IsExist(err) { return nil, fmt.Errorf("error creating OCI runtime exit files directory: %w", err) } } return runtime, nil } // Name returns the name of the runtime being wrapped by Conmon. func (r *ConmonOCIRuntime) Name() string { return r.name } // Path returns the path of the OCI runtime being wrapped by Conmon. func (r *ConmonOCIRuntime) Path() string { return r.path } // hasCurrentUserMapped checks whether the current user is mapped inside the container user namespace func hasCurrentUserMapped(ctr *Container) bool { if len(ctr.config.IDMappings.UIDMap) == 0 && len(ctr.config.IDMappings.GIDMap) == 0 { return true } uid := os.Geteuid() for _, m := range ctr.config.IDMappings.UIDMap { if uid >= m.HostID && uid < m.HostID+m.Size { return true } } return false } // CreateContainer creates a container. func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { // always make the run dir accessible to the current user so that the PID files can be read without // being in the rootless user namespace. if err := makeAccessible(ctr.state.RunDir, 0, 0); err != nil { return 0, err } if !hasCurrentUserMapped(ctr) { for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.Engine.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.Engine.VolumePath} { if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { return 0, err } } // if we are running a non privileged container, be sure to umount some kernel paths so they are not // bind mounted inside the container at all. if !ctr.config.Privileged && !rootless.IsRootless() { return r.createRootlessContainer(ctr, restoreOptions) } } return r.createOCIContainer(ctr, restoreOptions) } // UpdateContainerStatus retrieves the current status of the container from the // runtime. It updates the container's state but does not save it. // If useRuntime is false, we will not directly hit runc to see the container's // status, but will instead only check for the existence of the conmon exit file // and update state to stopped if it exists. func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { runtimeDir, err := util.GetRuntimeDir() if err != nil { return err } // Store old state so we know if we were already stopped oldState := ctr.state.State state := new(spec.State) cmd := exec.Command(r.path, "state", ctr.ID()) cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) outPipe, err := cmd.StdoutPipe() if err != nil { return fmt.Errorf("getting stdout pipe: %w", err) } errPipe, err := cmd.StderrPipe() if err != nil { return fmt.Errorf("getting stderr pipe: %w", err) } if err := cmd.Start(); err != nil { out, err2 := ioutil.ReadAll(errPipe) if err2 != nil { return fmt.Errorf("error getting container %s state: %w", ctr.ID(), err) } if strings.Contains(string(out), "does not exist") || strings.Contains(string(out), "No such file") { if err := ctr.removeConmonFiles(); err != nil { logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) } ctr.state.ExitCode = -1 ctr.state.FinishedTime = time.Now() ctr.state.State = define.ContainerStateExited return ctr.runtime.state.AddContainerExitCode(ctr.ID(), ctr.state.ExitCode) } return fmt.Errorf("error getting container %s state. stderr/out: %s: %w", ctr.ID(), out, err) } defer func() { _ = cmd.Wait() }() if err := errPipe.Close(); err != nil { return err } out, err := ioutil.ReadAll(outPipe) if err != nil { return fmt.Errorf("error reading stdout: %s: %w", ctr.ID(), err) } if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil { return fmt.Errorf("error decoding container status for container %s: %w", ctr.ID(), err) } ctr.state.PID = state.Pid switch state.Status { case "created": ctr.state.State = define.ContainerStateCreated case "paused": ctr.state.State = define.ContainerStatePaused case "running": ctr.state.State = define.ContainerStateRunning case "stopped": ctr.state.State = define.ContainerStateStopped default: return fmt.Errorf("unrecognized status returned by runtime for container %s: %s: %w", ctr.ID(), state.Status, define.ErrInternal) } // Handle ContainerStateStopping - keep it unless the container // transitioned to no longer running. if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) { ctr.state.State = define.ContainerStateStopping } return nil } // StartContainer starts the given container. // Sets time the container was started, but does not save it. func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers? runtimeDir, err := util.GetRuntimeDir() if err != nil { return err } env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} if path, ok := os.LookupEnv("PATH"); ok { env = append(env, fmt.Sprintf("PATH=%s", path)) } if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "start", ctr.ID())...); err != nil { return err } ctr.state.StartedTime = time.Now() return nil } // KillContainer sends the given signal to the given container. // If all is set, send to all PIDs in the container. // All is only supported if the container created cgroups. func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error { logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) runtimeDir, err := util.GetRuntimeDir() if err != nil { return err } env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} var args []string args = append(args, r.runtimeFlags...) if all { args = append(args, "kill", "--all", ctr.ID(), fmt.Sprintf("%d", signal)) } else { args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal)) } if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { // Update container state - there's a chance we failed because // the container exited in the meantime. if err2 := r.UpdateContainerStatus(ctr); err2 != nil { logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2) } if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { return define.ErrCtrStateInvalid } return fmt.Errorf("error sending signal to container %s: %w", ctr.ID(), err) } return nil } // StopContainer stops a container, first using its given stop signal (or // SIGTERM if no signal was specified), then using SIGKILL. // Timeout is given in seconds. If timeout is 0, the container will be // immediately kill with SIGKILL. // Does not set finished time for container, assumes you will run updateStatus // after to pull the exit code. func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error { logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) // Ping the container to see if it's alive // If it's not, it's already stopped, return err := unix.Kill(ctr.state.PID, 0) if err == unix.ESRCH { return nil } stopSignal := ctr.config.StopSignal if stopSignal == 0 { stopSignal = uint(syscall.SIGTERM) } if timeout > 0 { if err := r.KillContainer(ctr, stopSignal, all); err != nil { // Is the container gone? // If so, it probably died between the first check and // our sending the signal // The container is stopped, so exit cleanly err := unix.Kill(ctr.state.PID, 0) if err == unix.ESRCH { return nil } return err } if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { logrus.Debugf("Timed out stopping container %s with %s, resorting to SIGKILL: %v", ctr.ID(), unix.SignalName(syscall.Signal(stopSignal)), err) logrus.Warnf("StopSignal %s failed to stop container %s in %d seconds, resorting to SIGKILL", unix.SignalName(syscall.Signal(stopSignal)), ctr.Name(), timeout) } else { // No error, the container is dead return nil } } if err := r.KillContainer(ctr, 9, all); err != nil { // Again, check if the container is gone. If it is, exit cleanly. err := unix.Kill(ctr.state.PID, 0) if err == unix.ESRCH { return nil } return fmt.Errorf("error sending SIGKILL to container %s: %w", ctr.ID(), err) } // Give runtime a few seconds to make it happen if err := waitContainerStop(ctr, killContainerTimeout); err != nil { return err } return nil } // DeleteContainer deletes a container from the OCI runtime. func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error { runtimeDir, err := util.GetRuntimeDir() if err != nil { return err } env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "delete", "--force", ctr.ID())...) } // PauseContainer pauses the given container. func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error { runtimeDir, err := util.GetRuntimeDir() if err != nil { return err } env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "pause", ctr.ID())...) } // UnpauseContainer unpauses the given container. func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error { runtimeDir, err := util.GetRuntimeDir() if err != nil { return err } env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "resume", ctr.ID())...) } // HTTPAttach performs an attach for the HTTP API. // The caller must handle closing the HTTP connection after this returns. // The cancel channel is not closed; it is up to the caller to do so after // this function returns. // If this is a container with a terminal, we will stream raw. If it is not, we // will stream with an 8-byte header to multiplex STDOUT and STDERR. // Returns any errors that occurred, and whether the connection was successfully // hijacked before that error occurred. func (r *ConmonOCIRuntime) HTTPAttach(ctr *Container, req *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, detachKeys *string, cancel <-chan bool, hijackDone chan<- bool, streamAttach, streamLogs bool) (deferredErr error) { isTerminal := false if ctr.config.Spec.Process != nil { isTerminal = ctr.config.Spec.Process.Terminal } if streams != nil { if !streams.Stdin && !streams.Stdout && !streams.Stderr { return fmt.Errorf("must specify at least one stream to attach to: %w", define.ErrInvalidArg) } } attachSock, err := r.AttachSocketPath(ctr) if err != nil { return err } var conn *net.UnixConn if streamAttach { newConn, err := openUnixSocket(attachSock) if err != nil { return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err) } conn = newConn defer func() { if err := conn.Close(); err != nil { logrus.Errorf("Unable to close container %s attach socket: %q", ctr.ID(), err) } }() logrus.Debugf("Successfully connected to container %s attach socket %s", ctr.ID(), attachSock) } detachString := ctr.runtime.config.Engine.DetachKeys if detachKeys != nil { detachString = *detachKeys } detach, err := processDetachKeys(detachString) if err != nil { return err } attachStdout := true attachStderr := true attachStdin := true if streams != nil { attachStdout = streams.Stdout attachStderr = streams.Stderr attachStdin = streams.Stdin } logrus.Debugf("Going to hijack container %s attach connection", ctr.ID()) // Alright, let's hijack. hijacker, ok := w.(http.Hijacker) if !ok { return fmt.Errorf("unable to hijack connection") } httpCon, httpBuf, err := hijacker.Hijack() if err != nil { return fmt.Errorf("error hijacking connection: %w", err) } hijackDone <- true writeHijackHeader(req, httpBuf) // Force a flush after the header is written. if err := httpBuf.Flush(); err != nil { return fmt.Errorf("error flushing HTTP hijack header: %w", err) } defer func() { hijackWriteErrorAndClose(deferredErr, ctr.ID(), isTerminal, httpCon, httpBuf) }() logrus.Debugf("Hijack for container %s attach session done, ready to stream", ctr.ID()) // TODO: This is gross. Really, really gross. // I want to say we should read all the logs into an array before // calling this, in container_api.go, but that could take a lot of // memory... // On the whole, we need to figure out a better way of doing this, // though. logSize := 0 if streamLogs { logrus.Debugf("Will stream logs for container %s attach session", ctr.ID()) // Get all logs for the container logChan := make(chan *logs.LogLine) logOpts := new(logs.LogOptions) logOpts.Tail = -1 logOpts.WaitGroup = new(sync.WaitGroup) errChan := make(chan error) go func() { var err error // In non-terminal mode we need to prepend with the // stream header. logrus.Debugf("Writing logs for container %s to HTTP attach", ctr.ID()) for logLine := range logChan { if !isTerminal { device := logLine.Device var header []byte headerLen := uint32(len(logLine.Msg)) logSize += len(logLine.Msg) switch strings.ToLower(device) { case "stdin": header = makeHTTPAttachHeader(0, headerLen) case "stdout": header = makeHTTPAttachHeader(1, headerLen) case "stderr": header = makeHTTPAttachHeader(2, headerLen) default: logrus.Errorf("Unknown device for log line: %s", device) header = makeHTTPAttachHeader(1, headerLen) } _, err = httpBuf.Write(header) if err != nil { break } } _, err = httpBuf.Write([]byte(logLine.Msg)) if err != nil { break } if !logLine.Partial() { _, err = httpBuf.Write([]byte("\n")) if err != nil { break } } err = httpBuf.Flush() if err != nil { break } } errChan <- err }() if err := ctr.ReadLog(context.Background(), logOpts, logChan, 0); err != nil { return err } go func() { logOpts.WaitGroup.Wait() close(logChan) }() logrus.Debugf("Done reading logs for container %s, %d bytes", ctr.ID(), logSize) if err := <-errChan; err != nil { return err } } if !streamAttach { logrus.Debugf("Done streaming logs for container %s attach, exiting as attach streaming not requested", ctr.ID()) return nil } logrus.Debugf("Forwarding attach output for container %s", ctr.ID()) stdoutChan := make(chan error) stdinChan := make(chan error) // Handle STDOUT/STDERR go func() { var err error if isTerminal { // Hack: return immediately if attachStdout not set to // emulate Docker. // Basically, when terminal is set, STDERR goes nowhere. // Everything does over STDOUT. // Therefore, if not attaching STDOUT - we'll never copy // anything from here. logrus.Debugf("Performing terminal HTTP attach for container %s", ctr.ID()) if attachStdout { err = httpAttachTerminalCopy(conn, httpBuf, ctr.ID()) } } else { logrus.Debugf("Performing non-terminal HTTP attach for container %s", ctr.ID()) err = httpAttachNonTerminalCopy(conn, httpBuf, ctr.ID(), attachStdin, attachStdout, attachStderr) } stdoutChan <- err logrus.Debugf("STDOUT/ERR copy completed") }() // Next, STDIN. Avoid entirely if attachStdin unset. if attachStdin { go func() { _, err := cutil.CopyDetachable(conn, httpBuf, detach) logrus.Debugf("STDIN copy completed") stdinChan <- err }() } for { select { case err := <-stdoutChan: if err != nil { return err } return nil case err := <-stdinChan: if err != nil { return err } // copy stdin is done, close it if connErr := conn.CloseWrite(); connErr != nil { logrus.Errorf("Unable to close conn: %v", connErr) } case <-cancel: return nil } } } // isRetryable returns whether the error was caused by a blocked syscall or the // specified operation on a non blocking file descriptor wasn't ready for completion. func isRetryable(err error) bool { var errno syscall.Errno if errors.As(err, &errno) { return errno == syscall.EINTR || errno == syscall.EAGAIN } return false } // openControlFile opens the terminal control file. func openControlFile(ctr *Container, parentDir string) (*os.File, error) { controlPath := filepath.Join(parentDir, "ctl") for i := 0; i < 600; i++ { controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY|unix.O_NONBLOCK, 0) if err == nil { return controlFile, nil } if !isRetryable(err) { return nil, fmt.Errorf("could not open ctl file for terminal resize for container %s: %w", ctr.ID(), err) } time.Sleep(time.Second / 10) } return nil, fmt.Errorf("timeout waiting for %q", controlPath) } // AttachResize resizes the terminal used by the given container. func (r *ConmonOCIRuntime) AttachResize(ctr *Container, newSize resize.TerminalSize) error { controlFile, err := openControlFile(ctr, ctr.bundlePath()) if err != nil { return err } defer controlFile.Close() logrus.Debugf("Received a resize event for container %s: %+v", ctr.ID(), newSize) if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil { return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err) } return nil } // CheckpointContainer checkpoints the given container. func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error) { // imagePath is used by CRIU to store the actual checkpoint files imagePath := ctr.CheckpointPath() if options.PreCheckPoint { imagePath = ctr.PreCheckPointPath() } // workPath will be used to store dump.log and stats-dump workPath := ctr.bundlePath() logrus.Debugf("Writing checkpoint to %s", imagePath) logrus.Debugf("Writing checkpoint logs to %s", workPath) logrus.Debugf("Pre-dump the container %t", options.PreCheckPoint) args := []string{} args = append(args, r.runtimeFlags...) args = append(args, "checkpoint") args = append(args, "--image-path") args = append(args, imagePath) args = append(args, "--work-path") args = append(args, workPath) if options.KeepRunning { args = append(args, "--leave-running") } if options.TCPEstablished { args = append(args, "--tcp-established") } if options.FileLocks { args = append(args, "--file-locks") } if !options.PreCheckPoint && options.KeepRunning { args = append(args, "--leave-running") } if options.PreCheckPoint { args = append(args, "--pre-dump") } if !options.PreCheckPoint && options.WithPrevious { args = append( args, "--parent-path", filepath.Join("..", preCheckpointDir), ) } args = append(args, ctr.ID()) logrus.Debugf("the args to checkpoint: %s %s", r.path, strings.Join(args, " ")) runtimeDir, err := util.GetRuntimeDir() if err != nil { return 0, err } env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} if path, ok := os.LookupEnv("PATH"); ok { env = append(env, fmt.Sprintf("PATH=%s", path)) } var runtimeCheckpointStarted time.Time err = r.withContainerSocketLabel(ctr, func() error { runtimeCheckpointStarted = time.Now() return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...) }) runtimeCheckpointDuration := func() int64 { if options.PrintStats { return time.Since(runtimeCheckpointStarted).Microseconds() } return 0 }() return runtimeCheckpointDuration, err } func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) { if ctr.state.ConmonPID == 0 { // If the container is running or paused, assume Conmon is // running. We didn't record Conmon PID on some old versions, so // that is likely what's going on... // Unusual enough that we should print a warning message though. if ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) { logrus.Warnf("Conmon PID is not set, but container is running!") return true, nil } // Container's not running, so conmon PID being unset is // expected. Conmon is not running. return false, nil } // We have a conmon PID. Ping it with signal 0. if err := unix.Kill(ctr.state.ConmonPID, 0); err != nil { if err == unix.ESRCH { return false, nil } return false, fmt.Errorf("error pinging container %s conmon with signal 0: %w", ctr.ID(), err) } return true, nil } // SupportsCheckpoint checks if the OCI runtime supports checkpointing // containers. func (r *ConmonOCIRuntime) SupportsCheckpoint() bool { return crutils.CRRuntimeSupportsCheckpointRestore(r.path) } // SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error // messages. func (r *ConmonOCIRuntime) SupportsJSONErrors() bool { return r.supportsJSON } // SupportsNoCgroups checks if the OCI runtime supports running containers // without cgroups (the --cgroup-manager=disabled flag). func (r *ConmonOCIRuntime) SupportsNoCgroups() bool { return r.supportsNoCgroups } // SupportsKVM checks if the OCI runtime supports running containers // without KVM separation func (r *ConmonOCIRuntime) SupportsKVM() bool { return r.supportsKVM } // AttachSocketPath is the path to a single container's attach socket. func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) { if ctr == nil { return "", fmt.Errorf("must provide a valid container to get attach socket path: %w", define.ErrInvalidArg) } return filepath.Join(ctr.bundlePath(), "attach"), nil } // ExitFilePath is the path to a container's exit file. func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) { if ctr == nil { return "", fmt.Errorf("must provide a valid container to get exit file path: %w", define.ErrInvalidArg) } return filepath.Join(r.exitsDir, ctr.ID()), nil } // RuntimeInfo provides information on the runtime. func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) { runtimePackage := packageVersion(r.path) conmonPackage := packageVersion(r.conmonPath) runtimeVersion, err := r.getOCIRuntimeVersion() if err != nil { return nil, nil, fmt.Errorf("error getting version of OCI runtime %s: %w", r.name, err) } conmonVersion, err := r.getConmonVersion() if err != nil { return nil, nil, fmt.Errorf("error getting conmon version: %w", err) } conmon := define.ConmonInfo{ Package: conmonPackage, Path: r.conmonPath, Version: conmonVersion, } ocirt := define.OCIRuntimeInfo{ Name: r.name, Path: r.path, Package: runtimePackage, Version: runtimeVersion, } return &conmon, &ocirt, nil } // makeAccessible changes the path permission and each parent directory to have --x--x--x func makeAccessible(path string, uid, gid int) error { for ; path != "/"; path = filepath.Dir(path) { st, err := os.Stat(path) if err != nil { if os.IsNotExist(err) { return nil } return err } if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { continue } if st.Mode()&0111 != 0111 { if err := os.Chmod(path, st.Mode()|0111); err != nil { return err } } } return nil } // Wait for a container which has been sent a signal to stop func waitContainerStop(ctr *Container, timeout time.Duration) error { return waitPidStop(ctr.state.PID, timeout) } // Wait for a given PID to stop func waitPidStop(pid int, timeout time.Duration) error { done := make(chan struct{}) chControl := make(chan struct{}) go func() { for { select { case <-chControl: return default: if err := unix.Kill(pid, 0); err != nil { if err == unix.ESRCH { close(done) return } logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err) } time.Sleep(100 * time.Millisecond) } } }() select { case <-done: return nil case <-time.After(timeout): close(chControl) return fmt.Errorf("given PIDs did not die within timeout") } } func (r *ConmonOCIRuntime) getLogTag(ctr *Container) (string, error) { logTag := ctr.LogTag() if logTag == "" { return "", nil } data, err := ctr.inspectLocked(false) if err != nil { // FIXME: this error should probably be returned return "", nil //nolint: nilerr } tmpl, err := template.New("container").Parse(logTag) if err != nil { return "", fmt.Errorf("template parsing error %s: %w", logTag, err) } var b bytes.Buffer err = tmpl.Execute(&b, data) if err != nil { return "", err } return b.String(), nil } // createOCIContainer generates this container's main conmon instance and prepares it for starting func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { var stderrBuf bytes.Buffer runtimeDir, err := util.GetRuntimeDir() if err != nil { return 0, err } parentSyncPipe, childSyncPipe, err := newPipe() if err != nil { return 0, fmt.Errorf("error creating socket pair: %w", err) } defer errorhandling.CloseQuiet(parentSyncPipe) childStartPipe, parentStartPipe, err := newPipe() if err != nil { return 0, fmt.Errorf("error creating socket pair for start pipe: %w", err) } defer errorhandling.CloseQuiet(parentStartPipe) var ociLog string if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { ociLog = filepath.Join(ctr.state.RunDir, "oci-log") } logTag, err := r.getLogTag(ctr) if err != nil { return 0, err } if ctr.config.CgroupsMode == cgroupSplit { if err := utils.MoveUnderCgroupSubtree("runtime"); err != nil { return 0, err } } pidfile := ctr.config.PidFile if pidfile == "" { pidfile = filepath.Join(ctr.state.RunDir, "pidfile") } args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.config.SdNotifySocket != "" { args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.config.SdNotifySocket)) } if ctr.config.Spec.Process.Terminal { args = append(args, "-t") } else if ctr.config.Stdin { args = append(args, "-i") } if ctr.config.Timeout > 0 { args = append(args, fmt.Sprintf("--timeout=%d", ctr.config.Timeout)) } if !r.enableKeyring { args = append(args, "--no-new-keyring") } if ctr.config.ConmonPidFile != "" { args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile) } if r.noPivot { args = append(args, "--no-pivot") } exitCommand, err := specgenutil.CreateExitCommandArgs(ctr.runtime.storageConfig, ctr.runtime.config, logrus.IsLevelEnabled(logrus.DebugLevel), ctr.AutoRemove(), false) if err != nil { return 0, err } exitCommand = append(exitCommand, ctr.config.ID) args = append(args, "--exit-command", exitCommand[0]) for _, arg := range exitCommand[1:] { args = append(args, []string{"--exit-command-arg", arg}...) } // Pass down the LISTEN_* environment (see #10443). preserveFDs := ctr.config.PreserveFDs if val := os.Getenv("LISTEN_FDS"); val != "" { if ctr.config.PreserveFDs > 0 { logrus.Warnf("Ignoring LISTEN_FDS to preserve custom user-specified FDs") } else { fds, err := strconv.Atoi(val) if err != nil { return 0, fmt.Errorf("converting LISTEN_FDS=%s: %w", val, err) } preserveFDs = uint(fds) } } if preserveFDs > 0 { args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", preserveFDs))...) } if restoreOptions != nil { args = append(args, "--restore", ctr.CheckpointPath()) if restoreOptions.TCPEstablished { args = append(args, "--runtime-opt", "--tcp-established") } if restoreOptions.FileLocks { args = append(args, "--runtime-opt", "--file-locks") } if restoreOptions.Pod != "" { mountLabel := ctr.config.MountLabel processLabel := ctr.config.ProcessLabel if mountLabel != "" { args = append( args, "--runtime-opt", fmt.Sprintf( "--lsm-mount-context=%s", mountLabel, ), ) } if processLabel != "" { args = append( args, "--runtime-opt", fmt.Sprintf( "--lsm-profile=selinux:%s", processLabel, ), ) } } } logrus.WithFields(logrus.Fields{ "args": args, }).Debugf("running conmon: %s", r.conmonPath) cmd := exec.Command(r.conmonPath, args...) cmd.SysProcAttr = &syscall.SysProcAttr{ Setpgid: true, } // TODO this is probably a really bad idea for some uses // Make this configurable cmd.Stdin = os.Stdin cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if ctr.config.Spec.Process.Terminal { cmd.Stderr = &stderrBuf } // 0, 1 and 2 are stdin, stdout and stderr conmonEnv := r.configureConmonEnv(runtimeDir) var filesToClose []*os.File if preserveFDs > 0 { for fd := 3; fd < int(3+preserveFDs); fd++ { f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)) filesToClose = append(filesToClose, f) cmd.ExtraFiles = append(cmd.ExtraFiles, f) } } cmd.Env = r.conmonEnv // we don't want to step on users fds they asked to preserve // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4)) cmd.Env = append(cmd.Env, conmonEnv...) cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe) if r.reservePorts && !rootless.IsRootless() && !ctr.config.NetMode.IsSlirp4netns() { ports, err := bindPorts(ctr.convertPortMappings()) if err != nil { return 0, err } filesToClose = append(filesToClose, ports...) // Leak the port we bound in the conmon process. These fd's won't be used // by the container and conmon will keep the ports busy so that another // process cannot use them. cmd.ExtraFiles = append(cmd.ExtraFiles, ports...) } if ctr.config.NetMode.IsSlirp4netns() || rootless.IsRootless() { if ctr.config.PostConfigureNetNS { havePortMapping := len(ctr.config.PortMappings) > 0 if havePortMapping { ctr.rootlessPortSyncR, ctr.rootlessPortSyncW, err = os.Pipe() if err != nil { return 0, fmt.Errorf("failed to create rootless port sync pipe: %w", err) } } ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe() if err != nil { return 0, fmt.Errorf("failed to create rootless network sync pipe: %w", err) } } else { if ctr.rootlessSlirpSyncR != nil { defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR) } if ctr.rootlessSlirpSyncW != nil { defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW) } } // Leak one end in conmon, the other one will be leaked into slirp4netns cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW) if ctr.rootlessPortSyncW != nil { defer errorhandling.CloseQuiet(ctr.rootlessPortSyncW) // Leak one end in conmon, the other one will be leaked into rootlessport cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessPortSyncW) } } var runtimeRestoreStarted time.Time if restoreOptions != nil { runtimeRestoreStarted = time.Now() } err = startCommand(cmd, ctr) // regardless of whether we errored or not, we no longer need the children pipes childSyncPipe.Close() childStartPipe.Close() if err != nil { return 0, err } if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe); err != nil { return 0, err } /* Wait for initial setup and fork, and reap child */ err = cmd.Wait() if err != nil { return 0, err } pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog) if err != nil { if err2 := r.DeleteContainer(ctr); err2 != nil { logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID()) } return 0, err } ctr.state.PID = pid conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile) if err != nil { logrus.Warnf("Error reading conmon pid file for container %s: %v", ctr.ID(), err) } else if conmonPID > 0 { // conmon not having a pid file is a valid state, so don't set it if we don't have it logrus.Infof("Got Conmon PID as %d", conmonPID) ctr.state.ConmonPID = conmonPID } runtimeRestoreDuration := func() int64 { if restoreOptions != nil && restoreOptions.PrintStats { return time.Since(runtimeRestoreStarted).Microseconds() } return 0 }() // These fds were passed down to the runtime. Close them // and not interfere for _, f := range filesToClose { errorhandling.CloseQuiet(f) } return runtimeRestoreDuration, nil } // configureConmonEnv gets the environment values to add to conmon's exec struct // TODO this may want to be less hardcoded/more configurable in the future func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) []string { var env []string for _, e := range os.Environ() { if strings.HasPrefix(e, "LC_") { env = append(env, e) } } conf, ok := os.LookupEnv("CONTAINERS_CONF") if ok { env = append(env, fmt.Sprintf("CONTAINERS_CONF=%s", conf)) } env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED"))) env = append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID"))) home := homedir.Get() if home != "" { env = append(env, fmt.Sprintf("HOME=%s", home)) } return env } // sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath, logDriver, logTag string) []string { // set the conmon API version to be able to use the correct sync struct keys args := []string{ "--api-version", "1", "-c", ctr.ID(), "-u", cuuid, "-r", r.path, "-b", bundlePath, "-p", pidPath, "-n", ctr.Name(), "--exit-dir", exitDir, "--full-attach", } if len(r.runtimeFlags) > 0 { rFlags := []string{} for _, arg := range r.runtimeFlags { rFlags = append(rFlags, "--runtime-arg", arg) } args = append(args, rFlags...) } if ctr.CgroupManager() == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit { args = append(args, "-s") } var logDriverArg string switch logDriver { case define.JournaldLogging: logDriverArg = define.JournaldLogging case define.NoLogging: logDriverArg = define.NoLogging case define.PassthroughLogging: logDriverArg = define.PassthroughLogging //lint:ignore ST1015 the default case has to be here default: //nolint:stylecheck,gocritic // No case here should happen except JSONLogging, but keep this here in case the options are extended logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver()) fallthrough case "": // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough fallthrough case define.JSONLogging: fallthrough case define.KubernetesLogging: logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath) } args = append(args, "-l", logDriverArg) logLevel := logrus.GetLevel() args = append(args, "--log-level", logLevel.String()) if logLevel == logrus.DebugLevel { logrus.Debugf("%s messages will be logged to syslog", r.conmonPath) args = append(args, "--syslog") } size := r.logSizeMax if ctr.config.LogSize > 0 { size = ctr.config.LogSize } if size > 0 { args = append(args, "--log-size-max", fmt.Sprintf("%v", size)) } if ociLogPath != "" { args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath)) } if logTag != "" { args = append(args, "--log-tag", logTag) } if ctr.config.NoCgroups { logrus.Debugf("Running with no Cgroups") args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled") } return args } func startCommand(cmd *exec.Cmd, ctr *Container) error { // Make sure to unset the NOTIFY_SOCKET and reset it afterwards if needed. switch ctr.config.SdNotifyMode { case define.SdNotifyModeContainer, define.SdNotifyModeIgnore: if prev := os.Getenv("NOTIFY_SOCKET"); prev != "" { if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil { logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err) } defer func() { if err := os.Setenv("NOTIFY_SOCKET", prev); err != nil { logrus.Errorf("Resetting NOTIFY_SOCKET=%s", prev) } }() } } return cmd.Start() } // newPipe creates a unix socket pair for communication. // Returns two files - first is parent, second is child. func newPipe() (*os.File, *os.File, error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, err } return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil } // readConmonPidFile attempts to read conmon's pid from its pid file func readConmonPidFile(pidFile string) (int, error) { // Let's try reading the Conmon pid at the same time. if pidFile != "" { contents, err := ioutil.ReadFile(pidFile) if err != nil { return -1, err } // Convert it to an int conmonPID, err := strconv.Atoi(string(contents)) if err != nil { return -1, err } return conmonPID, nil } return 0, nil } // readConmonPipeData attempts to read a syncInfo struct from the pipe func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) { // syncInfo is used to return data from monitor process to daemon type syncInfo struct { Data int `json:"data"` Message string `json:"message,omitempty"` } // Wait to get container pid from conmon type syncStruct struct { si *syncInfo err error } ch := make(chan syncStruct) go func() { var si *syncInfo rdr := bufio.NewReader(pipe) b, err := rdr.ReadBytes('\n') // ignore EOF here, error is returned even when data was read // if it is no valid json unmarshal will fail below if err != nil && !errors.Is(err, io.EOF) { ch <- syncStruct{err: err} } if err := json.Unmarshal(b, &si); err != nil { ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)} return } ch <- syncStruct{si: si} }() data := -1 //nolint: wastedassign select { case ss := <-ch: if ss.err != nil { if ociLog != "" { ociLogData, err := ioutil.ReadFile(ociLog) if err == nil { var ociErr ociError if err := json.Unmarshal(ociLogData, &ociErr); err == nil { return -1, getOCIRuntimeError(runtimeName, ociErr.Msg) } } } return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err) } logrus.Debugf("Received: %d", ss.si.Data) if ss.si.Data < 0 { if ociLog != "" { ociLogData, err := ioutil.ReadFile(ociLog) if err == nil { var ociErr ociError if err := json.Unmarshal(ociLogData, &ociErr); err == nil { return ss.si.Data, getOCIRuntimeError(runtimeName, ociErr.Msg) } } } // If we failed to parse the JSON errors, then print the output as it is if ss.si.Message != "" { return ss.si.Data, getOCIRuntimeError(runtimeName, ss.si.Message) } return ss.si.Data, fmt.Errorf("container create failed: %w", define.ErrInternal) } data = ss.si.Data case <-time.After(define.ContainerCreateTimeout): return -1, fmt.Errorf("container creation timeout: %w", define.ErrInternal) } return data, nil } // writeConmonPipeData writes nonce data to a pipe func writeConmonPipeData(pipe *os.File) error { someData := []byte{0} _, err := pipe.Write(someData) return err } // formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon func formatRuntimeOpts(opts ...string) []string { args := make([]string, 0, len(opts)*2) for _, o := range opts { args = append(args, "--runtime-opt", o) } return args } // getConmonVersion returns a string representation of the conmon version. func (r *ConmonOCIRuntime) getConmonVersion() (string, error) { output, err := utils.ExecCmd(r.conmonPath, "--version") if err != nil { return "", err } return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil } // getOCIRuntimeVersion returns a string representation of the OCI runtime's // version. func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) { output, err := utils.ExecCmd(r.path, "--version") if err != nil { return "", err } return strings.TrimSuffix(output, "\n"), nil } // Copy data from container to HTTP connection, for terminal attach. // Container is the container's attach socket connection, http is a buffer for // the HTTP connection. cid is the ID of the container the attach session is // running for (used solely for error messages). func httpAttachTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string) error { buf := make([]byte, bufferSize) for { numR, err := container.Read(buf) logrus.Debugf("Read fd(%d) %d/%d bytes for container %s", int(buf[0]), numR, len(buf), cid) if numR > 0 { switch buf[0] { case AttachPipeStdout: // Do nothing default: logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) continue } numW, err2 := http.Write(buf[1:numR]) if err2 != nil { if err != nil { logrus.Errorf("Reading container %s STDOUT: %v", cid, err) } return err2 } else if numW+1 != numR { return io.ErrShortWrite } // We need to force the buffer to write immediately, so // there isn't a delay on the terminal side. if err2 := http.Flush(); err2 != nil { if err != nil { logrus.Errorf("Reading container %s STDOUT: %v", cid, err) } return err2 } } if err != nil { if err == io.EOF { return nil } return err } } } // Copy data from a container to an HTTP connection, for non-terminal attach. // Appends a header to multiplex input. func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string, stdin, stdout, stderr bool) error { buf := make([]byte, bufferSize) for { numR, err := container.Read(buf) if numR > 0 { var headerBuf []byte // Subtract 1 because we strip the first byte (used for // multiplexing by Conmon). headerLen := uint32(numR - 1) // Practically speaking, we could make this buf[0] - 1, // but we need to validate it anyway. switch buf[0] { case AttachPipeStdin: headerBuf = makeHTTPAttachHeader(0, headerLen) if !stdin { continue } case AttachPipeStdout: if !stdout { continue } headerBuf = makeHTTPAttachHeader(1, headerLen) case AttachPipeStderr: if !stderr { continue } headerBuf = makeHTTPAttachHeader(2, headerLen) default: logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) continue } numH, err2 := http.Write(headerBuf) if err2 != nil { if err != nil { logrus.Errorf("Reading container %s standard streams: %v", cid, err) } return err2 } // Hardcoding header length is pretty gross, but // fast. Should be safe, as this is a fixed part // of the protocol. if numH != 8 { if err != nil { logrus.Errorf("Reading container %s standard streams: %v", cid, err) } return io.ErrShortWrite } numW, err2 := http.Write(buf[1:numR]) if err2 != nil { if err != nil { logrus.Errorf("Reading container %s standard streams: %v", cid, err) } return err2 } else if numW+1 != numR { if err != nil { logrus.Errorf("Reading container %s standard streams: %v", cid, err) } return io.ErrShortWrite } // We need to force the buffer to write immediately, so // there isn't a delay on the terminal side. if err2 := http.Flush(); err2 != nil { if err != nil { logrus.Errorf("Reading container %s STDOUT: %v", cid, err) } return err2 } } if err != nil { if err == io.EOF { return nil } return err } } }