diff options
Diffstat (limited to 'libpod/oci_conmon_linux.go')
-rw-r--r-- | libpod/oci_conmon_linux.go | 1421 |
1 files changed, 1421 insertions, 0 deletions
diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go new file mode 100644 index 000000000..f29758a69 --- /dev/null +++ b/libpod/oci_conmon_linux.go @@ -0,0 +1,1421 @@ +// +build linux + +package libpod + +import ( + "bufio" + "bytes" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + "syscall" + "time" + + "github.com/containers/libpod/libpod/define" + "github.com/containers/libpod/pkg/cgroups" + "github.com/containers/libpod/pkg/errorhandling" + "github.com/containers/libpod/pkg/lookup" + "github.com/containers/libpod/pkg/rootless" + "github.com/containers/libpod/pkg/util" + "github.com/containers/libpod/utils" + pmount "github.com/containers/storage/pkg/mount" + "github.com/coreos/go-systemd/activation" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// ConmonOCIRuntime is an OCI runtime managed by Conmon. +// TODO: Make all calls to OCI runtime have a timeout. +type ConmonOCIRuntime struct { + name string + path string + conmonPath string + conmonEnv []string + cgroupManager string + tmpDir string + exitsDir string + socketsDir string + logSizeMax int64 + noPivot bool + reservePorts bool + supportsJSON bool + supportsNoCgroups bool + sdNotify bool +} + +// Make a new Conmon-based OCI runtime with the given options. +// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or +// any runtime with a runc-compatible CLI. +// The first path that points to a valid executable will be used. +// Deliberately private. Someone should not be able to construct this outside of +// libpod. +func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeCfg *RuntimeConfig, supportsJSON, supportsNoCgroups bool) (OCIRuntime, error) { + if name == "" { + return nil, errors.Wrapf(define.ErrInvalidArg, "the OCI runtime must be provided a non-empty name") + } + + runtime := new(ConmonOCIRuntime) + runtime.name = name + runtime.conmonPath = conmonPath + + runtime.conmonEnv = runtimeCfg.ConmonEnvVars + runtime.cgroupManager = runtimeCfg.CgroupManager + runtime.tmpDir = runtimeCfg.TmpDir + runtime.logSizeMax = runtimeCfg.MaxLogSize + runtime.noPivot = runtimeCfg.NoPivotRoot + runtime.reservePorts = runtimeCfg.EnablePortReservation + runtime.sdNotify = runtimeCfg.SDNotify + + // TODO: probe OCI runtime for feature and enable automatically if + // available. + runtime.supportsJSON = supportsJSON + runtime.supportsNoCgroups = supportsNoCgroups + + foundPath := false + for _, path := range paths { + stat, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + continue + } + return nil, errors.Wrapf(err, "cannot stat %s", path) + } + if !stat.Mode().IsRegular() { + continue + } + foundPath = true + runtime.path = path + logrus.Debugf("using runtime %q", path) + break + } + + // Search the $PATH as last fallback + if !foundPath { + if foundRuntime, err := exec.LookPath(name); err == nil { + foundPath = true + runtime.path = foundRuntime + logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime) + } + } + + if !foundPath { + return nil, errors.Wrapf(define.ErrInvalidArg, "no valid executable found for OCI runtime %s", name) + } + + runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits") + runtime.socketsDir = filepath.Join(runtime.tmpDir, "socket") + + if runtime.cgroupManager != CgroupfsCgroupsManager && runtime.cgroupManager != SystemdCgroupsManager { + return nil, errors.Wrapf(define.ErrInvalidArg, "invalid cgroup manager specified: %s", runtime.cgroupManager) + } + + // Create the exit files and attach sockets directories + if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil { + // The directory is allowed to exist + if !os.IsExist(err) { + return nil, errors.Wrapf(err, "error creating OCI runtime exit files directory %s", + runtime.exitsDir) + } + } + if err := os.MkdirAll(runtime.socketsDir, 0750); err != nil { + // The directory is allowed to exist + if !os.IsExist(err) { + return nil, errors.Wrapf(err, "error creating OCI runtime attach sockets directory %s", + runtime.socketsDir) + } + } + + return runtime, nil +} + +// Name returns the name of the runtime being wrapped by Conmon. +func (r *ConmonOCIRuntime) Name() string { + return r.name +} + +// Path returns the path of the OCI runtime being wrapped by Conmon. +func (r *ConmonOCIRuntime) Path() string { + return r.path +} + +// CreateContainer creates a container. +func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) { + if len(ctr.config.IDMappings.UIDMap) != 0 || len(ctr.config.IDMappings.GIDMap) != 0 { + for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.VolumePath} { + if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { + return err + } + } + + // if we are running a non privileged container, be sure to umount some kernel paths so they are not + // bind mounted inside the container at all. + if !ctr.config.Privileged && !rootless.IsRootless() { + ch := make(chan error) + go func() { + runtime.LockOSThread() + err := func() error { + fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) + if err != nil { + return err + } + defer errorhandling.CloseQuiet(fd) + + // create a new mountns on the current thread + if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { + return err + } + defer func() { + if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { + logrus.Errorf("unable to clone new namespace: %q", err) + } + }() + + // don't spread our mounts around. We are setting only /sys to be slave + // so that the cleanup process is still able to umount the storage and the + // changes are propagated to the host. + err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") + if err != nil { + return errors.Wrapf(err, "cannot make /sys slave") + } + + mounts, err := pmount.GetMounts() + if err != nil { + return err + } + for _, m := range mounts { + if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { + continue + } + err = unix.Unmount(m.Mountpoint, 0) + if err != nil && !os.IsNotExist(err) { + return errors.Wrapf(err, "cannot unmount %s", m.Mountpoint) + } + } + return r.createOCIContainer(ctr, restoreOptions) + }() + ch <- err + }() + err := <-ch + return err + } + } + return r.createOCIContainer(ctr, restoreOptions) +} + +// UpdateContainerStatus retrieves the current status of the container from the +// runtime. It updates the container's state but does not save it. +// If useRuntime is false, we will not directly hit runc to see the container's +// status, but will instead only check for the existence of the conmon exit file +// and update state to stopped if it exists. +func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container, useRuntime bool) error { + exitFile, err := ctr.exitFilePath() + if err != nil { + return err + } + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + + // If not using the OCI runtime, we don't need to do most of this. + if !useRuntime { + // If the container's not running, nothing to do. + if ctr.state.State != define.ContainerStateRunning && ctr.state.State != define.ContainerStatePaused { + return nil + } + + // Check for the exit file conmon makes + info, err := os.Stat(exitFile) + if err != nil { + if os.IsNotExist(err) { + // Container is still running, no error + return nil + } + + return errors.Wrapf(err, "error running stat on container %s exit file", ctr.ID()) + } + + // Alright, it exists. Transition to Stopped state. + ctr.state.State = define.ContainerStateStopped + ctr.state.PID = 0 + ctr.state.ConmonPID = 0 + + // Read the exit file to get our stopped time and exit code. + return ctr.handleExitFile(exitFile, info) + } + + // Store old state so we know if we were already stopped + oldState := ctr.state.State + + state := new(spec.State) + + cmd := exec.Command(r.path, "state", ctr.ID()) + cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) + + outPipe, err := cmd.StdoutPipe() + if err != nil { + return errors.Wrapf(err, "getting stdout pipe") + } + errPipe, err := cmd.StderrPipe() + if err != nil { + return errors.Wrapf(err, "getting stderr pipe") + } + + if err := cmd.Start(); err != nil { + out, err2 := ioutil.ReadAll(errPipe) + if err2 != nil { + return errors.Wrapf(err, "error getting container %s state", ctr.ID()) + } + if strings.Contains(string(out), "does not exist") { + if err := ctr.removeConmonFiles(); err != nil { + logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) + } + ctr.state.ExitCode = -1 + ctr.state.FinishedTime = time.Now() + ctr.state.State = define.ContainerStateExited + return nil + } + return errors.Wrapf(err, "error getting container %s state. stderr/out: %s", ctr.ID(), out) + } + defer func() { + _ = cmd.Wait() + }() + + if err := errPipe.Close(); err != nil { + return err + } + out, err := ioutil.ReadAll(outPipe) + if err != nil { + return errors.Wrapf(err, "error reading stdout: %s", ctr.ID()) + } + if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil { + return errors.Wrapf(err, "error decoding container status for container %s", ctr.ID()) + } + ctr.state.PID = state.Pid + + switch state.Status { + case "created": + ctr.state.State = define.ContainerStateCreated + case "paused": + ctr.state.State = define.ContainerStatePaused + case "running": + ctr.state.State = define.ContainerStateRunning + case "stopped": + ctr.state.State = define.ContainerStateStopped + default: + return errors.Wrapf(define.ErrInternal, "unrecognized status returned by runtime for container %s: %s", + ctr.ID(), state.Status) + } + + // Only grab exit status if we were not already stopped + // If we were, it should already be in the database + if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { + var fi os.FileInfo + chWait := make(chan error) + defer close(chWait) + + _, err := WaitForFile(exitFile, chWait, time.Second*5) + if err == nil { + fi, err = os.Stat(exitFile) + } + if err != nil { + ctr.state.ExitCode = -1 + ctr.state.FinishedTime = time.Now() + logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err) + return nil + } + + return ctr.handleExitFile(exitFile, fi) + } + + return nil +} + +// StartContainer starts the given container. +// Sets time the container was started, but does not save it. +func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { + // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers? + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok { + env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify)) + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "start", ctr.ID()); err != nil { + return err + } + + ctr.state.StartedTime = time.Now() + + return nil +} + +// KillContainer sends the given signal to the given container. +// If all is set, send to all PIDs in the container. +// All is only supported if the container created cgroups. +func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error { + logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + var args []string + if all { + args = []string{"kill", "--all", ctr.ID(), fmt.Sprintf("%d", signal)} + } else { + args = []string{"kill", ctr.ID(), fmt.Sprintf("%d", signal)} + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { + return errors.Wrapf(err, "error sending signal to container %s", ctr.ID()) + } + + return nil +} + +// StopContainer stops a container, first using its given stop signal (or +// SIGTERM if no signal was specified), then using SIGKILL. +// Timeout is given in seconds. If timeout is 0, the container will be +// immediately kill with SIGKILL. +// Does not set finished time for container, assumes you will run updateStatus +// after to pull the exit code. +func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error { + logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) + + // Ping the container to see if it's alive + // If it's not, it's already stopped, return + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + stopSignal := ctr.config.StopSignal + if stopSignal == 0 { + stopSignal = uint(syscall.SIGTERM) + } + + if timeout > 0 { + if err := r.KillContainer(ctr, stopSignal, all); err != nil { + // Is the container gone? + // If so, it probably died between the first check and + // our sending the signal + // The container is stopped, so exit cleanly + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + return err + } + + if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { + logrus.Warnf("Timed out stopping container %s, resorting to SIGKILL", ctr.ID()) + } else { + // No error, the container is dead + return nil + } + } + + if err := r.KillContainer(ctr, 9, all); err != nil { + // Again, check if the container is gone. If it is, exit cleanly. + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + return errors.Wrapf(err, "error sending SIGKILL to container %s", ctr.ID()) + } + + // Give runtime a few seconds to make it happen + if err := waitContainerStop(ctr, killContainerTimeout); err != nil { + return err + } + + return nil +} + +// DeleteContainer deletes a container from the OCI runtime. +func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "delete", "--force", ctr.ID()) +} + +// PauseContainer pauses the given container. +func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "pause", ctr.ID()) +} + +// UnpauseContainer unpauses the given container. +func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "resume", ctr.ID()) +} + +// ExecContainer executes a command in a running container +// TODO: Split into Create/Start/Attach/Wait +func (r *ConmonOCIRuntime) ExecContainer(c *Container, sessionID string, options *ExecOptions) (int, chan error, error) { + if options == nil { + return -1, nil, errors.Wrapf(define.ErrInvalidArg, "must provide an ExecOptions struct to ExecContainer") + } + if len(options.Cmd) == 0 { + return -1, nil, errors.Wrapf(define.ErrInvalidArg, "must provide a command to execute") + } + + if sessionID == "" { + return -1, nil, errors.Wrapf(define.ErrEmptyID, "must provide a session ID for exec") + } + + // create sync pipe to receive the pid + parentSyncPipe, childSyncPipe, err := newPipe() + if err != nil { + return -1, nil, errors.Wrapf(err, "error creating socket pair") + } + + defer errorhandling.CloseQuiet(parentSyncPipe) + + // create start pipe to set the cgroup before running + // attachToExec is responsible for closing parentStartPipe + childStartPipe, parentStartPipe, err := newPipe() + if err != nil { + return -1, nil, errors.Wrapf(err, "error creating socket pair") + } + + // We want to make sure we close the parent{Start,Attach}Pipes if we fail + // but also don't want to close them after attach to exec is called + attachToExecCalled := false + + defer func() { + if !attachToExecCalled { + errorhandling.CloseQuiet(parentStartPipe) + } + }() + + // create the attach pipe to allow attach socket to be created before + // $RUNTIME exec starts running. This is to make sure we can capture all output + // from the process through that socket, rather than half reading the log, half attaching to the socket + // attachToExec is responsible for closing parentAttachPipe + parentAttachPipe, childAttachPipe, err := newPipe() + if err != nil { + return -1, nil, errors.Wrapf(err, "error creating socket pair") + } + + defer func() { + if !attachToExecCalled { + errorhandling.CloseQuiet(parentAttachPipe) + } + }() + + childrenClosed := false + defer func() { + if !childrenClosed { + errorhandling.CloseQuiet(childSyncPipe) + errorhandling.CloseQuiet(childAttachPipe) + errorhandling.CloseQuiet(childStartPipe) + } + }() + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return -1, nil, err + } + + finalEnv := make([]string, 0, len(options.Env)) + for k, v := range options.Env { + finalEnv = append(finalEnv, fmt.Sprintf("%s=%s", k, v)) + } + + processFile, err := prepareProcessExec(c, options.Cmd, finalEnv, options.Terminal, options.Cwd, options.User, sessionID) + if err != nil { + return -1, nil, err + } + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = c.execOCILog(sessionID) + } + args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog) + + if options.PreserveFDs > 0 { + args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", options.PreserveFDs))...) + } + + for _, capability := range options.CapAdd { + args = append(args, formatRuntimeOpts("--cap", capability)...) + } + + if options.Terminal { + args = append(args, "-t") + } + + // Append container ID and command + args = append(args, "-e") + // TODO make this optional when we can detach + args = append(args, "--exec-attach") + args = append(args, "--exec-process-spec", processFile.Name()) + + logrus.WithFields(logrus.Fields{ + "args": args, + }).Debugf("running conmon: %s", r.conmonPath) + execCmd := exec.Command(r.conmonPath, args...) + + if options.Streams != nil { + if options.Streams.AttachInput { + execCmd.Stdin = options.Streams.InputStream + } + if options.Streams.AttachOutput { + execCmd.Stdout = options.Streams.OutputStream + } + if options.Streams.AttachError { + execCmd.Stderr = options.Streams.ErrorStream + } + } + + conmonEnv, extraFiles, err := r.configureConmonEnv(runtimeDir) + if err != nil { + return -1, nil, err + } + + if options.PreserveFDs > 0 { + for fd := 3; fd < int(3+options.PreserveFDs); fd++ { + execCmd.ExtraFiles = append(execCmd.ExtraFiles, os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd))) + } + } + + // we don't want to step on users fds they asked to preserve + // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 + execCmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", options.PreserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", options.PreserveFDs+4), fmt.Sprintf("_OCI_ATTACHPIPE=%d", options.PreserveFDs+5)) + execCmd.Env = append(execCmd.Env, conmonEnv...) + + execCmd.ExtraFiles = append(execCmd.ExtraFiles, childSyncPipe, childStartPipe, childAttachPipe) + execCmd.ExtraFiles = append(execCmd.ExtraFiles, extraFiles...) + execCmd.Dir = c.execBundlePath(sessionID) + execCmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + + err = startCommandGivenSelinux(execCmd) + + // We don't need children pipes on the parent side + errorhandling.CloseQuiet(childSyncPipe) + errorhandling.CloseQuiet(childAttachPipe) + errorhandling.CloseQuiet(childStartPipe) + childrenClosed = true + + if err != nil { + return -1, nil, errors.Wrapf(err, "cannot start container %s", c.ID()) + } + if err := r.moveConmonToCgroupAndSignal(c, execCmd, parentStartPipe, sessionID); err != nil { + return -1, nil, err + } + + if options.PreserveFDs > 0 { + for fd := 3; fd < int(3+options.PreserveFDs); fd++ { + // These fds were passed down to the runtime. Close them + // and not interfere + if err := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)).Close(); err != nil { + logrus.Debugf("unable to close file fd-%d", fd) + } + } + } + + // TODO Only create if !detach + // Attach to the container before starting it + attachChan := make(chan error) + go func() { + // attachToExec is responsible for closing pipes + attachChan <- c.attachToExec(options.Streams, options.DetachKeys, options.Resize, sessionID, parentStartPipe, parentAttachPipe) + close(attachChan) + }() + attachToExecCalled = true + + pid, err := readConmonPipeData(parentSyncPipe, ociLog) + + return pid, attachChan, err +} + +// ExecStopContainer stops a given exec session in a running container. +func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, timeout uint) error { + session, ok := ctr.state.ExecSessions[sessionID] + if !ok { + // TODO This should probably be a separate error + return errors.Wrapf(define.ErrInvalidArg, "no exec session with ID %s found in container %s", sessionID, ctr.ID()) + } + + logrus.Debugf("Going to stop container %s exec session %s", ctr.ID(), sessionID) + + // Is the session dead? + // Ping the PID with signal 0 to see if it still exists. + if err := unix.Kill(session.PID, 0); err != nil { + if err == unix.ESRCH { + return nil + } + return errors.Wrapf(err, "error pinging container %s exec session %s PID %d with signal 0", ctr.ID(), sessionID, session.PID) + } + + if timeout > 0 { + // Use SIGTERM by default, then SIGSTOP after timeout. + logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGTERM", sessionID, session.PID, ctr.ID()) + if err := unix.Kill(session.PID, unix.SIGTERM); err != nil { + if err == unix.ESRCH { + return nil + } + return errors.Wrapf(err, "error killing container %s exec session %s PID %d with SIGTERM", ctr.ID(), sessionID, session.PID) + } + + // Wait for the PID to stop + if err := waitPidStop(session.PID, time.Duration(timeout)*time.Second); err != nil { + logrus.Warnf("Timed out waiting for container %s exec session %s to stop, resorting to SIGKILL", ctr.ID(), sessionID) + } else { + // No error, container is dead + return nil + } + } + + // SIGTERM did not work. On to SIGKILL. + logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, session.PID, ctr.ID()) + if err := unix.Kill(session.PID, unix.SIGTERM); err != nil { + if err == unix.ESRCH { + return nil + } + return errors.Wrapf(err, "error killing container %s exec session %s PID %d with SIGKILL", ctr.ID(), sessionID, session.PID) + } + + // Wait for the PID to stop + if err := waitPidStop(session.PID, killContainerTimeout*time.Second); err != nil { + return errors.Wrapf(err, "timed out waiting for container %s exec session %s PID %d to stop after SIGKILL", ctr.ID(), sessionID, session.PID) + } + + return nil +} + +// ExecCleanupContainer cleans up files created when a command is run via +// ExecContainer. This includes the attach socket for the exec session. +func (r *ConmonOCIRuntime) ExecContainerCleanup(ctr *Container, sessionID string) error { + // Clean up the sockets dir. Issue #3962 + // Also ignore if it doesn't exist for some reason; hence the conditional return below + if err := os.RemoveAll(filepath.Join(r.socketsDir, sessionID)); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + +// CheckpointContainer checkpoints the given container. +func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error { + if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { + return err + } + // imagePath is used by CRIU to store the actual checkpoint files + imagePath := ctr.CheckpointPath() + // workPath will be used to store dump.log and stats-dump + workPath := ctr.bundlePath() + logrus.Debugf("Writing checkpoint to %s", imagePath) + logrus.Debugf("Writing checkpoint logs to %s", workPath) + args := []string{} + args = append(args, "checkpoint") + args = append(args, "--image-path") + args = append(args, imagePath) + args = append(args, "--work-path") + args = append(args, workPath) + if options.KeepRunning { + args = append(args, "--leave-running") + } + if options.TCPEstablished { + args = append(args, "--tcp-established") + } + args = append(args, ctr.ID()) + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...) +} + +// SupportsCheckpoint checks if the OCI runtime supports checkpointing +// containers. +func (r *ConmonOCIRuntime) SupportsCheckpoint() bool { + // Check if the runtime implements checkpointing. Currently only + // runc's checkpoint/restore implementation is supported. + cmd := exec.Command(r.path, "checkpoint", "-h") + if err := cmd.Start(); err != nil { + return false + } + if err := cmd.Wait(); err == nil { + return true + } + return false +} + +// SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error +// messages. +func (r *ConmonOCIRuntime) SupportsJSONErrors() bool { + return r.supportsJSON +} + +// SupportsNoCgroups checks if the OCI runtime supports running containers +// without cgroups (the --cgroup-manager=disabled flag). +func (r *ConmonOCIRuntime) SupportsNoCgroups() bool { + return r.supportsNoCgroups +} + +// AttachSocketPath is the path to a single container's attach socket. +func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) { + if ctr == nil { + return "", errors.Wrapf(define.ErrInvalidArg, "must provide a valid container to get attach socket path") + } + + return filepath.Join(r.socketsDir, ctr.ID(), "attach"), nil +} + +// ExecAttachSocketPath is the path to a container's exec session attach socket. +func (r *ConmonOCIRuntime) ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) { + // We don't even use container, so don't validity check it + if sessionID == "" { + return "", errors.Wrapf(define.ErrInvalidArg, "must provide a valid session ID to get attach socket path") + } + + return filepath.Join(r.socketsDir, sessionID, "attach"), nil +} + +// ExitFilePath is the path to a container's exit file. +func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) { + if ctr == nil { + return "", errors.Wrapf(define.ErrInvalidArg, "must provide a valid container to get exit file path") + } + return filepath.Join(r.exitsDir, ctr.ID()), nil +} + +// RuntimeInfo provides information on the runtime. +func (r *ConmonOCIRuntime) RuntimeInfo() (map[string]interface{}, error) { + runtimePackage := packageVersion(r.path) + conmonPackage := packageVersion(r.conmonPath) + runtimeVersion, err := r.getOCIRuntimeVersion() + if err != nil { + return nil, errors.Wrapf(err, "error getting version of OCI runtime %s", r.name) + } + conmonVersion, err := r.getConmonVersion() + if err != nil { + return nil, errors.Wrapf(err, "error getting conmon version") + } + + info := make(map[string]interface{}) + info["Conmon"] = map[string]interface{}{ + "path": r.conmonPath, + "package": conmonPackage, + "version": conmonVersion, + } + info["OCIRuntime"] = map[string]interface{}{ + "path": r.path, + "package": runtimePackage, + "version": runtimeVersion, + } + + return info, nil +} + +// makeAccessible changes the path permission and each parent directory to have --x--x--x +func makeAccessible(path string, uid, gid int) error { + for ; path != "/"; path = filepath.Dir(path) { + st, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { + continue + } + if st.Mode()&0111 != 0111 { + if err := os.Chmod(path, st.Mode()|0111); err != nil { + return err + } + } + } + return nil +} + +// Wait for a container which has been sent a signal to stop +func waitContainerStop(ctr *Container, timeout time.Duration) error { + return waitPidStop(ctr.state.PID, timeout) +} + +// Wait for a given PID to stop +func waitPidStop(pid int, timeout time.Duration) error { + done := make(chan struct{}) + chControl := make(chan struct{}) + go func() { + for { + select { + case <-chControl: + return + default: + if err := unix.Kill(pid, 0); err != nil { + if err == unix.ESRCH { + close(done) + return + } + logrus.Errorf("Error pinging PID %d with signal 0: %v", pid, err) + } + time.Sleep(100 * time.Millisecond) + } + } + }() + select { + case <-done: + return nil + case <-time.After(timeout): + close(chControl) + return errors.Errorf("given PIDs did not die within timeout") + } +} + +// createOCIContainer generates this container's main conmon instance and prepares it for starting +func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) { + var stderrBuf bytes.Buffer + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + + parentSyncPipe, childSyncPipe, err := newPipe() + if err != nil { + return errors.Wrapf(err, "error creating socket pair") + } + defer errorhandling.CloseQuiet(parentSyncPipe) + + childStartPipe, parentStartPipe, err := newPipe() + if err != nil { + return errors.Wrapf(err, "error creating socket pair for start pipe") + } + + defer errorhandling.CloseQuiet(parentStartPipe) + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = filepath.Join(ctr.state.RunDir, "oci-log") + } + args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog) + + if ctr.config.Spec.Process.Terminal { + args = append(args, "-t") + } else if ctr.config.Stdin { + args = append(args, "-i") + } + + if ctr.config.ConmonPidFile != "" { + args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile) + } + + if r.noPivot { + args = append(args, "--no-pivot") + } + + if len(ctr.config.ExitCommand) > 0 { + args = append(args, "--exit-command", ctr.config.ExitCommand[0]) + for _, arg := range ctr.config.ExitCommand[1:] { + args = append(args, []string{"--exit-command-arg", arg}...) + } + } + + if restoreOptions != nil { + args = append(args, "--restore", ctr.CheckpointPath()) + if restoreOptions.TCPEstablished { + args = append(args, "--runtime-opt", "--tcp-established") + } + } + + logrus.WithFields(logrus.Fields{ + "args": args, + }).Debugf("running conmon: %s", r.conmonPath) + + cmd := exec.Command(r.conmonPath, args...) + cmd.Dir = ctr.bundlePath() + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + // TODO this is probably a really bad idea for some uses + // Make this configurable + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if ctr.config.Spec.Process.Terminal { + cmd.Stderr = &stderrBuf + } + + // 0, 1 and 2 are stdin, stdout and stderr + conmonEnv, envFiles, err := r.configureConmonEnv(runtimeDir) + if err != nil { + return err + } + + cmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", 3), fmt.Sprintf("_OCI_STARTPIPE=%d", 4)) + cmd.Env = append(cmd.Env, conmonEnv...) + cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe) + cmd.ExtraFiles = append(cmd.ExtraFiles, envFiles...) + + if r.reservePorts && !ctr.config.NetMode.IsSlirp4netns() { + ports, err := bindPorts(ctr.config.PortMappings) + if err != nil { + return err + } + + // Leak the port we bound in the conmon process. These fd's won't be used + // by the container and conmon will keep the ports busy so that another + // process cannot use them. + cmd.ExtraFiles = append(cmd.ExtraFiles, ports...) + } + + if ctr.config.NetMode.IsSlirp4netns() { + if ctr.config.PostConfigureNetNS { + ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe() + if err != nil { + return errors.Wrapf(err, "failed to create rootless network sync pipe") + } + } else { + if ctr.rootlessSlirpSyncR != nil { + defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR) + } + if ctr.rootlessSlirpSyncW != nil { + defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW) + } + } + // Leak one end in conmon, the other one will be leaked into slirp4netns + cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW) + } + + err = startCommandGivenSelinux(cmd) + // regardless of whether we errored or not, we no longer need the children pipes + childSyncPipe.Close() + childStartPipe.Close() + if err != nil { + return err + } + if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe, ctr.ID()); err != nil { + return err + } + /* Wait for initial setup and fork, and reap child */ + err = cmd.Wait() + if err != nil { + return err + } + + pid, err := readConmonPipeData(parentSyncPipe, ociLog) + if err != nil { + if err2 := r.DeleteContainer(ctr); err2 != nil { + logrus.Errorf("Error removing container %s from runtime after creation failed", ctr.ID()) + } + return err + } + ctr.state.PID = pid + + conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile) + if err != nil { + logrus.Warnf("error reading conmon pid file for container %s: %s", ctr.ID(), err.Error()) + } else if conmonPID > 0 { + // conmon not having a pid file is a valid state, so don't set it if we don't have it + logrus.Infof("Got Conmon PID as %d", conmonPID) + ctr.state.ConmonPID = conmonPID + } + + return nil +} + +// prepareProcessExec returns the path of the process.json used in runc exec -p +// caller is responsible to close the returned *os.File if needed. +func prepareProcessExec(c *Container, cmd, env []string, tty bool, cwd, user, sessionID string) (*os.File, error) { + f, err := ioutil.TempFile(c.execBundlePath(sessionID), "exec-process-") + if err != nil { + return nil, err + } + + pspec := c.config.Spec.Process + pspec.Args = cmd + // We need to default this to false else it will inherit terminal as true + // from the container. + pspec.Terminal = false + if tty { + pspec.Terminal = true + } + if len(env) > 0 { + pspec.Env = append(pspec.Env, env...) + } + + if cwd != "" { + pspec.Cwd = cwd + + } + + overrides := c.getUserOverrides() + execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, user, overrides) + if err != nil { + return nil, err + } + + // If user was set, look it up in the container to get a UID to use on + // the host + if user != "" { + sgids := make([]uint32, 0, len(execUser.Sgids)) + for _, sgid := range execUser.Sgids { + sgids = append(sgids, uint32(sgid)) + } + processUser := spec.User{ + UID: uint32(execUser.Uid), + GID: uint32(execUser.Gid), + AdditionalGids: sgids, + } + + pspec.User = processUser + } + + hasHomeSet := false + for _, s := range pspec.Env { + if strings.HasPrefix(s, "HOME=") { + hasHomeSet = true + break + } + } + if !hasHomeSet { + pspec.Env = append(pspec.Env, fmt.Sprintf("HOME=%s", execUser.Home)) + } + + processJSON, err := json.Marshal(pspec) + if err != nil { + return nil, err + } + + if err := ioutil.WriteFile(f.Name(), processJSON, 0644); err != nil { + return nil, err + } + return f, nil +} + +// configureConmonEnv gets the environment values to add to conmon's exec struct +// TODO this may want to be less hardcoded/more configurable in the future +func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) ([]string, []*os.File, error) { + env := make([]string, 0, 6) + env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) + env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED"))) + env = append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID"))) + home, err := homeDir() + if err != nil { + return nil, nil, err + } + env = append(env, fmt.Sprintf("HOME=%s", home)) + + extraFiles := make([]*os.File, 0) + if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok { + env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify)) + } + if !r.sdNotify { + if listenfds, ok := os.LookupEnv("LISTEN_FDS"); ok { + env = append(env, fmt.Sprintf("LISTEN_FDS=%s", listenfds), "LISTEN_PID=1") + fds := activation.Files(false) + extraFiles = append(extraFiles, fds...) + } + } else { + logrus.Debug("disabling SD notify") + } + return env, extraFiles, nil +} + +// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI +func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath string) []string { + // set the conmon API version to be able to use the correct sync struct keys + args := []string{"--api-version", "1"} + if r.cgroupManager == SystemdCgroupsManager && !ctr.config.NoCgroups { + args = append(args, "-s") + } + args = append(args, "-c", ctr.ID()) + args = append(args, "-u", cuuid) + args = append(args, "-r", r.path) + args = append(args, "-b", bundlePath) + args = append(args, "-p", pidPath) + + var logDriver string + switch ctr.LogDriver() { + case JournaldLogging: + logDriver = JournaldLogging + case JSONLogging: + fallthrough + default: //nolint-stylecheck + // No case here should happen except JSONLogging, but keep this here in case the options are extended + logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver()) + fallthrough + case "": + // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod + // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough + fallthrough + case KubernetesLogging: + logDriver = fmt.Sprintf("%s:%s", KubernetesLogging, logPath) + } + + args = append(args, "-l", logDriver) + args = append(args, "--exit-dir", exitDir) + args = append(args, "--socket-dir-path", r.socketsDir) + if r.logSizeMax >= 0 { + args = append(args, "--log-size-max", fmt.Sprintf("%v", r.logSizeMax)) + } + + logLevel := logrus.GetLevel() + args = append(args, "--log-level", logLevel.String()) + + if logLevel == logrus.DebugLevel { + logrus.Debugf("%s messages will be logged to syslog", r.conmonPath) + args = append(args, "--syslog") + } + if ociLogPath != "" { + args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath)) + } + if ctr.config.NoCgroups { + logrus.Debugf("Running with no CGroups") + args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled") + } + return args +} + +// startCommandGivenSelinux starts a container ensuring to set the labels of +// the process to make sure SELinux doesn't block conmon communication, if SELinux is enabled +func startCommandGivenSelinux(cmd *exec.Cmd) error { + if !selinux.GetEnabled() { + return cmd.Start() + } + // Set the label of the conmon process to be level :s0 + // This will allow the container processes to talk to fifo-files + // passed into the container by conmon + var ( + plabel string + con selinux.Context + err error + ) + plabel, err = selinux.CurrentLabel() + if err != nil { + return errors.Wrapf(err, "Failed to get current SELinux label") + } + + con, err = selinux.NewContext(plabel) + if err != nil { + return errors.Wrapf(err, "Failed to get new context from SELinux label") + } + + runtime.LockOSThread() + if con["level"] != "s0" && con["level"] != "" { + con["level"] = "s0" + if err = label.SetProcessLabel(con.Get()); err != nil { + runtime.UnlockOSThread() + return err + } + } + err = cmd.Start() + // Ignore error returned from SetProcessLabel("") call, + // can't recover. + if labelErr := label.SetProcessLabel(""); labelErr != nil { + logrus.Errorf("unable to set process label: %q", err) + } + runtime.UnlockOSThread() + return err +} + +// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup +// it then signals for conmon to start by sending nonse data down the start fd +func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File, uuid string) error { + mustCreateCgroup := true + // If cgroup creation is disabled - just signal. + if ctr.config.NoCgroups { + mustCreateCgroup = false + } + + if rootless.IsRootless() { + ownsCgroup, err := cgroups.UserOwnsCurrentSystemdCgroup() + if err != nil { + return err + } + mustCreateCgroup = !ownsCgroup + } + + if mustCreateCgroup { + cgroupParent := ctr.CgroupParent() + if r.cgroupManager == SystemdCgroupsManager { + unitName := createUnitName("libpod-conmon", ctr.ID()) + + realCgroupParent := cgroupParent + splitParent := strings.Split(cgroupParent, "/") + if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { + realCgroupParent = splitParent[len(splitParent)-1] + } + + logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) + if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { + logrus.Warnf("Failed to add conmon to systemd sandbox cgroup: %v", err) + } + } else { + cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") + control, err := cgroups.New(cgroupPath, &spec.LinuxResources{}) + if err != nil { + logrus.Warnf("Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } else { + // we need to remove this defer and delete the cgroup once conmon exits + // maybe need a conmon monitor? + if err := control.AddPid(cmd.Process.Pid); err != nil { + logrus.Warnf("Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } + } + } + } + + /* We set the cgroup, now the child can start creating children */ + if err := writeConmonPipeData(startFd); err != nil { + return err + } + return nil +} + +// newPipe creates a unix socket pair for communication +func newPipe() (parent *os.File, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} + +// readConmonPidFile attempts to read conmon's pid from its pid file +func readConmonPidFile(pidFile string) (int, error) { + // Let's try reading the Conmon pid at the same time. + if pidFile != "" { + contents, err := ioutil.ReadFile(pidFile) + if err != nil { + return -1, err + } + // Convert it to an int + conmonPID, err := strconv.Atoi(string(contents)) + if err != nil { + return -1, err + } + return conmonPID, nil + } + return 0, nil +} + +// readConmonPipeData attempts to read a syncInfo struct from the pipe +func readConmonPipeData(pipe *os.File, ociLog string) (int, error) { + // syncInfo is used to return data from monitor process to daemon + type syncInfo struct { + Data int `json:"data"` + Message string `json:"message,omitempty"` + } + + // Wait to get container pid from conmon + type syncStruct struct { + si *syncInfo + err error + } + ch := make(chan syncStruct) + go func() { + var si *syncInfo + rdr := bufio.NewReader(pipe) + b, err := rdr.ReadBytes('\n') + if err != nil { + ch <- syncStruct{err: err} + } + if err := json.Unmarshal(b, &si); err != nil { + ch <- syncStruct{err: err} + return + } + ch <- syncStruct{si: si} + }() + + data := -1 + select { + case ss := <-ch: + if ss.err != nil { + if ociLog != "" { + ociLogData, err := ioutil.ReadFile(ociLog) + if err == nil { + var ociErr ociError + if err := json.Unmarshal(ociLogData, &ociErr); err == nil { + return -1, getOCIRuntimeError(ociErr.Msg) + } + } + } + return -1, errors.Wrapf(ss.err, "error reading container (probably exited) json message") + } + logrus.Debugf("Received: %d", ss.si.Data) + if ss.si.Data < 0 { + if ociLog != "" { + ociLogData, err := ioutil.ReadFile(ociLog) + if err == nil { + var ociErr ociError + if err := json.Unmarshal(ociLogData, &ociErr); err == nil { + return ss.si.Data, getOCIRuntimeError(ociErr.Msg) + } + } + } + // If we failed to parse the JSON errors, then print the output as it is + if ss.si.Message != "" { + return ss.si.Data, getOCIRuntimeError(ss.si.Message) + } + return ss.si.Data, errors.Wrapf(define.ErrInternal, "container create failed") + } + data = ss.si.Data + case <-time.After(ContainerCreateTimeout): + return -1, errors.Wrapf(define.ErrInternal, "container creation timeout") + } + return data, nil +} + +// writeConmonPipeData writes nonse data to a pipe +func writeConmonPipeData(pipe *os.File) error { + someData := []byte{0} + _, err := pipe.Write(someData) + return err +} + +// formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon +func formatRuntimeOpts(opts ...string) []string { + args := make([]string, 0, len(opts)*2) + for _, o := range opts { + args = append(args, "--runtime-opt", o) + } + return args +} + +// getConmonVersion returns a string representation of the conmon version. +func (r *ConmonOCIRuntime) getConmonVersion() (string, error) { + output, err := utils.ExecCmd(r.conmonPath, "--version") + if err != nil { + return "", err + } + return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil +} + +// getOCIRuntimeVersion returns a string representation of the OCI runtime's +// version. +func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) { + output, err := utils.ExecCmd(r.path, "--version") + if err != nil { + return "", err + } + return strings.TrimSuffix(output, "\n"), nil +} |