diff options
author | Matthew Heon <matthew.heon@pm.me> | 2019-10-08 13:53:36 -0400 |
---|---|---|
committer | Matthew Heon <matthew.heon@pm.me> | 2019-10-10 10:19:32 -0400 |
commit | 6f630bc09b3e937fe3ddc4a829715bacd5b6c779 (patch) | |
tree | 4f95293e4673bd5f046847c6b669bf124e57e90c /libpod/oci_linux.go | |
parent | a7f266891ca20214f56d0bb742896e9112f4905a (diff) | |
download | podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.tar.gz podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.tar.bz2 podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.zip |
Move OCI runtime implementation behind an interface
For future work, we need multiple implementations of the OCI
runtime, not just a Conmon-wrapped runtime matching the runc CLI.
As part of this, do some refactoring on the interface for exec
(move to a struct, not a massive list of arguments). Also, add
'all' support to Kill and Stop (supported by runc and used a bit
internally for removing containers).
Signed-off-by: Matthew Heon <matthew.heon@pm.me>
Diffstat (limited to 'libpod/oci_linux.go')
-rw-r--r-- | libpod/oci_linux.go | 503 |
1 files changed, 0 insertions, 503 deletions
diff --git a/libpod/oci_linux.go b/libpod/oci_linux.go deleted file mode 100644 index 9ec074704..000000000 --- a/libpod/oci_linux.go +++ /dev/null @@ -1,503 +0,0 @@ -// +build linux - -package libpod - -import ( - "fmt" - "os" - "os/exec" - "path/filepath" - "runtime" - "strconv" - "strings" - "syscall" - "time" - - "github.com/containers/libpod/libpod/define" - "github.com/containers/libpod/pkg/errorhandling" - "github.com/containers/libpod/pkg/rootless" - "github.com/containers/libpod/pkg/util" - "github.com/containers/libpod/utils" - pmount "github.com/containers/storage/pkg/mount" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" - "k8s.io/client-go/tools/remotecommand" -) - -// makeAccessible changes the path permission and each parent directory to have --x--x--x -func makeAccessible(path string, uid, gid int) error { - for ; path != "/"; path = filepath.Dir(path) { - st, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - return nil - } - return err - } - if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { - continue - } - if st.Mode()&0111 != 0111 { - if err := os.Chmod(path, st.Mode()|0111); err != nil { - return err - } - } - } - return nil -} - -// CreateContainer creates a container in the OCI runtime -// TODO terminal support for container -// Presently just ignoring conmon opts related to it -func (r *OCIRuntime) createContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) { - if len(ctr.config.IDMappings.UIDMap) != 0 || len(ctr.config.IDMappings.GIDMap) != 0 { - for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.VolumePath} { - if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { - return err - } - } - - // if we are running a non privileged container, be sure to umount some kernel paths so they are not - // bind mounted inside the container at all. - if !ctr.config.Privileged && !rootless.IsRootless() { - ch := make(chan error) - go func() { - runtime.LockOSThread() - err := func() error { - fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) - if err != nil { - return err - } - defer errorhandling.CloseQuiet(fd) - - // create a new mountns on the current thread - if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { - return err - } - defer func() { - if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { - logrus.Errorf("unable to clone new namespace: %q", err) - } - }() - - // don't spread our mounts around. We are setting only /sys to be slave - // so that the cleanup process is still able to umount the storage and the - // changes are propagated to the host. - err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") - if err != nil { - return errors.Wrapf(err, "cannot make /sys slave") - } - - mounts, err := pmount.GetMounts() - if err != nil { - return err - } - for _, m := range mounts { - if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { - continue - } - err = unix.Unmount(m.Mountpoint, 0) - if err != nil && !os.IsNotExist(err) { - return errors.Wrapf(err, "cannot unmount %s", m.Mountpoint) - } - } - return r.createOCIContainer(ctr, restoreOptions) - }() - ch <- err - }() - err := <-ch - return err - } - } - return r.createOCIContainer(ctr, restoreOptions) -} - -func (r *OCIRuntime) pathPackage() string { - return packageVersion(r.path) -} - -func (r *OCIRuntime) conmonPackage() string { - return packageVersion(r.conmonPath) -} - -// execContainer executes a command in a running container -// TODO: Add --detach support -// TODO: Convert to use conmon -// TODO: add --pid-file and use that to generate exec session tracking -func (r *OCIRuntime) execContainer(c *Container, cmd, capAdd, env []string, tty bool, cwd, user, sessionID string, streams *AttachStreams, preserveFDs int, resize chan remotecommand.TerminalSize, detachKeys string) (int, chan error, error) { - if len(cmd) == 0 { - return -1, nil, errors.Wrapf(define.ErrInvalidArg, "must provide a command to execute") - } - - if sessionID == "" { - return -1, nil, errors.Wrapf(define.ErrEmptyID, "must provide a session ID for exec") - } - - // create sync pipe to receive the pid - parentSyncPipe, childSyncPipe, err := newPipe() - if err != nil { - return -1, nil, errors.Wrapf(err, "error creating socket pair") - } - - defer errorhandling.CloseQuiet(parentSyncPipe) - - // create start pipe to set the cgroup before running - // attachToExec is responsible for closing parentStartPipe - childStartPipe, parentStartPipe, err := newPipe() - if err != nil { - return -1, nil, errors.Wrapf(err, "error creating socket pair") - } - - // We want to make sure we close the parent{Start,Attach}Pipes if we fail - // but also don't want to close them after attach to exec is called - attachToExecCalled := false - - defer func() { - if !attachToExecCalled { - errorhandling.CloseQuiet(parentStartPipe) - } - }() - - // create the attach pipe to allow attach socket to be created before - // $RUNTIME exec starts running. This is to make sure we can capture all output - // from the process through that socket, rather than half reading the log, half attaching to the socket - // attachToExec is responsible for closing parentAttachPipe - parentAttachPipe, childAttachPipe, err := newPipe() - if err != nil { - return -1, nil, errors.Wrapf(err, "error creating socket pair") - } - - defer func() { - if !attachToExecCalled { - errorhandling.CloseQuiet(parentAttachPipe) - } - }() - - childrenClosed := false - defer func() { - if !childrenClosed { - errorhandling.CloseQuiet(childSyncPipe) - errorhandling.CloseQuiet(childAttachPipe) - errorhandling.CloseQuiet(childStartPipe) - } - }() - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return -1, nil, err - } - - processFile, err := prepareProcessExec(c, cmd, env, tty, cwd, user, sessionID) - if err != nil { - return -1, nil, err - } - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = c.execOCILog(sessionID) - } - args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog) - - if preserveFDs > 0 { - args = append(args, formatRuntimeOpts("--preserve-fds", strconv.Itoa(preserveFDs))...) - } - - for _, capability := range capAdd { - args = append(args, formatRuntimeOpts("--cap", capability)...) - } - - if tty { - args = append(args, "-t") - } - - // Append container ID and command - args = append(args, "-e") - // TODO make this optional when we can detach - args = append(args, "--exec-attach") - args = append(args, "--exec-process-spec", processFile.Name()) - - logrus.WithFields(logrus.Fields{ - "args": args, - }).Debugf("running conmon: %s", r.conmonPath) - execCmd := exec.Command(r.conmonPath, args...) - - if streams.AttachInput { - execCmd.Stdin = streams.InputStream - } - if streams.AttachOutput { - execCmd.Stdout = streams.OutputStream - } - if streams.AttachError { - execCmd.Stderr = streams.ErrorStream - } - - conmonEnv, extraFiles, err := r.configureConmonEnv(runtimeDir) - if err != nil { - return -1, nil, err - } - - if preserveFDs > 0 { - for fd := 3; fd < 3+preserveFDs; fd++ { - execCmd.ExtraFiles = append(execCmd.ExtraFiles, os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd))) - } - } - - // we don't want to step on users fds they asked to preserve - // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 - execCmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4), fmt.Sprintf("_OCI_ATTACHPIPE=%d", preserveFDs+5)) - execCmd.Env = append(execCmd.Env, conmonEnv...) - - execCmd.ExtraFiles = append(execCmd.ExtraFiles, childSyncPipe, childStartPipe, childAttachPipe) - execCmd.ExtraFiles = append(execCmd.ExtraFiles, extraFiles...) - execCmd.Dir = c.execBundlePath(sessionID) - execCmd.SysProcAttr = &syscall.SysProcAttr{ - Setpgid: true, - } - - err = startCommandGivenSelinux(execCmd) - - // We don't need children pipes on the parent side - errorhandling.CloseQuiet(childSyncPipe) - errorhandling.CloseQuiet(childAttachPipe) - errorhandling.CloseQuiet(childStartPipe) - childrenClosed = true - - if err != nil { - return -1, nil, errors.Wrapf(err, "cannot start container %s", c.ID()) - } - if err := r.moveConmonToCgroupAndSignal(c, execCmd, parentStartPipe, sessionID); err != nil { - return -1, nil, err - } - - if preserveFDs > 0 { - for fd := 3; fd < 3+preserveFDs; fd++ { - // These fds were passed down to the runtime. Close them - // and not interfere - if err := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)).Close(); err != nil { - logrus.Debugf("unable to close file fd-%d", fd) - } - } - } - - // TODO Only create if !detach - // Attach to the container before starting it - attachChan := make(chan error) - go func() { - // attachToExec is responsible for closing pipes - attachChan <- c.attachToExec(streams, detachKeys, resize, sessionID, parentStartPipe, parentAttachPipe) - close(attachChan) - }() - attachToExecCalled = true - - pid, err := readConmonPipeData(parentSyncPipe, ociLog) - - return pid, attachChan, err -} - -// Wait for a container which has been sent a signal to stop -func waitContainerStop(ctr *Container, timeout time.Duration) error { - done := make(chan struct{}) - chControl := make(chan struct{}) - go func() { - for { - select { - case <-chControl: - return - default: - // Check if the process is still around - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - close(done) - return - } - time.Sleep(100 * time.Millisecond) - } - } - }() - select { - case <-done: - return nil - case <-time.After(timeout): - close(chControl) - logrus.Debugf("container %s did not die within timeout %d", ctr.ID(), timeout) - return errors.Errorf("container %s did not die within timeout", ctr.ID()) - } -} - -// Wait for a set of given PIDs to stop -func waitPidsStop(pids []int, timeout time.Duration) error { - done := make(chan struct{}) - chControl := make(chan struct{}) - go func() { - for { - select { - case <-chControl: - return - default: - allClosed := true - for _, pid := range pids { - if err := unix.Kill(pid, 0); err != unix.ESRCH { - allClosed = false - break - } - } - if allClosed { - close(done) - return - } - time.Sleep(100 * time.Millisecond) - } - } - }() - select { - case <-done: - return nil - case <-time.After(timeout): - close(chControl) - return errors.Errorf("given PIDs did not die within timeout") - } -} - -// stopContainer stops a container, first using its given stop signal (or -// SIGTERM if no signal was specified), then using SIGKILL -// Timeout is given in seconds. If timeout is 0, the container will be -// immediately kill with SIGKILL -// Does not set finished time for container, assumes you will run updateStatus -// after to pull the exit code -func (r *OCIRuntime) stopContainer(ctr *Container, timeout uint) error { - logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) - - // Ping the container to see if it's alive - // If it's not, it's already stopped, return - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - stopSignal := ctr.config.StopSignal - if stopSignal == 0 { - stopSignal = uint(syscall.SIGTERM) - } - - if timeout > 0 { - if err := r.killContainer(ctr, stopSignal); err != nil { - // Is the container gone? - // If so, it probably died between the first check and - // our sending the signal - // The container is stopped, so exit cleanly - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - return err - } - - if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { - logrus.Warnf("Timed out stopping container %s, resorting to SIGKILL", ctr.ID()) - } else { - // No error, the container is dead - return nil - } - } - - var args []string - if rootless.IsRootless() || ctr.config.NoCgroups { - // we don't use --all for rootless containers as the OCI runtime might use - // the cgroups to determine the PIDs, but for rootless containers there is - // not any. - // Same logic for NoCgroups - we can't use cgroups as the user - // explicitly requested none be created. - args = []string{"kill", ctr.ID(), "KILL"} - } else { - args = []string{"kill", "--all", ctr.ID(), "KILL"} - } - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { - // Again, check if the container is gone. If it is, exit cleanly. - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - return errors.Wrapf(err, "error sending SIGKILL to container %s", ctr.ID()) - } - - // Give runtime a few seconds to make it happen - if err := waitContainerStop(ctr, killContainerTimeout); err != nil { - return err - } - - return nil -} - -// execStopContainer stops all active exec sessions in a container -// It will also stop all other processes in the container. It is only intended -// to be used to assist in cleanup when removing a container. -// SIGTERM is used by default to stop processes. If SIGTERM fails, SIGKILL will be used. -func (r *OCIRuntime) execStopContainer(ctr *Container, timeout uint) error { - // Do we have active exec sessions? - if len(ctr.state.ExecSessions) == 0 { - return nil - } - - // Get a list of active exec sessions - execSessions := []int{} - for _, session := range ctr.state.ExecSessions { - pid := session.PID - // Ping the PID with signal 0 to see if it still exists - if err := unix.Kill(pid, 0); err == unix.ESRCH { - continue - } - - execSessions = append(execSessions, pid) - } - - // All the sessions may be dead - // If they are, just return - if len(execSessions) == 0 { - return nil - } - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - - // If timeout is 0, just use SIGKILL - if timeout > 0 { - // Stop using SIGTERM by default - // Use SIGSTOP after a timeout - logrus.Debugf("Killing all processes in container %s with SIGTERM", ctr.ID()) - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", "--all", ctr.ID(), "TERM"); err != nil { - return errors.Wrapf(err, "error sending SIGTERM to container %s processes", ctr.ID()) - } - - // Wait for all processes to stop - if err := waitPidsStop(execSessions, time.Duration(timeout)*time.Second); err != nil { - logrus.Warnf("Timed out stopping container %s exec sessions", ctr.ID()) - } else { - // No error, all exec sessions are dead - return nil - } - } - - // Send SIGKILL - logrus.Debugf("Killing all processes in container %s with SIGKILL", ctr.ID()) - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", "--all", ctr.ID(), "KILL"); err != nil { - return errors.Wrapf(err, "error sending SIGKILL to container %s processes", ctr.ID()) - } - - // Give the processes a few seconds to go down - if err := waitPidsStop(execSessions, killContainerTimeout); err != nil { - return errors.Wrapf(err, "failed to kill container %s exec sessions", ctr.ID()) - } - - return nil -} |