diff options
author | Matthew Heon <matthew.heon@pm.me> | 2019-10-08 13:53:36 -0400 |
---|---|---|
committer | Matthew Heon <matthew.heon@pm.me> | 2019-10-10 10:19:32 -0400 |
commit | 6f630bc09b3e937fe3ddc4a829715bacd5b6c779 (patch) | |
tree | 4f95293e4673bd5f046847c6b669bf124e57e90c /libpod/oci.go | |
parent | a7f266891ca20214f56d0bb742896e9112f4905a (diff) | |
download | podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.tar.gz podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.tar.bz2 podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.zip |
Move OCI runtime implementation behind an interface
For future work, we need multiple implementations of the OCI
runtime, not just a Conmon-wrapped runtime matching the runc CLI.
As part of this, do some refactoring on the interface for exec
(move to a struct, not a massive list of arguments). Also, add
'all' support to Kill and Stop (supported by runc and used a bit
internally for removing containers).
Signed-off-by: Matthew Heon <matthew.heon@pm.me>
Diffstat (limited to 'libpod/oci.go')
-rw-r--r-- | libpod/oci.go | 557 |
1 files changed, 124 insertions, 433 deletions
diff --git a/libpod/oci.go b/libpod/oci.go index 9879fa90e..37d04349f 100644 --- a/libpod/oci.go +++ b/libpod/oci.go @@ -1,441 +1,132 @@ package libpod import ( - "bytes" - "fmt" - "io/ioutil" - "net" - "os" - "os/exec" - "path/filepath" - "strings" - "time" - - "github.com/containers/libpod/libpod/define" - "github.com/containers/libpod/pkg/util" - "github.com/cri-o/ocicni/pkg/ocicni" - spec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux/label" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" - - // TODO import these functions into libpod and remove the import - // Trying to keep libpod from depending on CRI-O code - "github.com/containers/libpod/utils" -) - -// OCI code is undergoing heavy rewrite - -const ( - // CgroupfsCgroupsManager represents cgroupfs native cgroup manager - CgroupfsCgroupsManager = "cgroupfs" - // SystemdCgroupsManager represents systemd native cgroup manager - SystemdCgroupsManager = "systemd" - - // ContainerCreateTimeout represents the value of container creating timeout - ContainerCreateTimeout = 240 * time.Second - - // Timeout before declaring that runtime has failed to kill a given - // container - killContainerTimeout = 5 * time.Second - // DefaultShmSize is the default shm size - DefaultShmSize = 64 * 1024 * 1024 - // NsRunDir is the default directory in which running network namespaces - // are stored - NsRunDir = "/var/run/netns" + "k8s.io/client-go/tools/remotecommand" ) -// OCIRuntime represents an OCI-compatible runtime that libpod can call into -// to perform container operations -type OCIRuntime struct { - name string - path string - conmonPath string - conmonEnv []string - cgroupManager string - tmpDir string - exitsDir string - socketsDir string - logSizeMax int64 - noPivot bool - reservePorts bool - supportsJSON bool - supportsNoCgroups bool - sdNotify bool -} - -// ociError is used to parse the OCI runtime JSON log. 
It is not part of the -// OCI runtime specifications, it follows what runc does -type ociError struct { - Level string `json:"level,omitempty"` - Time string `json:"time,omitempty"` - Msg string `json:"msg,omitempty"` -} - -// Make a new OCI runtime with provided options. -// The first path that points to a valid executable will be used. -func newOCIRuntime(name string, paths []string, conmonPath string, runtimeCfg *RuntimeConfig, supportsJSON, supportsNoCgroups bool) (*OCIRuntime, error) { - if name == "" { - return nil, errors.Wrapf(define.ErrInvalidArg, "the OCI runtime must be provided a non-empty name") - } - - runtime := new(OCIRuntime) - runtime.name = name - runtime.conmonPath = conmonPath - - runtime.conmonEnv = runtimeCfg.ConmonEnvVars - runtime.cgroupManager = runtimeCfg.CgroupManager - runtime.tmpDir = runtimeCfg.TmpDir - runtime.logSizeMax = runtimeCfg.MaxLogSize - runtime.noPivot = runtimeCfg.NoPivotRoot - runtime.reservePorts = runtimeCfg.EnablePortReservation - runtime.sdNotify = runtimeCfg.SDNotify - - // TODO: probe OCI runtime for feature and enable automatically if - // available. 
- runtime.supportsJSON = supportsJSON - runtime.supportsNoCgroups = supportsNoCgroups - - foundPath := false - for _, path := range paths { - stat, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - continue - } - return nil, errors.Wrapf(err, "cannot stat %s", path) - } - if !stat.Mode().IsRegular() { - continue - } - foundPath = true - runtime.path = path - logrus.Debugf("using runtime %q", path) - break - } - - // Search the $PATH as last fallback - if !foundPath { - if foundRuntime, err := exec.LookPath(name); err == nil { - foundPath = true - runtime.path = foundRuntime - logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime) - } - } - - if !foundPath { - return nil, errors.Wrapf(define.ErrInvalidArg, "no valid executable found for OCI runtime %s", name) - } - - runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits") - runtime.socketsDir = filepath.Join(runtime.tmpDir, "socket") - - if runtime.cgroupManager != CgroupfsCgroupsManager && runtime.cgroupManager != SystemdCgroupsManager { - return nil, errors.Wrapf(define.ErrInvalidArg, "invalid cgroup manager specified: %s", runtime.cgroupManager) - } - - // Create the exit files and attach sockets directories - if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil { - // The directory is allowed to exist - if !os.IsExist(err) { - return nil, errors.Wrapf(err, "error creating OCI runtime exit files directory %s", - runtime.exitsDir) - } - } - if err := os.MkdirAll(runtime.socketsDir, 0750); err != nil { - // The directory is allowed to exist - if !os.IsExist(err) { - return nil, errors.Wrapf(err, "error creating OCI runtime attach sockets directory %s", - runtime.socketsDir) - } - } - - return runtime, nil -} - -// Create systemd unit name for cgroup scopes -func createUnitName(prefix string, name string) string { - return fmt.Sprintf("%s-%s.scope", prefix, name) -} - -func bindPorts(ports []ocicni.PortMapping) ([]*os.File, error) { - var files []*os.File - notifySCTP := false - 
for _, i := range ports { - switch i.Protocol { - case "udp": - addr, err := net.ResolveUDPAddr("udp", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort)) - if err != nil { - return nil, errors.Wrapf(err, "cannot resolve the UDP address") - } - - server, err := net.ListenUDP("udp", addr) - if err != nil { - return nil, errors.Wrapf(err, "cannot listen on the UDP port") - } - f, err := server.File() - if err != nil { - return nil, errors.Wrapf(err, "cannot get file for UDP socket") - } - files = append(files, f) - - case "tcp": - addr, err := net.ResolveTCPAddr("tcp4", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort)) - if err != nil { - return nil, errors.Wrapf(err, "cannot resolve the TCP address") - } - - server, err := net.ListenTCP("tcp4", addr) - if err != nil { - return nil, errors.Wrapf(err, "cannot listen on the TCP port") - } - f, err := server.File() - if err != nil { - return nil, errors.Wrapf(err, "cannot get file for TCP socket") - } - files = append(files, f) - case "sctp": - if !notifySCTP { - notifySCTP = true - logrus.Warnf("port reservation for SCTP is not supported") - } - default: - return nil, fmt.Errorf("unknown protocol %s", i.Protocol) - - } - } - return files, nil -} - -// updateContainerStatus retrieves the current status of the container from the -// runtime. It updates the container's state but does not save it. -// If useRunc is false, we will not directly hit runc to see the container's -// status, but will instead only check for the existence of the conmon exit file -// and update state to stopped if it exists. -func (r *OCIRuntime) updateContainerStatus(ctr *Container, useRuntime bool) error { - exitFile := ctr.exitFilePath() - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - - // If not using the OCI runtime, we don't need to do most of this. - if !useRuntime { - // If the container's not running, nothing to do. 
- if ctr.state.State != define.ContainerStateRunning && ctr.state.State != define.ContainerStatePaused { - return nil - } - - // Check for the exit file conmon makes - info, err := os.Stat(exitFile) - if err != nil { - if os.IsNotExist(err) { - // Container is still running, no error - return nil - } - - return errors.Wrapf(err, "error running stat on container %s exit file", ctr.ID()) - } - - // Alright, it exists. Transition to Stopped state. - ctr.state.State = define.ContainerStateStopped - ctr.state.PID = 0 - ctr.state.ConmonPID = 0 - - // Read the exit file to get our stopped time and exit code. - return ctr.handleExitFile(exitFile, info) - } - - // Store old state so we know if we were already stopped - oldState := ctr.state.State - - state := new(spec.State) - - cmd := exec.Command(r.path, "state", ctr.ID()) - cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) - - outPipe, err := cmd.StdoutPipe() - if err != nil { - return errors.Wrapf(err, "getting stdout pipe") - } - errPipe, err := cmd.StderrPipe() - if err != nil { - return errors.Wrapf(err, "getting stderr pipe") - } - - if err := cmd.Start(); err != nil { - out, err2 := ioutil.ReadAll(errPipe) - if err2 != nil { - return errors.Wrapf(err, "error getting container %s state", ctr.ID()) - } - if strings.Contains(string(out), "does not exist") { - if err := ctr.removeConmonFiles(); err != nil { - logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) - } - ctr.state.ExitCode = -1 - ctr.state.FinishedTime = time.Now() - ctr.state.State = define.ContainerStateExited - return nil - } - return errors.Wrapf(err, "error getting container %s state. 
stderr/out: %s", ctr.ID(), out) - } - defer func() { - _ = cmd.Wait() - }() - - if err := errPipe.Close(); err != nil { - return err - } - out, err := ioutil.ReadAll(outPipe) - if err != nil { - return errors.Wrapf(err, "error reading stdout: %s", ctr.ID()) - } - if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil { - return errors.Wrapf(err, "error decoding container status for container %s", ctr.ID()) - } - ctr.state.PID = state.Pid - - switch state.Status { - case "created": - ctr.state.State = define.ContainerStateCreated - case "paused": - ctr.state.State = define.ContainerStatePaused - case "running": - ctr.state.State = define.ContainerStateRunning - case "stopped": - ctr.state.State = define.ContainerStateStopped - default: - return errors.Wrapf(define.ErrInternal, "unrecognized status returned by runtime for container %s: %s", - ctr.ID(), state.Status) - } - - // Only grab exit status if we were not already stopped - // If we were, it should already be in the database - if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { - var fi os.FileInfo - chWait := make(chan error) - defer close(chWait) - - _, err := WaitForFile(exitFile, chWait, time.Second*5) - if err == nil { - fi, err = os.Stat(exitFile) - } - if err != nil { - ctr.state.ExitCode = -1 - ctr.state.FinishedTime = time.Now() - logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err) - return nil - } - - return ctr.handleExitFile(exitFile, fi) - } - - return nil -} - -// startContainer starts the given container -// Sets time the container was started, but does not save it. -func (r *OCIRuntime) startContainer(ctr *Container) error { - // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers? 
- runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok { - env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify)) - } - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "start", ctr.ID()); err != nil { - return err - } - - ctr.state.StartedTime = time.Now() - - return nil -} - -// killContainer sends the given signal to the given container -func (r *OCIRuntime) killContainer(ctr *Container, signal uint) error { - logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", ctr.ID(), fmt.Sprintf("%d", signal)); err != nil { - return errors.Wrapf(err, "error sending signal to container %s", ctr.ID()) - } - - return nil -} - -// deleteContainer deletes a container from the OCI runtime -func (r *OCIRuntime) deleteContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "delete", "--force", ctr.ID()) -} - -// pauseContainer pauses the given container -func (r *OCIRuntime) pauseContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "pause", ctr.ID()) -} - -// unpauseContainer unpauses the given container -func (r *OCIRuntime) unpauseContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := 
[]string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "resume", ctr.ID()) -} - -// checkpointContainer checkpoints the given container -func (r *OCIRuntime) checkpointContainer(ctr *Container, options ContainerCheckpointOptions) error { - if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { - return err - } - // imagePath is used by CRIU to store the actual checkpoint files - imagePath := ctr.CheckpointPath() - // workPath will be used to store dump.log and stats-dump - workPath := ctr.bundlePath() - logrus.Debugf("Writing checkpoint to %s", imagePath) - logrus.Debugf("Writing checkpoint logs to %s", workPath) - args := []string{} - args = append(args, "checkpoint") - args = append(args, "--image-path") - args = append(args, imagePath) - args = append(args, "--work-path") - args = append(args, workPath) - if options.KeepRunning { - args = append(args, "--leave-running") - } - if options.TCPEstablished { - args = append(args, "--tcp-established") - } - args = append(args, ctr.ID()) - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...) +// OCIRuntime is an implementation of an OCI runtime. +// The OCI runtime implementation is expected to be a fairly thin wrapper around +// the actual runtime, and is not expected to include things like state +// management logic - e.g., we do not expect it to determine on its own that +// calling 'UnpauseContainer()' on a container that is not paused is an error. +// The code calling the OCIRuntime will manage this. +// TODO: May want to move the Attach() code under this umbrella. It's highly OCI +// runtime dependent. +// TODO: May want to move the conmon cleanup code here too - it depends on +// Conmon being in use. +type OCIRuntime interface { + // Name returns the name of the runtime. + Name() string + // Path returns the path to the runtime executable. 
+ Path() string + + // CreateContainer creates the container in the OCI runtime. + CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) error + // UpdateContainerStatus updates the status of the given container. + // It includes a switch for whether to perform a hard query of the + // runtime. If unset, the exit file (if supported by the implementation) + // will be used. + UpdateContainerStatus(ctr *Container, useRuntime bool) error + // StartContainer starts the given container. + StartContainer(ctr *Container) error + // KillContainer sends the given signal to the given container. + // If all is set, all processes in the container will be signalled; + // otherwise, only init will be signalled. + KillContainer(ctr *Container, signal uint, all bool) error + // StopContainer stops the given container. + // The container's stop signal (or SIGTERM if unspecified) will be sent + // first. + // After the given timeout, SIGKILL will be sent. + // If the given timeout is 0, SIGKILL will be sent immediately, and the + // stop signal will be omitted. + // If all is set, we will attempt to use the --all flag with `kill` in + // the OCI runtime to kill all processes in the container, including + // exec sessions. This is only supported if the container has cgroups. + StopContainer(ctr *Container, timeout uint, all bool) error + // DeleteContainer deletes the given container from the OCI runtime. + DeleteContainer(ctr *Container) error + // PauseContainer pauses the given container. + PauseContainer(ctr *Container) error + // UnpauseContainer unpauses the given container. + UnpauseContainer(ctr *Container) error + + // ExecContainer executes a command in a running container. + // Returns an int (exit code), error channel (errors from attach), and + // error (errors that occurred attempting to start the exec session).
+ ExecContainer(ctr *Container, sessionID string, options *ExecOptions) (int, chan error, error) + // ExecStopContainer stops a given exec session in a running container. + // SIGTERM will be sent initially, then SIGKILL after the given timeout. + // If timeout is 0, SIGKILL will be sent immediately, and SIGTERM will + // be omitted. + ExecStopContainer(ctr *Container, sessionID string, timeout uint) error + // ExecContainerCleanup cleans up after an exec session exits. + // It removes any files left by the exec session that are no longer + // needed, including the attach socket. + ExecContainerCleanup(ctr *Container, sessionID string) error + + // CheckpointContainer checkpoints the given container. + // Some OCI runtimes may not support this - if SupportsCheckpoint() + // returns false, this is not implemented, and will always return an + // error. + CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error + + // SupportsCheckpoint returns whether this OCI runtime + // implementation supports the CheckpointContainer() operation. + SupportsCheckpoint() bool + // SupportsJSONErrors is whether the runtime can return JSON-formatted + // error messages. + SupportsJSONErrors() bool + // SupportsNoCgroups is whether the runtime supports running containers + // without cgroups. + SupportsNoCgroups() bool + + // AttachSocketPath is the path to the socket to attach to a given + // container. + // TODO: If we move Attach code in here, this should be made internal. + // We don't want to force all runtimes to share the same attach + // implementation. + AttachSocketPath(ctr *Container) (string, error) + // ExecAttachSocketPath is the path to the socket to attach to a given + // exec session in the given container. + // TODO: Probably should be made internal. + ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) + // ExitFilePath is the path to a container's exit file.
+ // All runtime implementations must create an exit file when containers + // exit, containing the exit code of the container (as a string). + // This is the path to that file for a given container. + ExitFilePath(ctr *Container) (string, error) + + // RuntimeInfo returns verbose information about the runtime. + RuntimeInfo() (map[string]interface{}, error) } -func (r *OCIRuntime) featureCheckCheckpointing() bool { - // Check if the runtime implements checkpointing. Currently only - // runc's checkpoint/restore implementation is supported. - cmd := exec.Command(r.path, "checkpoint", "-h") - if err := cmd.Start(); err != nil { - return false - } - if err := cmd.Wait(); err == nil { - return true - } - return false +// ExecOptions are options passed into ExecContainer. They control the command +// that will be executed and how the exec will proceed. +type ExecOptions struct { + // Cmd is the command to execute. + Cmd []string + // CapAdd is a set of capabilities to add to the executed command. + CapAdd []string + // Env is a set of environment variables to add to the container. + Env map[string]string + // Terminal is whether to create a new TTY for the exec session. + Terminal bool + // Cwd is the working directory for the executed command. If unset, the + // working directory of the container will be used. + Cwd string + // User is the user the command will be executed as. If unset, the user + // the container was run as will be used. + User string + // Streams are the streams that will be attached to the container. + Streams *AttachStreams + // PreserveFDs is a number of additional file descriptors (in addition + // to 0, 1, 2) that will be passed to the executed process. The total FDs + // passed will be 3 + PreserveFDs. + PreserveFDs uint + // Resize is a channel where terminal resize events are sent to be + // handled. + Resize chan remotecommand.TerminalSize + // DetachKeys is a set of keys that, when pressed in sequence, will + // detach from the container. 
+ DetachKeys string } |