summaryrefslogtreecommitdiff
path: root/libpod/oci.go
diff options
context:
space:
mode:
authorMatthew Heon <matthew.heon@pm.me>2019-10-08 13:53:36 -0400
committerMatthew Heon <matthew.heon@pm.me>2019-10-10 10:19:32 -0400
commit6f630bc09b3e937fe3ddc4a829715bacd5b6c779 (patch)
tree4f95293e4673bd5f046847c6b669bf124e57e90c /libpod/oci.go
parenta7f266891ca20214f56d0bb742896e9112f4905a (diff)
downloadpodman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.tar.gz
podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.tar.bz2
podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.zip
Move OCI runtime implementation behind an interface
For future work, we need multiple implementations of the OCI runtime, not just a Conmon-wrapped runtime matching the runc CLI. As part of this, do some refactoring on the interface for exec (move to a struct, not a massive list of arguments). Also, add 'all' support to Kill and Stop (supported by runc and used a bit internally for removing containers). Signed-off-by: Matthew Heon <matthew.heon@pm.me>
Diffstat (limited to 'libpod/oci.go')
-rw-r--r--libpod/oci.go557
1 files changed, 124 insertions, 433 deletions
diff --git a/libpod/oci.go b/libpod/oci.go
index 9879fa90e..37d04349f 100644
--- a/libpod/oci.go
+++ b/libpod/oci.go
@@ -1,441 +1,132 @@
package libpod
import (
- "bytes"
- "fmt"
- "io/ioutil"
- "net"
- "os"
- "os/exec"
- "path/filepath"
- "strings"
- "time"
-
- "github.com/containers/libpod/libpod/define"
- "github.com/containers/libpod/pkg/util"
- "github.com/cri-o/ocicni/pkg/ocicni"
- spec "github.com/opencontainers/runtime-spec/specs-go"
- "github.com/opencontainers/selinux/go-selinux/label"
- "github.com/pkg/errors"
- "github.com/sirupsen/logrus"
-
- // TODO import these functions into libpod and remove the import
- // Trying to keep libpod from depending on CRI-O code
- "github.com/containers/libpod/utils"
-)
-
-// OCI code is undergoing heavy rewrite
-
-const (
- // CgroupfsCgroupsManager represents cgroupfs native cgroup manager
- CgroupfsCgroupsManager = "cgroupfs"
- // SystemdCgroupsManager represents systemd native cgroup manager
- SystemdCgroupsManager = "systemd"
-
- // ContainerCreateTimeout represents the value of container creating timeout
- ContainerCreateTimeout = 240 * time.Second
-
- // Timeout before declaring that runtime has failed to kill a given
- // container
- killContainerTimeout = 5 * time.Second
- // DefaultShmSize is the default shm size
- DefaultShmSize = 64 * 1024 * 1024
- // NsRunDir is the default directory in which running network namespaces
- // are stored
- NsRunDir = "/var/run/netns"
+ "k8s.io/client-go/tools/remotecommand"
)
-// OCIRuntime represents an OCI-compatible runtime that libpod can call into
-// to perform container operations
-type OCIRuntime struct {
- name string
- path string
- conmonPath string
- conmonEnv []string
- cgroupManager string
- tmpDir string
- exitsDir string
- socketsDir string
- logSizeMax int64
- noPivot bool
- reservePorts bool
- supportsJSON bool
- supportsNoCgroups bool
- sdNotify bool
-}
-
-// ociError is used to parse the OCI runtime JSON log. It is not part of the
-// OCI runtime specifications, it follows what runc does
-type ociError struct {
- Level string `json:"level,omitempty"`
- Time string `json:"time,omitempty"`
- Msg string `json:"msg,omitempty"`
-}
-
-// Make a new OCI runtime with provided options.
-// The first path that points to a valid executable will be used.
-func newOCIRuntime(name string, paths []string, conmonPath string, runtimeCfg *RuntimeConfig, supportsJSON, supportsNoCgroups bool) (*OCIRuntime, error) {
- if name == "" {
- return nil, errors.Wrapf(define.ErrInvalidArg, "the OCI runtime must be provided a non-empty name")
- }
-
- runtime := new(OCIRuntime)
- runtime.name = name
- runtime.conmonPath = conmonPath
-
- runtime.conmonEnv = runtimeCfg.ConmonEnvVars
- runtime.cgroupManager = runtimeCfg.CgroupManager
- runtime.tmpDir = runtimeCfg.TmpDir
- runtime.logSizeMax = runtimeCfg.MaxLogSize
- runtime.noPivot = runtimeCfg.NoPivotRoot
- runtime.reservePorts = runtimeCfg.EnablePortReservation
- runtime.sdNotify = runtimeCfg.SDNotify
-
- // TODO: probe OCI runtime for feature and enable automatically if
- // available.
- runtime.supportsJSON = supportsJSON
- runtime.supportsNoCgroups = supportsNoCgroups
-
- foundPath := false
- for _, path := range paths {
- stat, err := os.Stat(path)
- if err != nil {
- if os.IsNotExist(err) {
- continue
- }
- return nil, errors.Wrapf(err, "cannot stat %s", path)
- }
- if !stat.Mode().IsRegular() {
- continue
- }
- foundPath = true
- runtime.path = path
- logrus.Debugf("using runtime %q", path)
- break
- }
-
- // Search the $PATH as last fallback
- if !foundPath {
- if foundRuntime, err := exec.LookPath(name); err == nil {
- foundPath = true
- runtime.path = foundRuntime
- logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime)
- }
- }
-
- if !foundPath {
- return nil, errors.Wrapf(define.ErrInvalidArg, "no valid executable found for OCI runtime %s", name)
- }
-
- runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits")
- runtime.socketsDir = filepath.Join(runtime.tmpDir, "socket")
-
- if runtime.cgroupManager != CgroupfsCgroupsManager && runtime.cgroupManager != SystemdCgroupsManager {
- return nil, errors.Wrapf(define.ErrInvalidArg, "invalid cgroup manager specified: %s", runtime.cgroupManager)
- }
-
- // Create the exit files and attach sockets directories
- if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil {
- // The directory is allowed to exist
- if !os.IsExist(err) {
- return nil, errors.Wrapf(err, "error creating OCI runtime exit files directory %s",
- runtime.exitsDir)
- }
- }
- if err := os.MkdirAll(runtime.socketsDir, 0750); err != nil {
- // The directory is allowed to exist
- if !os.IsExist(err) {
- return nil, errors.Wrapf(err, "error creating OCI runtime attach sockets directory %s",
- runtime.socketsDir)
- }
- }
-
- return runtime, nil
-}
-
-// Create systemd unit name for cgroup scopes
-func createUnitName(prefix string, name string) string {
- return fmt.Sprintf("%s-%s.scope", prefix, name)
-}
-
-func bindPorts(ports []ocicni.PortMapping) ([]*os.File, error) {
- var files []*os.File
- notifySCTP := false
- for _, i := range ports {
- switch i.Protocol {
- case "udp":
- addr, err := net.ResolveUDPAddr("udp", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort))
- if err != nil {
- return nil, errors.Wrapf(err, "cannot resolve the UDP address")
- }
-
- server, err := net.ListenUDP("udp", addr)
- if err != nil {
- return nil, errors.Wrapf(err, "cannot listen on the UDP port")
- }
- f, err := server.File()
- if err != nil {
- return nil, errors.Wrapf(err, "cannot get file for UDP socket")
- }
- files = append(files, f)
-
- case "tcp":
- addr, err := net.ResolveTCPAddr("tcp4", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort))
- if err != nil {
- return nil, errors.Wrapf(err, "cannot resolve the TCP address")
- }
-
- server, err := net.ListenTCP("tcp4", addr)
- if err != nil {
- return nil, errors.Wrapf(err, "cannot listen on the TCP port")
- }
- f, err := server.File()
- if err != nil {
- return nil, errors.Wrapf(err, "cannot get file for TCP socket")
- }
- files = append(files, f)
- case "sctp":
- if !notifySCTP {
- notifySCTP = true
- logrus.Warnf("port reservation for SCTP is not supported")
- }
- default:
- return nil, fmt.Errorf("unknown protocol %s", i.Protocol)
-
- }
- }
- return files, nil
-}
-
-// updateContainerStatus retrieves the current status of the container from the
-// runtime. It updates the container's state but does not save it.
-// If useRunc is false, we will not directly hit runc to see the container's
-// status, but will instead only check for the existence of the conmon exit file
-// and update state to stopped if it exists.
-func (r *OCIRuntime) updateContainerStatus(ctr *Container, useRuntime bool) error {
- exitFile := ctr.exitFilePath()
-
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
-
- // If not using the OCI runtime, we don't need to do most of this.
- if !useRuntime {
- // If the container's not running, nothing to do.
- if ctr.state.State != define.ContainerStateRunning && ctr.state.State != define.ContainerStatePaused {
- return nil
- }
-
- // Check for the exit file conmon makes
- info, err := os.Stat(exitFile)
- if err != nil {
- if os.IsNotExist(err) {
- // Container is still running, no error
- return nil
- }
-
- return errors.Wrapf(err, "error running stat on container %s exit file", ctr.ID())
- }
-
- // Alright, it exists. Transition to Stopped state.
- ctr.state.State = define.ContainerStateStopped
- ctr.state.PID = 0
- ctr.state.ConmonPID = 0
-
- // Read the exit file to get our stopped time and exit code.
- return ctr.handleExitFile(exitFile, info)
- }
-
- // Store old state so we know if we were already stopped
- oldState := ctr.state.State
-
- state := new(spec.State)
-
- cmd := exec.Command(r.path, "state", ctr.ID())
- cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir))
-
- outPipe, err := cmd.StdoutPipe()
- if err != nil {
- return errors.Wrapf(err, "getting stdout pipe")
- }
- errPipe, err := cmd.StderrPipe()
- if err != nil {
- return errors.Wrapf(err, "getting stderr pipe")
- }
-
- if err := cmd.Start(); err != nil {
- out, err2 := ioutil.ReadAll(errPipe)
- if err2 != nil {
- return errors.Wrapf(err, "error getting container %s state", ctr.ID())
- }
- if strings.Contains(string(out), "does not exist") {
- if err := ctr.removeConmonFiles(); err != nil {
- logrus.Debugf("unable to remove conmon files for container %s", ctr.ID())
- }
- ctr.state.ExitCode = -1
- ctr.state.FinishedTime = time.Now()
- ctr.state.State = define.ContainerStateExited
- return nil
- }
- return errors.Wrapf(err, "error getting container %s state. stderr/out: %s", ctr.ID(), out)
- }
- defer func() {
- _ = cmd.Wait()
- }()
-
- if err := errPipe.Close(); err != nil {
- return err
- }
- out, err := ioutil.ReadAll(outPipe)
- if err != nil {
- return errors.Wrapf(err, "error reading stdout: %s", ctr.ID())
- }
- if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil {
- return errors.Wrapf(err, "error decoding container status for container %s", ctr.ID())
- }
- ctr.state.PID = state.Pid
-
- switch state.Status {
- case "created":
- ctr.state.State = define.ContainerStateCreated
- case "paused":
- ctr.state.State = define.ContainerStatePaused
- case "running":
- ctr.state.State = define.ContainerStateRunning
- case "stopped":
- ctr.state.State = define.ContainerStateStopped
- default:
- return errors.Wrapf(define.ErrInternal, "unrecognized status returned by runtime for container %s: %s",
- ctr.ID(), state.Status)
- }
-
- // Only grab exit status if we were not already stopped
- // If we were, it should already be in the database
- if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped {
- var fi os.FileInfo
- chWait := make(chan error)
- defer close(chWait)
-
- _, err := WaitForFile(exitFile, chWait, time.Second*5)
- if err == nil {
- fi, err = os.Stat(exitFile)
- }
- if err != nil {
- ctr.state.ExitCode = -1
- ctr.state.FinishedTime = time.Now()
- logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err)
- return nil
- }
-
- return ctr.handleExitFile(exitFile, fi)
- }
-
- return nil
-}
-
-// startContainer starts the given container
-// Sets time the container was started, but does not save it.
-func (r *OCIRuntime) startContainer(ctr *Container) error {
- // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers?
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok {
- env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify))
- }
- if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "start", ctr.ID()); err != nil {
- return err
- }
-
- ctr.state.StartedTime = time.Now()
-
- return nil
-}
-
-// killContainer sends the given signal to the given container
-func (r *OCIRuntime) killContainer(ctr *Container, signal uint) error {
- logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID())
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", ctr.ID(), fmt.Sprintf("%d", signal)); err != nil {
- return errors.Wrapf(err, "error sending signal to container %s", ctr.ID())
- }
-
- return nil
-}
-
-// deleteContainer deletes a container from the OCI runtime
-func (r *OCIRuntime) deleteContainer(ctr *Container) error {
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "delete", "--force", ctr.ID())
-}
-
-// pauseContainer pauses the given container
-func (r *OCIRuntime) pauseContainer(ctr *Container) error {
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "pause", ctr.ID())
-}
-
-// unpauseContainer unpauses the given container
-func (r *OCIRuntime) unpauseContainer(ctr *Container) error {
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "resume", ctr.ID())
-}
-
-// checkpointContainer checkpoints the given container
-func (r *OCIRuntime) checkpointContainer(ctr *Container, options ContainerCheckpointOptions) error {
- if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil {
- return err
- }
- // imagePath is used by CRIU to store the actual checkpoint files
- imagePath := ctr.CheckpointPath()
- // workPath will be used to store dump.log and stats-dump
- workPath := ctr.bundlePath()
- logrus.Debugf("Writing checkpoint to %s", imagePath)
- logrus.Debugf("Writing checkpoint logs to %s", workPath)
- args := []string{}
- args = append(args, "checkpoint")
- args = append(args, "--image-path")
- args = append(args, imagePath)
- args = append(args, "--work-path")
- args = append(args, workPath)
- if options.KeepRunning {
- args = append(args, "--leave-running")
- }
- if options.TCPEstablished {
- args = append(args, "--tcp-established")
- }
- args = append(args, ctr.ID())
- return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...)
+// OCIRuntime is an implementation of an OCI runtime.
+// The OCI runtime implementation is expected to be a fairly thin wrapper around
+// the actual runtime, and is not expected to include things like state
+// management logic - e.g., we do not expect it to determine on its own that
+// calling 'UnpauseContainer()' on a container that is not paused is an error.
+// The code calling the OCIRuntime will manage this.
+// TODO: May want to move the Attach() code under this umbrella. It's highly OCI
+// runtime dependent.
+// TODO: May want to move the conmon cleanup code here too - it depends on
+// Conmon being in use.
+type OCIRuntime interface {
+ // Name returns the name of the runtime.
+ Name() string
+ // Path returns the path to the runtime executable.
+ Path() string
+
+ // CreateContainer creates the container in the OCI runtime.
+ CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) error
+ // UpdateContainerStatus updates the status of the given container.
+ // It includes a switch for whether to perform a hard query of the
+ // runtime. If unset, the exit file (if supported by the implementation)
+ // will be used.
+ UpdateContainerStatus(ctr *Container, useRuntime bool) error
+ // StartContainer starts the given container.
+ StartContainer(ctr *Container) error
+ // KillContainer sends the given signal to the given container.
+ // If all is set, all processes in the container will be signalled;
+ // otherwise, only init will be signalled.
+ KillContainer(ctr *Container, signal uint, all bool) error
+ // StopContainer stops the given container.
+ // The container's stop signal (or SIGTERM if unspecified) will be sent
+ // first.
+ // After the given timeout, SIGKILL will be sent.
+ // If the given timeout is 0, SIGKILL will be sent immediately, and the
+ // stop signal will be omitted.
+ // If all is set, we will attempt to use the --all flag with `kill` in
+ // the OCI runtime to kill all processes in the container, including
+ // exec sessions. This is only supported if the container has cgroups.
+ StopContainer(ctr *Container, timeout uint, all bool) error
+ // DeleteContainer deletes the given container from the OCI runtime.
+ DeleteContainer(ctr *Container) error
+ // PauseContainer pauses the given container.
+ PauseContainer(ctr *Container) error
+ // UnpauseContainer unpauses the given container.
+ UnpauseContainer(ctr *Container) error
+
+ // ExecContainer executes a command in a running container.
+ // Returns an int (exit code), error channel (errors from attach), and
+ // error (errors that occurred attempting to start the exec session).
+ ExecContainer(ctr *Container, sessionID string, options *ExecOptions) (int, chan error, error)
+ // ExecStopContainer stops a given exec session in a running container.
+ // SIGTERM will be sent initially, then SIGKILL after the given timeout.
+ // If timeout is 0, SIGKILL will be sent immediately, and SIGTERM will
+ // be omitted.
+ ExecStopContainer(ctr *Container, sessionID string, timeout uint) error
+ // ExecContainerCleanup cleans up after an exec session exits.
+ // It removes any files left by the exec session that are no longer
+ // needed, including the attach socket.
+ ExecContainerCleanup(ctr *Container, sessionID string) error
+
+ // CheckpointContainer checkpoints the given container.
+ // Some OCI runtimes may not support this - if SupportsCheckpoint()
+ // returns false, this is not implemented, and will always return an
+ // error.
+ CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error
+
+ // SupportsCheckpoint returns whether this OCI runtime
+ // implementation supports the CheckpointContainer() operation.
+ SupportsCheckpoint() bool
+ // SupportsJSONErrors is whether the runtime can return JSON-formatted
+ // error messages.
+ SupportsJSONErrors() bool
+ // SupportsNoCgroups is whether the runtime supports running containers
+ // without cgroups.
+ SupportsNoCgroups() bool
+
+ // AttachSocketPath is the path to the socket to attach to a given
+ // container.
+ // TODO: If we move Attach code in here, this should be made internal.
+ // We don't want to force all runtimes to share the same attach
+ // implementation.
+ AttachSocketPath(ctr *Container) (string, error)
+ // ExecAttachSocketPath is the path to the socket to attach to a given
+ // exec session in the given container.
+ // TODO: Probably should be made internal.
+ ExecAttachSocketPath(ctr *Container, sessionID string) (string, error)
+ // ExitFilePath is the path to a container's exit file.
+ // All runtime implementations must create an exit file when containers
+ // exit, containing the exit code of the container (as a string).
+ // This is the path to that file for a given container.
+ ExitFilePath(ctr *Container) (string, error)
+
+ // RuntimeInfo returns verbose information about the runtime.
+ RuntimeInfo() (map[string]interface{}, error)
}
-func (r *OCIRuntime) featureCheckCheckpointing() bool {
- // Check if the runtime implements checkpointing. Currently only
- // runc's checkpoint/restore implementation is supported.
- cmd := exec.Command(r.path, "checkpoint", "-h")
- if err := cmd.Start(); err != nil {
- return false
- }
- if err := cmd.Wait(); err == nil {
- return true
- }
- return false
+// ExecOptions are options passed into ExecContainer. They control the command
+// that will be executed and how the exec will proceed.
+type ExecOptions struct {
+ // Cmd is the command to execute.
+ Cmd []string
+ // CapAdd is a set of capabilities to add to the executed command.
+ CapAdd []string
+ // Env is a set of environment variables to add to the container.
+ Env map[string]string
+ // Terminal is whether to create a new TTY for the exec session.
+ Terminal bool
+ // Cwd is the working directory for the executed command. If unset, the
+ // working directory of the container will be used.
+ Cwd string
+ // User is the user the command will be executed as. If unset, the user
+ // the container was run as will be used.
+ User string
+ // Streams are the streams that will be attached to the container.
+ Streams *AttachStreams
+ // PreserveFDs is a number of additional file descriptors (in addition
+ // to 0, 1, 2) that will be passed to the executed process. The total FDs
+ // passed will be 3 + PreserveFDs.
+ PreserveFDs uint
+ // Resize is a channel where terminal resize events are sent to be
+ // handled.
+ Resize chan remotecommand.TerminalSize
+ // DetachKeys is a set of keys that, when pressed in sequence, will
+ // detach from the container.
+ DetachKeys string
}