diff options
author | Matthew Heon <matthew.heon@pm.me> | 2019-10-08 13:53:36 -0400 |
---|---|---|
committer | Matthew Heon <matthew.heon@pm.me> | 2019-10-10 10:19:32 -0400 |
commit | 6f630bc09b3e937fe3ddc4a829715bacd5b6c779 (patch) | |
tree | 4f95293e4673bd5f046847c6b669bf124e57e90c /libpod | |
parent | a7f266891ca20214f56d0bb742896e9112f4905a (diff) | |
download | podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.tar.gz podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.tar.bz2 podman-6f630bc09b3e937fe3ddc4a829715bacd5b6c779.zip |
Move OCI runtime implementation behind an interface
For future work, we need multiple implementations of the OCI
runtime, not just a Conmon-wrapped runtime matching the runc CLI.
As part of this, do some refactoring on the interface for exec
(move to a struct, not a massive list of arguments). Also, add
'all' support to Kill and Stop (supported by runc and used a bit
internally for removing containers).
Signed-off-by: Matthew Heon <matthew.heon@pm.me>
Diffstat (limited to 'libpod')
-rw-r--r-- | libpod/container.go | 2 | ||||
-rw-r--r-- | libpod/container_api.go | 39 | ||||
-rw-r--r-- | libpod/container_commit.go | 4 | ||||
-rw-r--r-- | libpod/container_internal.go | 78 | ||||
-rw-r--r-- | libpod/container_internal_linux.go | 6 | ||||
-rw-r--r-- | libpod/healthcheck.go | 2 | ||||
-rw-r--r-- | libpod/info.go | 45 | ||||
-rw-r--r-- | libpod/networking_linux.go | 2 | ||||
-rw-r--r-- | libpod/oci.go | 557 | ||||
-rw-r--r-- | libpod/oci_attach_linux.go | 12 | ||||
-rw-r--r-- | libpod/oci_conmon_linux.go | 1421 | ||||
-rw-r--r-- | libpod/oci_conmon_unsupported.go | 130 | ||||
-rw-r--r-- | libpod/oci_internal_linux.go | 556 | ||||
-rw-r--r-- | libpod/oci_linux.go | 503 | ||||
-rw-r--r-- | libpod/oci_unsupported.go | 47 | ||||
-rw-r--r-- | libpod/oci_util.go | 113 | ||||
-rw-r--r-- | libpod/pod_api.go | 4 | ||||
-rw-r--r-- | libpod/runtime.go | 17 | ||||
-rw-r--r-- | libpod/runtime_ctr.go | 16 |
19 files changed, 1916 insertions, 1638 deletions
diff --git a/libpod/container.go b/libpod/container.go index f36ddbd3f..7be73b3c3 100644 --- a/libpod/container.go +++ b/libpod/container.go @@ -129,7 +129,7 @@ type Container struct { valid bool lock lock.Locker runtime *Runtime - ociRuntime *OCIRuntime + ociRuntime OCIRuntime rootlessSlirpSyncR *os.File rootlessSlirpSyncW *os.File diff --git a/libpod/container_api.go b/libpod/container_api.go index 4f0d5301c..04c796410 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -187,7 +187,7 @@ func (c *Container) StopWithTimeout(timeout uint) error { return define.ErrCtrStopped } - return c.stop(timeout) + return c.stop(timeout, false) } // Kill sends a signal to a container @@ -205,13 +205,15 @@ func (c *Container) Kill(signal uint) error { return errors.Wrapf(define.ErrCtrStateInvalid, "can only kill running containers. %s is in state %s", c.ID(), c.state.State.String()) } - defer c.newContainerEvent(events.Kill) - if err := c.ociRuntime.killContainer(c, signal); err != nil { + // Hardcode all = false, we only use all when removing. + if err := c.ociRuntime.KillContainer(c, signal, false); err != nil { return err } c.state.StoppedByUser = true + c.newContainerEvent(events.Kill) + return c.save() } @@ -221,7 +223,7 @@ func (c *Container) Kill(signal uint) error { // Sometimes, the $RUNTIME exec call errors, and if that is the case, the exit code is the exit code of the call. // Otherwise, the exit code will be the exit code of the executed call inside of the container. 
// TODO investigate allowing exec without attaching -func (c *Container) Exec(tty, privileged bool, env, cmd []string, user, workDir string, streams *AttachStreams, preserveFDs int, resize chan remotecommand.TerminalSize, detachKeys string) (int, error) { +func (c *Container) Exec(tty, privileged bool, env map[string]string, cmd []string, user, workDir string, streams *AttachStreams, preserveFDs uint, resize chan remotecommand.TerminalSize, detachKeys string) (int, error) { var capList []string if !c.batched { c.lock.Lock() @@ -278,7 +280,19 @@ func (c *Container) Exec(tty, privileged bool, env, cmd []string, user, workDir user = c.config.User } - pid, attachChan, err := c.ociRuntime.execContainer(c, cmd, capList, env, tty, workDir, user, sessionID, streams, preserveFDs, resize, detachKeys) + opts := new(ExecOptions) + opts.Cmd = cmd + opts.CapAdd = capList + opts.Env = env + opts.Terminal = tty + opts.Cwd = workDir + opts.User = user + opts.Streams = streams + opts.PreserveFDs = preserveFDs + opts.Resize = resize + opts.DetachKeys = detachKeys + + pid, attachChan, err := c.ociRuntime.ExecContainer(c, sessionID, opts) if err != nil { ec := define.ExecErrorCodeGeneric // Conmon will pass a non-zero exit code from the runtime as a pid here. 
@@ -524,7 +538,10 @@ func (c *Container) WaitWithInterval(waitTimeout time.Duration) (int32, error) { return -1, define.ErrCtrRemoved } - exitFile := c.exitFilePath() + exitFile, err := c.exitFilePath() + if err != nil { + return -1, err + } chWait := make(chan error, 1) defer close(chWait) @@ -639,7 +656,7 @@ func (c *Container) Sync() error { (c.state.State != define.ContainerStateConfigured) && (c.state.State != define.ContainerStateExited) { oldState := c.state.State - if err := c.ociRuntime.updateContainerStatus(c, true); err != nil { + if err := c.ociRuntime.UpdateContainerStatus(c, true); err != nil { return err } // Only save back to DB if state changed @@ -687,7 +704,7 @@ func (c *Container) Refresh(ctx context.Context) error { // Next, if the container is running, stop it if c.state.State == define.ContainerStateRunning { - if err := c.stop(c.config.StopTimeout); err != nil { + if err := c.stop(c.config.StopTimeout, false); err != nil { return err } } @@ -696,8 +713,10 @@ func (c *Container) Refresh(ctx context.Context) error { if len(c.state.ExecSessions) > 0 { logrus.Infof("Killing %d exec sessions in container %s. 
They will not be restored after refresh.", len(c.state.ExecSessions), c.ID()) - if err := c.ociRuntime.execStopContainer(c, c.config.StopTimeout); err != nil { - return err + } + for _, session := range c.state.ExecSessions { + if err := c.ociRuntime.ExecStopContainer(c, session.ID, c.StopTimeout()); err != nil { + return errors.Wrapf(err, "error stopping exec session %s of container %s", session.ID, c.ID()) } } diff --git a/libpod/container_commit.go b/libpod/container_commit.go index 570d406b7..d5afe0da7 100644 --- a/libpod/container_commit.go +++ b/libpod/container_commit.go @@ -50,11 +50,11 @@ func (c *Container) Commit(ctx context.Context, destImage string, options Contai } if c.state.State == define.ContainerStateRunning && options.Pause { - if err := c.ociRuntime.pauseContainer(c); err != nil { + if err := c.pause(); err != nil { return nil, errors.Wrapf(err, "error pausing container %q", c.ID()) } defer func() { - if err := c.ociRuntime.unpauseContainer(c); err != nil { + if err := c.unpause(); err != nil { logrus.Errorf("error unpausing container %q: %v", c.ID(), err) } }() diff --git a/libpod/container_internal.go b/libpod/container_internal.go index ac921d737..e7f541c52 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -131,13 +131,13 @@ func (c *Container) CheckpointPath() string { } // AttachSocketPath retrieves the path of the container's attach socket -func (c *Container) AttachSocketPath() string { - return filepath.Join(c.ociRuntime.socketsDir, c.ID(), "attach") +func (c *Container) AttachSocketPath() (string, error) { + return c.ociRuntime.AttachSocketPath(c) } // exitFilePath gets the path to the container's exit file -func (c *Container) exitFilePath() string { - return filepath.Join(c.ociRuntime.exitsDir, c.ID()) +func (c *Container) exitFilePath() (string, error) { + return c.ociRuntime.ExitFilePath(c) } // create a bundle path and associated files for an exec session @@ -167,12 +167,8 @@ func (c *Container) 
cleanupExecBundle(sessionID string) error { if err := os.RemoveAll(c.execBundlePath(sessionID)); err != nil && !os.IsNotExist(err) { return err } - // Clean up the sockets dir. Issue #3962 - // Also ignore if it doesn't exist for some reason; hence the conditional return below - if err := os.RemoveAll(filepath.Join(c.ociRuntime.socketsDir, sessionID)); err != nil && !os.IsNotExist(err) { - return err - } - return nil + + return c.ociRuntime.ExecContainerCleanup(c, sessionID) } // the path to a containers exec session bundle @@ -191,8 +187,8 @@ func (c *Container) execLogPath(sessionID string) string { } // the socket conmon creates for an exec session -func (c *Container) execAttachSocketPath(sessionID string) string { - return filepath.Join(c.ociRuntime.socketsDir, sessionID, "attach") +func (c *Container) execAttachSocketPath(sessionID string) (string, error) { + return c.ociRuntime.ExecAttachSocketPath(c, sessionID) } // execExitFileDir gets the path to the container's exit file @@ -202,7 +198,7 @@ func (c *Container) execExitFileDir(sessionID string) string { // execOCILog returns the file path for the exec sessions oci log func (c *Container) execOCILog(sessionID string) string { - if !c.ociRuntime.supportsJSON { + if !c.ociRuntime.SupportsJSONErrors() { return "" } return filepath.Join(c.execBundlePath(sessionID), "oci-log") @@ -233,12 +229,15 @@ func (c *Container) readExecExitCode(sessionID string) (int, error) { // Wait for the container's exit file to appear. // When it does, update our state based on it. 
func (c *Container) waitForExitFileAndSync() error { - exitFile := c.exitFilePath() + exitFile, err := c.exitFilePath() + if err != nil { + return err + } chWait := make(chan error) defer close(chWait) - _, err := WaitForFile(exitFile, chWait, time.Second*5) + _, err = WaitForFile(exitFile, chWait, time.Second*5) if err != nil { // Exit file did not appear // Reset our state @@ -253,7 +252,7 @@ func (c *Container) waitForExitFileAndSync() error { return err } - if err := c.ociRuntime.updateContainerStatus(c, false); err != nil { + if err := c.ociRuntime.UpdateContainerStatus(c, false); err != nil { return err } @@ -388,7 +387,7 @@ func (c *Container) syncContainer() error { (c.state.State != define.ContainerStateExited) { oldState := c.state.State // TODO: optionally replace this with a stat for the exit file - if err := c.ociRuntime.updateContainerStatus(c, false); err != nil { + if err := c.ociRuntime.UpdateContainerStatus(c, false); err != nil { return err } // Only save back to DB if state changed @@ -649,7 +648,10 @@ func (c *Container) removeConmonFiles() error { } // Remove the exit file so we don't leak memory in tmpfs - exitFile := filepath.Join(c.ociRuntime.exitsDir, c.ID()) + exitFile, err := c.exitFilePath() + if err != nil { + return err + } if err := os.Remove(exitFile); err != nil && !os.IsNotExist(err) { return errors.Wrapf(err, "error removing container %s exit file", c.ID()) } @@ -938,9 +940,13 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { } // With the spec complete, do an OCI create - if err := c.ociRuntime.createContainer(c, nil); err != nil { + if err := c.ociRuntime.CreateContainer(c, nil); err != nil { + // Fedora 31 is carrying a patch to display improved error + // messages to better handle the V2 transition. This is NOT + // upstream in any OCI runtime. 
+ // TODO: Remove once runc supports cgroupsv2 if strings.Contains(err.Error(), "this version of runc doesn't work on cgroups v2") { - logrus.Errorf("oci runtime %q does not support CGroups V2: use system migrate to mitigate", c.ociRuntime.name) + logrus.Errorf("oci runtime %q does not support CGroups V2: use system migrate to mitigate", c.ociRuntime.Name()) } return err } @@ -1088,7 +1094,7 @@ func (c *Container) start() error { logrus.Debugf("Starting container %s with command %v", c.ID(), c.config.Spec.Process.Args) } - if err := c.ociRuntime.startContainer(c); err != nil { + if err := c.ociRuntime.StartContainer(c); err != nil { return err } logrus.Debugf("Started container %s", c.ID()) @@ -1110,10 +1116,28 @@ func (c *Container) start() error { } // Internal, non-locking function to stop container -func (c *Container) stop(timeout uint) error { +func (c *Container) stop(timeout uint, all bool) error { logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout) - if err := c.ociRuntime.stopContainer(c, timeout); err != nil { + // We can't use --all if CGroups aren't present. + // Rootless containers with CGroups v1 and NoCgroups are both cases + // where this can happen. 
+ if all { + if c.config.NoCgroups { + all = false + } else if rootless.IsRootless() { + // Only do this check if we need to + unified, err := cgroups.IsCgroup2UnifiedMode() + if err != nil { + return err + } + if !unified { + all = false + } + } + } + + if err := c.ociRuntime.StopContainer(c, timeout, all); err != nil { return err } @@ -1150,7 +1174,7 @@ func (c *Container) pause() error { } } - if err := c.ociRuntime.pauseContainer(c); err != nil { + if err := c.ociRuntime.PauseContainer(c); err != nil { return err } @@ -1167,7 +1191,7 @@ func (c *Container) unpause() error { return errors.Wrapf(define.ErrNoCgroups, "cannot unpause without using CGroups") } - if err := c.ociRuntime.unpauseContainer(c); err != nil { + if err := c.ociRuntime.UnpauseContainer(c); err != nil { return err } @@ -1188,7 +1212,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (err e if c.state.State == define.ContainerStateRunning { conmonPID := c.state.ConmonPID - if err := c.stop(timeout); err != nil { + if err := c.stop(timeout, false); err != nil { return err } // Old versions of conmon have a bug where they create the exit file before @@ -1475,7 +1499,7 @@ func (c *Container) delete(ctx context.Context) (err error) { span.SetTag("struct", "container") defer span.Finish() - if err := c.ociRuntime.deleteContainer(c); err != nil { + if err := c.ociRuntime.DeleteContainer(c); err != nil { return errors.Wrapf(err, "error removing container %s from runtime", c.ID()) } diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index 2636fdb6c..a35daf71d 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -659,7 +659,7 @@ func (c *Container) checkpointRestoreSupported() (err error) { if !criu.CheckForCriu() { return errors.Errorf("Checkpoint/Restore requires at least CRIU %d", criu.MinCriuVersion) } - if !c.ociRuntime.featureCheckCheckpointing() { + if !c.ociRuntime.SupportsCheckpoint() { return 
errors.Errorf("Configured runtime does not support checkpoint/restore") } return nil @@ -695,7 +695,7 @@ func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointO return err } - if err := c.ociRuntime.checkpointContainer(c, options); err != nil { + if err := c.ociRuntime.CheckpointContainer(c, options); err != nil { return err } @@ -923,7 +923,7 @@ func (c *Container) restore(ctx context.Context, options ContainerCheckpointOpti } } - if err := c.ociRuntime.createContainer(c, &options); err != nil { + if err := c.ociRuntime.CreateContainer(c, &options); err != nil { return err } diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 0338828e4..68ffc2349 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -141,7 +141,7 @@ func (c *Container) runHealthCheck() (HealthCheckStatus, error) { logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID()) timeStart := time.Now() hcResult := HealthCheckSuccess - _, hcErr := c.Exec(false, false, []string{}, newCommand, "", "", streams, 0, nil, "") + _, hcErr := c.Exec(false, false, map[string]string{}, newCommand, "", "", streams, 0, nil, "") if hcErr != nil { errCause := errors.Cause(hcErr) hcResult = HealthCheckFailure diff --git a/libpod/info.go b/libpod/info.go index 297086ebb..2c28b67c8 100644 --- a/libpod/info.go +++ b/libpod/info.go @@ -15,7 +15,6 @@ import ( "github.com/containers/buildah" "github.com/containers/libpod/pkg/cgroups" "github.com/containers/libpod/pkg/rootless" - "github.com/containers/libpod/utils" "github.com/containers/storage" "github.com/containers/storage/pkg/system" "github.com/pkg/errors" @@ -48,14 +47,7 @@ func (r *Runtime) hostInfo() (map[string]interface{}, error) { info["MemFree"] = mi.MemFree info["SwapTotal"] = mi.SwapTotal info["SwapFree"] = mi.SwapFree - conmonVersion, _ := r.GetConmonVersion() - ociruntimeVersion, _ := r.GetOCIRuntimeVersion() hostDistributionInfo := r.GetHostDistributionInfo() - 
info["Conmon"] = map[string]interface{}{ - "path": r.conmonPath, - "package": r.defaultOCIRuntime.conmonPackage(), - "version": conmonVersion, - } if rootless.IsRootless() { if path, err := exec.LookPath("slirp4netns"); err == nil { logrus.Warnf("Failed to retrieve program version for %s: %v", path, err) @@ -70,11 +62,6 @@ func (r *Runtime) hostInfo() (map[string]interface{}, error) { info["slirp4netns"] = program } } - info["OCIRuntime"] = map[string]interface{}{ - "path": r.defaultOCIRuntime.path, - "package": r.defaultOCIRuntime.pathPackage(), - "version": ociruntimeVersion, - } info["Distribution"] = map[string]interface{}{ "distribution": hostDistributionInfo["Distribution"], "version": hostDistributionInfo["Version"], @@ -86,6 +73,15 @@ func (r *Runtime) hostInfo() (map[string]interface{}, error) { } info["kernel"] = kv + runtimeInfo, err := r.defaultOCIRuntime.RuntimeInfo() + if err != nil { + logrus.Errorf("Error getting info on OCI runtime %s: %v", r.defaultOCIRuntime.Name(), err) + } else { + for k, v := range runtimeInfo { + info[k] = v + } + } + up, err := readUptime() if err != nil { return nil, errors.Wrapf(err, "error reading up time") @@ -215,29 +211,6 @@ func readUptime() (string, error) { return string(f[0]), nil } -// GetConmonVersion returns a string representation of the conmon version -func (r *Runtime) GetConmonVersion() (string, error) { - output, err := utils.ExecCmd(r.conmonPath, "--version") - if err != nil { - return "", err - } - return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil -} - -// GetOCIRuntimePath returns the path to the OCI Runtime Path the runtime is using -func (r *Runtime) GetOCIRuntimePath() string { - return r.defaultOCIRuntime.path -} - -// GetOCIRuntimeVersion returns a string representation of the oci runtimes version -func (r *Runtime) GetOCIRuntimeVersion() (string, error) { - output, err := utils.ExecCmd(r.GetOCIRuntimePath(), "--version") - if err != nil { - return "", err - } - return 
strings.TrimSuffix(output, "\n"), nil -} - // GetHostDistributionInfo returns a map containing the host's distribution and version func (r *Runtime) GetHostDistributionInfo() map[string]string { dist := make(map[string]string) diff --git a/libpod/networking_linux.go b/libpod/networking_linux.go index 61ab57d65..8181cbc8a 100644 --- a/libpod/networking_linux.go +++ b/libpod/networking_linux.go @@ -157,7 +157,7 @@ func (r *Runtime) setupRootlessNetNS(ctr *Container) (err error) { defer errorhandling.CloseQuiet(syncW) havePortMapping := len(ctr.Config().PortMappings) > 0 - apiSocket := filepath.Join(ctr.ociRuntime.tmpDir, fmt.Sprintf("%s.net", ctr.config.ID)) + apiSocket := filepath.Join(ctr.runtime.config.TmpDir, fmt.Sprintf("%s.net", ctr.config.ID)) cmdArgs := []string{} if havePortMapping { diff --git a/libpod/oci.go b/libpod/oci.go index 9879fa90e..37d04349f 100644 --- a/libpod/oci.go +++ b/libpod/oci.go @@ -1,441 +1,132 @@ package libpod import ( - "bytes" - "fmt" - "io/ioutil" - "net" - "os" - "os/exec" - "path/filepath" - "strings" - "time" - - "github.com/containers/libpod/libpod/define" - "github.com/containers/libpod/pkg/util" - "github.com/cri-o/ocicni/pkg/ocicni" - spec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux/label" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" - - // TODO import these functions into libpod and remove the import - // Trying to keep libpod from depending on CRI-O code - "github.com/containers/libpod/utils" -) - -// OCI code is undergoing heavy rewrite - -const ( - // CgroupfsCgroupsManager represents cgroupfs native cgroup manager - CgroupfsCgroupsManager = "cgroupfs" - // SystemdCgroupsManager represents systemd native cgroup manager - SystemdCgroupsManager = "systemd" - - // ContainerCreateTimeout represents the value of container creating timeout - ContainerCreateTimeout = 240 * time.Second - - // Timeout before declaring that runtime has failed to kill a given - // 
container - killContainerTimeout = 5 * time.Second - // DefaultShmSize is the default shm size - DefaultShmSize = 64 * 1024 * 1024 - // NsRunDir is the default directory in which running network namespaces - // are stored - NsRunDir = "/var/run/netns" + "k8s.io/client-go/tools/remotecommand" ) -// OCIRuntime represents an OCI-compatible runtime that libpod can call into -// to perform container operations -type OCIRuntime struct { - name string - path string - conmonPath string - conmonEnv []string - cgroupManager string - tmpDir string - exitsDir string - socketsDir string - logSizeMax int64 - noPivot bool - reservePorts bool - supportsJSON bool - supportsNoCgroups bool - sdNotify bool -} - -// ociError is used to parse the OCI runtime JSON log. It is not part of the -// OCI runtime specifications, it follows what runc does -type ociError struct { - Level string `json:"level,omitempty"` - Time string `json:"time,omitempty"` - Msg string `json:"msg,omitempty"` -} - -// Make a new OCI runtime with provided options. -// The first path that points to a valid executable will be used. -func newOCIRuntime(name string, paths []string, conmonPath string, runtimeCfg *RuntimeConfig, supportsJSON, supportsNoCgroups bool) (*OCIRuntime, error) { - if name == "" { - return nil, errors.Wrapf(define.ErrInvalidArg, "the OCI runtime must be provided a non-empty name") - } - - runtime := new(OCIRuntime) - runtime.name = name - runtime.conmonPath = conmonPath - - runtime.conmonEnv = runtimeCfg.ConmonEnvVars - runtime.cgroupManager = runtimeCfg.CgroupManager - runtime.tmpDir = runtimeCfg.TmpDir - runtime.logSizeMax = runtimeCfg.MaxLogSize - runtime.noPivot = runtimeCfg.NoPivotRoot - runtime.reservePorts = runtimeCfg.EnablePortReservation - runtime.sdNotify = runtimeCfg.SDNotify - - // TODO: probe OCI runtime for feature and enable automatically if - // available. 
- runtime.supportsJSON = supportsJSON - runtime.supportsNoCgroups = supportsNoCgroups - - foundPath := false - for _, path := range paths { - stat, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - continue - } - return nil, errors.Wrapf(err, "cannot stat %s", path) - } - if !stat.Mode().IsRegular() { - continue - } - foundPath = true - runtime.path = path - logrus.Debugf("using runtime %q", path) - break - } - - // Search the $PATH as last fallback - if !foundPath { - if foundRuntime, err := exec.LookPath(name); err == nil { - foundPath = true - runtime.path = foundRuntime - logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime) - } - } - - if !foundPath { - return nil, errors.Wrapf(define.ErrInvalidArg, "no valid executable found for OCI runtime %s", name) - } - - runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits") - runtime.socketsDir = filepath.Join(runtime.tmpDir, "socket") - - if runtime.cgroupManager != CgroupfsCgroupsManager && runtime.cgroupManager != SystemdCgroupsManager { - return nil, errors.Wrapf(define.ErrInvalidArg, "invalid cgroup manager specified: %s", runtime.cgroupManager) - } - - // Create the exit files and attach sockets directories - if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil { - // The directory is allowed to exist - if !os.IsExist(err) { - return nil, errors.Wrapf(err, "error creating OCI runtime exit files directory %s", - runtime.exitsDir) - } - } - if err := os.MkdirAll(runtime.socketsDir, 0750); err != nil { - // The directory is allowed to exist - if !os.IsExist(err) { - return nil, errors.Wrapf(err, "error creating OCI runtime attach sockets directory %s", - runtime.socketsDir) - } - } - - return runtime, nil -} - -// Create systemd unit name for cgroup scopes -func createUnitName(prefix string, name string) string { - return fmt.Sprintf("%s-%s.scope", prefix, name) -} - -func bindPorts(ports []ocicni.PortMapping) ([]*os.File, error) { - var files []*os.File - notifySCTP := false - 
for _, i := range ports { - switch i.Protocol { - case "udp": - addr, err := net.ResolveUDPAddr("udp", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort)) - if err != nil { - return nil, errors.Wrapf(err, "cannot resolve the UDP address") - } - - server, err := net.ListenUDP("udp", addr) - if err != nil { - return nil, errors.Wrapf(err, "cannot listen on the UDP port") - } - f, err := server.File() - if err != nil { - return nil, errors.Wrapf(err, "cannot get file for UDP socket") - } - files = append(files, f) - - case "tcp": - addr, err := net.ResolveTCPAddr("tcp4", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort)) - if err != nil { - return nil, errors.Wrapf(err, "cannot resolve the TCP address") - } - - server, err := net.ListenTCP("tcp4", addr) - if err != nil { - return nil, errors.Wrapf(err, "cannot listen on the TCP port") - } - f, err := server.File() - if err != nil { - return nil, errors.Wrapf(err, "cannot get file for TCP socket") - } - files = append(files, f) - case "sctp": - if !notifySCTP { - notifySCTP = true - logrus.Warnf("port reservation for SCTP is not supported") - } - default: - return nil, fmt.Errorf("unknown protocol %s", i.Protocol) - - } - } - return files, nil -} - -// updateContainerStatus retrieves the current status of the container from the -// runtime. It updates the container's state but does not save it. -// If useRunc is false, we will not directly hit runc to see the container's -// status, but will instead only check for the existence of the conmon exit file -// and update state to stopped if it exists. -func (r *OCIRuntime) updateContainerStatus(ctr *Container, useRuntime bool) error { - exitFile := ctr.exitFilePath() - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - - // If not using the OCI runtime, we don't need to do most of this. - if !useRuntime { - // If the container's not running, nothing to do. 
- if ctr.state.State != define.ContainerStateRunning && ctr.state.State != define.ContainerStatePaused { - return nil - } - - // Check for the exit file conmon makes - info, err := os.Stat(exitFile) - if err != nil { - if os.IsNotExist(err) { - // Container is still running, no error - return nil - } - - return errors.Wrapf(err, "error running stat on container %s exit file", ctr.ID()) - } - - // Alright, it exists. Transition to Stopped state. - ctr.state.State = define.ContainerStateStopped - ctr.state.PID = 0 - ctr.state.ConmonPID = 0 - - // Read the exit file to get our stopped time and exit code. - return ctr.handleExitFile(exitFile, info) - } - - // Store old state so we know if we were already stopped - oldState := ctr.state.State - - state := new(spec.State) - - cmd := exec.Command(r.path, "state", ctr.ID()) - cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) - - outPipe, err := cmd.StdoutPipe() - if err != nil { - return errors.Wrapf(err, "getting stdout pipe") - } - errPipe, err := cmd.StderrPipe() - if err != nil { - return errors.Wrapf(err, "getting stderr pipe") - } - - if err := cmd.Start(); err != nil { - out, err2 := ioutil.ReadAll(errPipe) - if err2 != nil { - return errors.Wrapf(err, "error getting container %s state", ctr.ID()) - } - if strings.Contains(string(out), "does not exist") { - if err := ctr.removeConmonFiles(); err != nil { - logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) - } - ctr.state.ExitCode = -1 - ctr.state.FinishedTime = time.Now() - ctr.state.State = define.ContainerStateExited - return nil - } - return errors.Wrapf(err, "error getting container %s state. 
stderr/out: %s", ctr.ID(), out) - } - defer func() { - _ = cmd.Wait() - }() - - if err := errPipe.Close(); err != nil { - return err - } - out, err := ioutil.ReadAll(outPipe) - if err != nil { - return errors.Wrapf(err, "error reading stdout: %s", ctr.ID()) - } - if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil { - return errors.Wrapf(err, "error decoding container status for container %s", ctr.ID()) - } - ctr.state.PID = state.Pid - - switch state.Status { - case "created": - ctr.state.State = define.ContainerStateCreated - case "paused": - ctr.state.State = define.ContainerStatePaused - case "running": - ctr.state.State = define.ContainerStateRunning - case "stopped": - ctr.state.State = define.ContainerStateStopped - default: - return errors.Wrapf(define.ErrInternal, "unrecognized status returned by runtime for container %s: %s", - ctr.ID(), state.Status) - } - - // Only grab exit status if we were not already stopped - // If we were, it should already be in the database - if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { - var fi os.FileInfo - chWait := make(chan error) - defer close(chWait) - - _, err := WaitForFile(exitFile, chWait, time.Second*5) - if err == nil { - fi, err = os.Stat(exitFile) - } - if err != nil { - ctr.state.ExitCode = -1 - ctr.state.FinishedTime = time.Now() - logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err) - return nil - } - - return ctr.handleExitFile(exitFile, fi) - } - - return nil -} - -// startContainer starts the given container -// Sets time the container was started, but does not save it. -func (r *OCIRuntime) startContainer(ctr *Container) error { - // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers? 
- runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok { - env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify)) - } - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "start", ctr.ID()); err != nil { - return err - } - - ctr.state.StartedTime = time.Now() - - return nil -} - -// killContainer sends the given signal to the given container -func (r *OCIRuntime) killContainer(ctr *Container, signal uint) error { - logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", ctr.ID(), fmt.Sprintf("%d", signal)); err != nil { - return errors.Wrapf(err, "error sending signal to container %s", ctr.ID()) - } - - return nil -} - -// deleteContainer deletes a container from the OCI runtime -func (r *OCIRuntime) deleteContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "delete", "--force", ctr.ID()) -} - -// pauseContainer pauses the given container -func (r *OCIRuntime) pauseContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "pause", ctr.ID()) -} - -// unpauseContainer unpauses the given container -func (r *OCIRuntime) unpauseContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := 
[]string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "resume", ctr.ID()) -} - -// checkpointContainer checkpoints the given container -func (r *OCIRuntime) checkpointContainer(ctr *Container, options ContainerCheckpointOptions) error { - if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { - return err - } - // imagePath is used by CRIU to store the actual checkpoint files - imagePath := ctr.CheckpointPath() - // workPath will be used to store dump.log and stats-dump - workPath := ctr.bundlePath() - logrus.Debugf("Writing checkpoint to %s", imagePath) - logrus.Debugf("Writing checkpoint logs to %s", workPath) - args := []string{} - args = append(args, "checkpoint") - args = append(args, "--image-path") - args = append(args, imagePath) - args = append(args, "--work-path") - args = append(args, workPath) - if options.KeepRunning { - args = append(args, "--leave-running") - } - if options.TCPEstablished { - args = append(args, "--tcp-established") - } - args = append(args, ctr.ID()) - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...) +// OCIRuntime is an implementation of an OCI runtime. +// The OCI runtime implementation is expected to be a fairly thin wrapper around +// the actual runtime, and is not expected to include things like state +// management logic - e.g., we do not expect it to determine on its own that +// calling 'UnpauseContainer()' on a container that is not paused is an error. +// The code calling the OCIRuntime will manage this. +// TODO: May want to move the Attach() code under this umbrella. It's highly OCI +// runtime dependent. +// TODO: May want to move the conmon cleanup code here too - it depends on +// Conmon being in use. +type OCIRuntime interface { + // Name returns the name of the runtime. + Name() string + // Path returns the path to the runtime executable. 
+ Path() string + + // CreateContainer creates the container in the OCI runtime. + CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) error + // UpdateContainerStatus updates the status of the given container. + // It includes a switch for whether to perform a hard query of the + // runtime. If unset, the exit file (if supported by the implementation) + // will be used. + UpdateContainerStatus(ctr *Container, useRuntime bool) error + // StartContainer starts the given container. + StartContainer(ctr *Container) error + // KillContainer sends the given signal to the given container. + // If all is set, all processes in the container will be signalled; + // otherwise, only init will be signalled. + KillContainer(ctr *Container, signal uint, all bool) error + // StopContainer stops the given container. + // The container's stop signal (or SIGTERM if unspecified) will be sent + // first. + // After the given timeout, SIGKILL will be sent. + // If the given timeout is 0, SIGKILL will be sent immediately, and the + // stop signal will be omitted. + // If all is set, we will attempt to use the --all flag with `kill` in + // the OCI runtime to kill all processes in the container, including + // exec sessions. This is only supported if the container has cgroups. + StopContainer(ctr *Container, timeout uint, all bool) error + // DeleteContainer deletes the given container from the OCI runtime. + DeleteContainer(ctr *Container) error + // PauseContainer pauses the given container. + PauseContainer(ctr *Container) error + // UnpauseContainer unpauses the given container. + UnpauseContainer(ctr *Container) error + + // ExecContainer executes a command in a running container. + // Returns an int (PID of the exec session), error channel (errors from attach), and + // error (errors that occurred attempting to start the exec session).
+ ExecContainer(ctr *Container, sessionID string, options *ExecOptions) (int, chan error, error) + // ExecStopContainer stops a given exec session in a running container. + // SIGTERM will be sent initially, then SIGKILL after the given timeout. + // If timeout is 0, SIGKILL will be sent immediately, and SIGTERM will + // be omitted. + ExecStopContainer(ctr *Container, sessionID string, timeout uint) error + // ExecContainerCleanup cleans up after an exec session exits. + // It removes any files left by the exec session that are no longer + // needed, including the attach socket. + ExecContainerCleanup(ctr *Container, sessionID string) error + + // CheckpointContainer checkpoints the given container. + // Some OCI runtimes may not support this - if SupportsCheckpoint() + // returns false, this is not implemented, and will always return an + // error. + CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error + + // SupportsCheckpoint returns whether this OCI runtime + // implementation supports the CheckpointContainer() operation. + SupportsCheckpoint() bool + // SupportsJSONErrors is whether the runtime can return JSON-formatted + // error messages. + SupportsJSONErrors() bool + // SupportsNoCgroups is whether the runtime supports running containers + // without cgroups. + SupportsNoCgroups() bool + + // AttachSocketPath is the path to the socket to attach to a given + // container. + // TODO: If we move Attach code in here, this should be made internal. + // We don't want to force all runtimes to share the same attach + // implementation. + AttachSocketPath(ctr *Container) (string, error) + // ExecAttachSocketPath is the path to the socket to attach to a given + // exec session in the given container. + // TODO: Probably should be made internal. + ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) + // ExitFilePath is the path to a container's exit file.
+ // All runtime implementations must create an exit file when containers + // exit, containing the exit code of the container (as a string). + // This is the path to that file for a given container. + ExitFilePath(ctr *Container) (string, error) + + // RuntimeInfo returns verbose information about the runtime. + RuntimeInfo() (map[string]interface{}, error) } -func (r *OCIRuntime) featureCheckCheckpointing() bool { - // Check if the runtime implements checkpointing. Currently only - // runc's checkpoint/restore implementation is supported. - cmd := exec.Command(r.path, "checkpoint", "-h") - if err := cmd.Start(); err != nil { - return false - } - if err := cmd.Wait(); err == nil { - return true - } - return false +// ExecOptions are options passed into ExecContainer. They control the command +// that will be executed and how the exec will proceed. +type ExecOptions struct { + // Cmd is the command to execute. + Cmd []string + // CapAdd is a set of capabilities to add to the executed command. + CapAdd []string + // Env is a set of environment variables to add to the container. + Env map[string]string + // Terminal is whether to create a new TTY for the exec session. + Terminal bool + // Cwd is the working directory for the executed command. If unset, the + // working directory of the container will be used. + Cwd string + // User is the user the command will be executed as. If unset, the user + // the container was run as will be used. + User string + // Streams are the streams that will be attached to the container. + Streams *AttachStreams + // PreserveFDs is a number of additional file descriptors (in addition + // to 0, 1, 2) that will be passed to the executed process. The total FDs + // passed will be 3 + PreserveFDs. + PreserveFDs uint + // Resize is a channel where terminal resize events are sent to be + // handled. + Resize chan remotecommand.TerminalSize + // DetachKeys is a set of keys that, when pressed in sequence, will + // detach from the container. 
+ DetachKeys string } diff --git a/libpod/oci_attach_linux.go b/libpod/oci_attach_linux.go index 6cada0801..a383f6eab 100644 --- a/libpod/oci_attach_linux.go +++ b/libpod/oci_attach_linux.go @@ -47,7 +47,11 @@ func (c *Container) attach(streams *AttachStreams, keys string, resize <-chan re registerResizeFunc(resize, c.bundlePath()) - socketPath := buildSocketPath(c.AttachSocketPath()) + attachSock, err := c.AttachSocketPath() + if err != nil { + return err + } + socketPath := buildSocketPath(attachSock) conn, err := net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: socketPath, Net: "unixpacket"}) if err != nil { @@ -108,7 +112,11 @@ func (c *Container) attachToExec(streams *AttachStreams, keys string, resize <-c logrus.Debugf("Attaching to container %s exec session %s", c.ID(), sessionID) // set up the socket path, such that it is the correct length and location for exec - socketPath := buildSocketPath(c.execAttachSocketPath(sessionID)) + sockPath, err := c.execAttachSocketPath(sessionID) + if err != nil { + return err + } + socketPath := buildSocketPath(sockPath) // 2: read from attachFd that the parent process has set up the console socket if _, err := readConmonPipeData(attachFd, ""); err != nil { diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go new file mode 100644 index 000000000..f29758a69 --- /dev/null +++ b/libpod/oci_conmon_linux.go @@ -0,0 +1,1421 @@ +// +build linux + +package libpod + +import ( + "bufio" + "bytes" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + "syscall" + "time" + + "github.com/containers/libpod/libpod/define" + "github.com/containers/libpod/pkg/cgroups" + "github.com/containers/libpod/pkg/errorhandling" + "github.com/containers/libpod/pkg/lookup" + "github.com/containers/libpod/pkg/rootless" + "github.com/containers/libpod/pkg/util" + "github.com/containers/libpod/utils" + pmount "github.com/containers/storage/pkg/mount" + "github.com/coreos/go-systemd/activation" 
+ spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// ConmonOCIRuntime is an OCI runtime managed by Conmon. +// TODO: Make all calls to OCI runtime have a timeout. +type ConmonOCIRuntime struct { + name string + path string + conmonPath string + conmonEnv []string + cgroupManager string + tmpDir string + exitsDir string + socketsDir string + logSizeMax int64 + noPivot bool + reservePorts bool + supportsJSON bool + supportsNoCgroups bool + sdNotify bool +} + +// Make a new Conmon-based OCI runtime with the given options. +// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or +// any runtime with a runc-compatible CLI. +// The first path that points to a valid executable will be used. +// Deliberately private. Someone should not be able to construct this outside of +// libpod. +func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeCfg *RuntimeConfig, supportsJSON, supportsNoCgroups bool) (OCIRuntime, error) { + if name == "" { + return nil, errors.Wrapf(define.ErrInvalidArg, "the OCI runtime must be provided a non-empty name") + } + + runtime := new(ConmonOCIRuntime) + runtime.name = name + runtime.conmonPath = conmonPath + + runtime.conmonEnv = runtimeCfg.ConmonEnvVars + runtime.cgroupManager = runtimeCfg.CgroupManager + runtime.tmpDir = runtimeCfg.TmpDir + runtime.logSizeMax = runtimeCfg.MaxLogSize + runtime.noPivot = runtimeCfg.NoPivotRoot + runtime.reservePorts = runtimeCfg.EnablePortReservation + runtime.sdNotify = runtimeCfg.SDNotify + + // TODO: probe OCI runtime for feature and enable automatically if + // available. 
+ runtime.supportsJSON = supportsJSON + runtime.supportsNoCgroups = supportsNoCgroups + + foundPath := false + for _, path := range paths { + stat, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + continue + } + return nil, errors.Wrapf(err, "cannot stat %s", path) + } + if !stat.Mode().IsRegular() { + continue + } + foundPath = true + runtime.path = path + logrus.Debugf("using runtime %q", path) + break + } + + // Search the $PATH as last fallback + if !foundPath { + if foundRuntime, err := exec.LookPath(name); err == nil { + foundPath = true + runtime.path = foundRuntime + logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime) + } + } + + if !foundPath { + return nil, errors.Wrapf(define.ErrInvalidArg, "no valid executable found for OCI runtime %s", name) + } + + runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits") + runtime.socketsDir = filepath.Join(runtime.tmpDir, "socket") + + if runtime.cgroupManager != CgroupfsCgroupsManager && runtime.cgroupManager != SystemdCgroupsManager { + return nil, errors.Wrapf(define.ErrInvalidArg, "invalid cgroup manager specified: %s", runtime.cgroupManager) + } + + // Create the exit files and attach sockets directories + if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil { + // The directory is allowed to exist + if !os.IsExist(err) { + return nil, errors.Wrapf(err, "error creating OCI runtime exit files directory %s", + runtime.exitsDir) + } + } + if err := os.MkdirAll(runtime.socketsDir, 0750); err != nil { + // The directory is allowed to exist + if !os.IsExist(err) { + return nil, errors.Wrapf(err, "error creating OCI runtime attach sockets directory %s", + runtime.socketsDir) + } + } + + return runtime, nil +} + +// Name returns the name of the runtime being wrapped by Conmon. +func (r *ConmonOCIRuntime) Name() string { + return r.name +} + +// Path returns the path of the OCI runtime being wrapped by Conmon. 
+func (r *ConmonOCIRuntime) Path() string { + return r.path +} + +// CreateContainer creates a container. +func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) { + if len(ctr.config.IDMappings.UIDMap) != 0 || len(ctr.config.IDMappings.GIDMap) != 0 { + for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.VolumePath} { + if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { + return err + } + } + + // if we are running a non privileged container, be sure to umount some kernel paths so they are not + // bind mounted inside the container at all. + if !ctr.config.Privileged && !rootless.IsRootless() { + ch := make(chan error) + go func() { + runtime.LockOSThread() + err := func() error { + fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) + if err != nil { + return err + } + defer errorhandling.CloseQuiet(fd) + + // create a new mountns on the current thread + if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { + return err + } + defer func() { + if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { + logrus.Errorf("unable to clone new namespace: %q", err) + } + }() + + // don't spread our mounts around. We are setting only /sys to be slave + // so that the cleanup process is still able to umount the storage and the + // changes are propagated to the host. 
+ err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") + if err != nil { + return errors.Wrapf(err, "cannot make /sys slave") + } + + mounts, err := pmount.GetMounts() + if err != nil { + return err + } + for _, m := range mounts { + if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { + continue + } + err = unix.Unmount(m.Mountpoint, 0) + if err != nil && !os.IsNotExist(err) { + return errors.Wrapf(err, "cannot unmount %s", m.Mountpoint) + } + } + return r.createOCIContainer(ctr, restoreOptions) + }() + ch <- err + }() + err := <-ch + return err + } + } + return r.createOCIContainer(ctr, restoreOptions) +} + +// UpdateContainerStatus retrieves the current status of the container from the +// runtime. It updates the container's state but does not save it. +// If useRuntime is false, we will not directly hit runc to see the container's +// status, but will instead only check for the existence of the conmon exit file +// and update state to stopped if it exists. +func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container, useRuntime bool) error { + exitFile, err := ctr.exitFilePath() + if err != nil { + return err + } + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + + // If not using the OCI runtime, we don't need to do most of this. + if !useRuntime { + // If the container's not running, nothing to do. + if ctr.state.State != define.ContainerStateRunning && ctr.state.State != define.ContainerStatePaused { + return nil + } + + // Check for the exit file conmon makes + info, err := os.Stat(exitFile) + if err != nil { + if os.IsNotExist(err) { + // Container is still running, no error + return nil + } + + return errors.Wrapf(err, "error running stat on container %s exit file", ctr.ID()) + } + + // Alright, it exists. Transition to Stopped state. + ctr.state.State = define.ContainerStateStopped + ctr.state.PID = 0 + ctr.state.ConmonPID = 0 + + // Read the exit file to get our stopped time and exit code. 
+ return ctr.handleExitFile(exitFile, info) + } + + // Store old state so we know if we were already stopped + oldState := ctr.state.State + + state := new(spec.State) + + cmd := exec.Command(r.path, "state", ctr.ID()) + cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) + + outPipe, err := cmd.StdoutPipe() + if err != nil { + return errors.Wrapf(err, "getting stdout pipe") + } + errPipe, err := cmd.StderrPipe() + if err != nil { + return errors.Wrapf(err, "getting stderr pipe") + } + + if err := cmd.Start(); err != nil { + out, err2 := ioutil.ReadAll(errPipe) + if err2 != nil { + return errors.Wrapf(err, "error getting container %s state", ctr.ID()) + } + if strings.Contains(string(out), "does not exist") { + if err := ctr.removeConmonFiles(); err != nil { + logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) + } + ctr.state.ExitCode = -1 + ctr.state.FinishedTime = time.Now() + ctr.state.State = define.ContainerStateExited + return nil + } + return errors.Wrapf(err, "error getting container %s state. 
stderr/out: %s", ctr.ID(), out) + } + defer func() { + _ = cmd.Wait() + }() + + if err := errPipe.Close(); err != nil { + return err + } + out, err := ioutil.ReadAll(outPipe) + if err != nil { + return errors.Wrapf(err, "error reading stdout: %s", ctr.ID()) + } + if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil { + return errors.Wrapf(err, "error decoding container status for container %s", ctr.ID()) + } + ctr.state.PID = state.Pid + + switch state.Status { + case "created": + ctr.state.State = define.ContainerStateCreated + case "paused": + ctr.state.State = define.ContainerStatePaused + case "running": + ctr.state.State = define.ContainerStateRunning + case "stopped": + ctr.state.State = define.ContainerStateStopped + default: + return errors.Wrapf(define.ErrInternal, "unrecognized status returned by runtime for container %s: %s", + ctr.ID(), state.Status) + } + + // Only grab exit status if we were not already stopped + // If we were, it should already be in the database + if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { + var fi os.FileInfo + chWait := make(chan error) + defer close(chWait) + + _, err := WaitForFile(exitFile, chWait, time.Second*5) + if err == nil { + fi, err = os.Stat(exitFile) + } + if err != nil { + ctr.state.ExitCode = -1 + ctr.state.FinishedTime = time.Now() + logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err) + return nil + } + + return ctr.handleExitFile(exitFile, fi) + } + + return nil +} + +// StartContainer starts the given container. +// Sets time the container was started, but does not save it. +func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { + // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers? 
+ runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok { + env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify)) + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "start", ctr.ID()); err != nil { + return err + } + + ctr.state.StartedTime = time.Now() + + return nil +} + +// KillContainer sends the given signal to the given container. +// If all is set, send to all PIDs in the container. +// All is only supported if the container created cgroups. +func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error { + logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + var args []string + if all { + args = []string{"kill", "--all", ctr.ID(), fmt.Sprintf("%d", signal)} + } else { + args = []string{"kill", ctr.ID(), fmt.Sprintf("%d", signal)} + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { + return errors.Wrapf(err, "error sending signal to container %s", ctr.ID()) + } + + return nil +} + +// StopContainer stops a container, first using its given stop signal (or +// SIGTERM if no signal was specified), then using SIGKILL. +// Timeout is given in seconds. If timeout is 0, the container will be +// immediately kill with SIGKILL. +// Does not set finished time for container, assumes you will run updateStatus +// after to pull the exit code. 
+func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error { + logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) + + // Ping the container to see if it's alive + // If it's not, it's already stopped, return + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + stopSignal := ctr.config.StopSignal + if stopSignal == 0 { + stopSignal = uint(syscall.SIGTERM) + } + + if timeout > 0 { + if err := r.KillContainer(ctr, stopSignal, all); err != nil { + // Is the container gone? + // If so, it probably died between the first check and + // our sending the signal + // The container is stopped, so exit cleanly + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + return err + } + + if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { + logrus.Warnf("Timed out stopping container %s, resorting to SIGKILL", ctr.ID()) + } else { + // No error, the container is dead + return nil + } + } + + if err := r.KillContainer(ctr, 9, all); err != nil { + // Again, check if the container is gone. If it is, exit cleanly. + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + return errors.Wrapf(err, "error sending SIGKILL to container %s", ctr.ID()) + } + + // Give runtime a few seconds to make it happen + if err := waitContainerStop(ctr, killContainerTimeout); err != nil { + return err + } + + return nil +} + +// DeleteContainer deletes a container from the OCI runtime. +func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "delete", "--force", ctr.ID()) +} + +// PauseContainer pauses the given container. 
+func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "pause", ctr.ID()) +} + +// UnpauseContainer unpauses the given container. +func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "resume", ctr.ID()) +} + +// ExecContainer executes a command in a running container +// TODO: Split into Create/Start/Attach/Wait +func (r *ConmonOCIRuntime) ExecContainer(c *Container, sessionID string, options *ExecOptions) (int, chan error, error) { + if options == nil { + return -1, nil, errors.Wrapf(define.ErrInvalidArg, "must provide an ExecOptions struct to ExecContainer") + } + if len(options.Cmd) == 0 { + return -1, nil, errors.Wrapf(define.ErrInvalidArg, "must provide a command to execute") + } + + if sessionID == "" { + return -1, nil, errors.Wrapf(define.ErrEmptyID, "must provide a session ID for exec") + } + + // create sync pipe to receive the pid + parentSyncPipe, childSyncPipe, err := newPipe() + if err != nil { + return -1, nil, errors.Wrapf(err, "error creating socket pair") + } + + defer errorhandling.CloseQuiet(parentSyncPipe) + + // create start pipe to set the cgroup before running + // attachToExec is responsible for closing parentStartPipe + childStartPipe, parentStartPipe, err := newPipe() + if err != nil { + return -1, nil, errors.Wrapf(err, "error creating socket pair") + } + + // We want to make sure we close the parent{Start,Attach}Pipes if we fail + // but also don't want to close them after attach to exec is called + attachToExecCalled := false + + defer func() { + if !attachToExecCalled 
{ + errorhandling.CloseQuiet(parentStartPipe) + } + }() + + // create the attach pipe to allow attach socket to be created before + // $RUNTIME exec starts running. This is to make sure we can capture all output + // from the process through that socket, rather than half reading the log, half attaching to the socket + // attachToExec is responsible for closing parentAttachPipe + parentAttachPipe, childAttachPipe, err := newPipe() + if err != nil { + return -1, nil, errors.Wrapf(err, "error creating socket pair") + } + + defer func() { + if !attachToExecCalled { + errorhandling.CloseQuiet(parentAttachPipe) + } + }() + + childrenClosed := false + defer func() { + if !childrenClosed { + errorhandling.CloseQuiet(childSyncPipe) + errorhandling.CloseQuiet(childAttachPipe) + errorhandling.CloseQuiet(childStartPipe) + } + }() + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return -1, nil, err + } + + finalEnv := make([]string, 0, len(options.Env)) + for k, v := range options.Env { + finalEnv = append(finalEnv, fmt.Sprintf("%s=%s", k, v)) + } + + processFile, err := prepareProcessExec(c, options.Cmd, finalEnv, options.Terminal, options.Cwd, options.User, sessionID) + if err != nil { + return -1, nil, err + } + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = c.execOCILog(sessionID) + } + args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog) + + if options.PreserveFDs > 0 { + args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", options.PreserveFDs))...) + } + + for _, capability := range options.CapAdd { + args = append(args, formatRuntimeOpts("--cap", capability)...) 
+ } + + if options.Terminal { + args = append(args, "-t") + } + + // Append container ID and command + args = append(args, "-e") + // TODO make this optional when we can detach + args = append(args, "--exec-attach") + args = append(args, "--exec-process-spec", processFile.Name()) + + logrus.WithFields(logrus.Fields{ + "args": args, + }).Debugf("running conmon: %s", r.conmonPath) + execCmd := exec.Command(r.conmonPath, args...) + + if options.Streams != nil { + if options.Streams.AttachInput { + execCmd.Stdin = options.Streams.InputStream + } + if options.Streams.AttachOutput { + execCmd.Stdout = options.Streams.OutputStream + } + if options.Streams.AttachError { + execCmd.Stderr = options.Streams.ErrorStream + } + } + + conmonEnv, extraFiles, err := r.configureConmonEnv(runtimeDir) + if err != nil { + return -1, nil, err + } + + if options.PreserveFDs > 0 { + for fd := 3; fd < int(3+options.PreserveFDs); fd++ { + execCmd.ExtraFiles = append(execCmd.ExtraFiles, os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd))) + } + } + + // we don't want to step on users fds they asked to preserve + // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 + execCmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", options.PreserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", options.PreserveFDs+4), fmt.Sprintf("_OCI_ATTACHPIPE=%d", options.PreserveFDs+5)) + execCmd.Env = append(execCmd.Env, conmonEnv...) + + execCmd.ExtraFiles = append(execCmd.ExtraFiles, childSyncPipe, childStartPipe, childAttachPipe) + execCmd.ExtraFiles = append(execCmd.ExtraFiles, extraFiles...) 
+ execCmd.Dir = c.execBundlePath(sessionID) + execCmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + + err = startCommandGivenSelinux(execCmd) + + // We don't need children pipes on the parent side + errorhandling.CloseQuiet(childSyncPipe) + errorhandling.CloseQuiet(childAttachPipe) + errorhandling.CloseQuiet(childStartPipe) + childrenClosed = true + + if err != nil { + return -1, nil, errors.Wrapf(err, "cannot start container %s", c.ID()) + } + if err := r.moveConmonToCgroupAndSignal(c, execCmd, parentStartPipe, sessionID); err != nil { + return -1, nil, err + } + + if options.PreserveFDs > 0 { + for fd := 3; fd < int(3+options.PreserveFDs); fd++ { + // These fds were passed down to the runtime. Close them + // and not interfere + if err := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)).Close(); err != nil { + logrus.Debugf("unable to close file fd-%d", fd) + } + } + } + + // TODO Only create if !detach + // Attach to the container before starting it + attachChan := make(chan error) + go func() { + // attachToExec is responsible for closing pipes + attachChan <- c.attachToExec(options.Streams, options.DetachKeys, options.Resize, sessionID, parentStartPipe, parentAttachPipe) + close(attachChan) + }() + attachToExecCalled = true + + pid, err := readConmonPipeData(parentSyncPipe, ociLog) + + return pid, attachChan, err +} + +// ExecStopContainer stops a given exec session in a running container. +func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, timeout uint) error { + session, ok := ctr.state.ExecSessions[sessionID] + if !ok { + // TODO This should probably be a separate error + return errors.Wrapf(define.ErrInvalidArg, "no exec session with ID %s found in container %s", sessionID, ctr.ID()) + } + + logrus.Debugf("Going to stop container %s exec session %s", ctr.ID(), sessionID) + + // Is the session dead? + // Ping the PID with signal 0 to see if it still exists. 
+ if err := unix.Kill(session.PID, 0); err != nil { + if err == unix.ESRCH { + return nil + } + return errors.Wrapf(err, "error pinging container %s exec session %s PID %d with signal 0", ctr.ID(), sessionID, session.PID) + } + + if timeout > 0 { + // Use SIGTERM by default, then SIGKILL after timeout. + logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGTERM", sessionID, session.PID, ctr.ID()) + if err := unix.Kill(session.PID, unix.SIGTERM); err != nil { + if err == unix.ESRCH { + return nil + } + return errors.Wrapf(err, "error killing container %s exec session %s PID %d with SIGTERM", ctr.ID(), sessionID, session.PID) + } + + // Wait for the PID to stop + if err := waitPidStop(session.PID, time.Duration(timeout)*time.Second); err != nil { + logrus.Warnf("Timed out waiting for container %s exec session %s to stop, resorting to SIGKILL", ctr.ID(), sessionID) + } else { + // No error, container is dead + return nil + } + } + + // SIGTERM did not work. On to SIGKILL. + logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, session.PID, ctr.ID()) + if err := unix.Kill(session.PID, unix.SIGTERM); err != nil { + if err == unix.ESRCH { + return nil + } + return errors.Wrapf(err, "error killing container %s exec session %s PID %d with SIGKILL", ctr.ID(), sessionID, session.PID) + } + + // Wait for the PID to stop + if err := waitPidStop(session.PID, killContainerTimeout*time.Second); err != nil { + return errors.Wrapf(err, "timed out waiting for container %s exec session %s PID %d to stop after SIGKILL", ctr.ID(), sessionID, session.PID) + } + + return nil +} + +// ExecContainerCleanup cleans up files created when a command is run via +// ExecContainer. This includes the attach socket for the exec session. +func (r *ConmonOCIRuntime) ExecContainerCleanup(ctr *Container, sessionID string) error { + // Clean up the sockets dir.
Issue #3962 + // Also ignore if it doesn't exist for some reason; hence the conditional return below + if err := os.RemoveAll(filepath.Join(r.socketsDir, sessionID)); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + +// CheckpointContainer checkpoints the given container. +func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error { + if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { + return err + } + // imagePath is used by CRIU to store the actual checkpoint files + imagePath := ctr.CheckpointPath() + // workPath will be used to store dump.log and stats-dump + workPath := ctr.bundlePath() + logrus.Debugf("Writing checkpoint to %s", imagePath) + logrus.Debugf("Writing checkpoint logs to %s", workPath) + args := []string{} + args = append(args, "checkpoint") + args = append(args, "--image-path") + args = append(args, imagePath) + args = append(args, "--work-path") + args = append(args, workPath) + if options.KeepRunning { + args = append(args, "--leave-running") + } + if options.TCPEstablished { + args = append(args, "--tcp-established") + } + args = append(args, ctr.ID()) + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...) +} + +// SupportsCheckpoint checks if the OCI runtime supports checkpointing +// containers. +func (r *ConmonOCIRuntime) SupportsCheckpoint() bool { + // Check if the runtime implements checkpointing. Currently only + // runc's checkpoint/restore implementation is supported. + cmd := exec.Command(r.path, "checkpoint", "-h") + if err := cmd.Start(); err != nil { + return false + } + if err := cmd.Wait(); err == nil { + return true + } + return false +} + +// SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error +// messages. 
+func (r *ConmonOCIRuntime) SupportsJSONErrors() bool { + return r.supportsJSON +} + +// SupportsNoCgroups checks if the OCI runtime supports running containers +// without cgroups (the --cgroup-manager=disabled flag). +func (r *ConmonOCIRuntime) SupportsNoCgroups() bool { + return r.supportsNoCgroups +} + +// AttachSocketPath is the path to a single container's attach socket. +func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) { + if ctr == nil { + return "", errors.Wrapf(define.ErrInvalidArg, "must provide a valid container to get attach socket path") + } + + return filepath.Join(r.socketsDir, ctr.ID(), "attach"), nil +} + +// ExecAttachSocketPath is the path to a container's exec session attach socket. +func (r *ConmonOCIRuntime) ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) { + // We don't even use container, so don't validity check it + if sessionID == "" { + return "", errors.Wrapf(define.ErrInvalidArg, "must provide a valid session ID to get attach socket path") + } + + return filepath.Join(r.socketsDir, sessionID, "attach"), nil +} + +// ExitFilePath is the path to a container's exit file. +func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) { + if ctr == nil { + return "", errors.Wrapf(define.ErrInvalidArg, "must provide a valid container to get exit file path") + } + return filepath.Join(r.exitsDir, ctr.ID()), nil +} + +// RuntimeInfo provides information on the runtime. 
+func (r *ConmonOCIRuntime) RuntimeInfo() (map[string]interface{}, error) { + runtimePackage := packageVersion(r.path) + conmonPackage := packageVersion(r.conmonPath) + runtimeVersion, err := r.getOCIRuntimeVersion() + if err != nil { + return nil, errors.Wrapf(err, "error getting version of OCI runtime %s", r.name) + } + conmonVersion, err := r.getConmonVersion() + if err != nil { + return nil, errors.Wrapf(err, "error getting conmon version") + } + + info := make(map[string]interface{}) + info["Conmon"] = map[string]interface{}{ + "path": r.conmonPath, + "package": conmonPackage, + "version": conmonVersion, + } + info["OCIRuntime"] = map[string]interface{}{ + "path": r.path, + "package": runtimePackage, + "version": runtimeVersion, + } + + return info, nil +} + +// makeAccessible changes the path permission and each parent directory to have --x--x--x +func makeAccessible(path string, uid, gid int) error { + for ; path != "/"; path = filepath.Dir(path) { + st, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { + continue + } + if st.Mode()&0111 != 0111 { + if err := os.Chmod(path, st.Mode()|0111); err != nil { + return err + } + } + } + return nil +} + +// Wait for a container which has been sent a signal to stop +func waitContainerStop(ctr *Container, timeout time.Duration) error { + return waitPidStop(ctr.state.PID, timeout) +} + +// Wait for a given PID to stop +func waitPidStop(pid int, timeout time.Duration) error { + done := make(chan struct{}) + chControl := make(chan struct{}) + go func() { + for { + select { + case <-chControl: + return + default: + if err := unix.Kill(pid, 0); err != nil { + if err == unix.ESRCH { + close(done) + return + } + logrus.Errorf("Error pinging PID %d with signal 0: %v", pid, err) + } + time.Sleep(100 * time.Millisecond) + } + } + }() + select { + case <-done: + return nil + case 
<-time.After(timeout): + close(chControl) + return errors.Errorf("given PIDs did not die within timeout") + } +} + +// createOCIContainer generates this container's main conmon instance and prepares it for starting +func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) { + var stderrBuf bytes.Buffer + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + + parentSyncPipe, childSyncPipe, err := newPipe() + if err != nil { + return errors.Wrapf(err, "error creating socket pair") + } + defer errorhandling.CloseQuiet(parentSyncPipe) + + childStartPipe, parentStartPipe, err := newPipe() + if err != nil { + return errors.Wrapf(err, "error creating socket pair for start pipe") + } + + defer errorhandling.CloseQuiet(parentStartPipe) + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = filepath.Join(ctr.state.RunDir, "oci-log") + } + args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog) + + if ctr.config.Spec.Process.Terminal { + args = append(args, "-t") + } else if ctr.config.Stdin { + args = append(args, "-i") + } + + if ctr.config.ConmonPidFile != "" { + args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile) + } + + if r.noPivot { + args = append(args, "--no-pivot") + } + + if len(ctr.config.ExitCommand) > 0 { + args = append(args, "--exit-command", ctr.config.ExitCommand[0]) + for _, arg := range ctr.config.ExitCommand[1:] { + args = append(args, []string{"--exit-command-arg", arg}...) + } + } + + if restoreOptions != nil { + args = append(args, "--restore", ctr.CheckpointPath()) + if restoreOptions.TCPEstablished { + args = append(args, "--runtime-opt", "--tcp-established") + } + } + + logrus.WithFields(logrus.Fields{ + "args": args, + }).Debugf("running conmon: %s", r.conmonPath) + + cmd := exec.Command(r.conmonPath, args...) 
+ cmd.Dir = ctr.bundlePath() + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + // TODO this is probably a really bad idea for some uses + // Make this configurable + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if ctr.config.Spec.Process.Terminal { + cmd.Stderr = &stderrBuf + } + + // 0, 1 and 2 are stdin, stdout and stderr + conmonEnv, envFiles, err := r.configureConmonEnv(runtimeDir) + if err != nil { + return err + } + + cmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", 3), fmt.Sprintf("_OCI_STARTPIPE=%d", 4)) + cmd.Env = append(cmd.Env, conmonEnv...) + cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe) + cmd.ExtraFiles = append(cmd.ExtraFiles, envFiles...) + + if r.reservePorts && !ctr.config.NetMode.IsSlirp4netns() { + ports, err := bindPorts(ctr.config.PortMappings) + if err != nil { + return err + } + + // Leak the port we bound in the conmon process. These fd's won't be used + // by the container and conmon will keep the ports busy so that another + // process cannot use them. + cmd.ExtraFiles = append(cmd.ExtraFiles, ports...) 
+ } + + if ctr.config.NetMode.IsSlirp4netns() { + if ctr.config.PostConfigureNetNS { + ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe() + if err != nil { + return errors.Wrapf(err, "failed to create rootless network sync pipe") + } + } else { + if ctr.rootlessSlirpSyncR != nil { + defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR) + } + if ctr.rootlessSlirpSyncW != nil { + defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW) + } + } + // Leak one end in conmon, the other one will be leaked into slirp4netns + cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW) + } + + err = startCommandGivenSelinux(cmd) + // regardless of whether we errored or not, we no longer need the children pipes + childSyncPipe.Close() + childStartPipe.Close() + if err != nil { + return err + } + if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe, ctr.ID()); err != nil { + return err + } + /* Wait for initial setup and fork, and reap child */ + err = cmd.Wait() + if err != nil { + return err + } + + pid, err := readConmonPipeData(parentSyncPipe, ociLog) + if err != nil { + if err2 := r.DeleteContainer(ctr); err2 != nil { + logrus.Errorf("Error removing container %s from runtime after creation failed", ctr.ID()) + } + return err + } + ctr.state.PID = pid + + conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile) + if err != nil { + logrus.Warnf("error reading conmon pid file for container %s: %s", ctr.ID(), err.Error()) + } else if conmonPID > 0 { + // conmon not having a pid file is a valid state, so don't set it if we don't have it + logrus.Infof("Got Conmon PID as %d", conmonPID) + ctr.state.ConmonPID = conmonPID + } + + return nil +} + +// prepareProcessExec returns the path of the process.json used in runc exec -p +// caller is responsible to close the returned *os.File if needed. 
+func prepareProcessExec(c *Container, cmd, env []string, tty bool, cwd, user, sessionID string) (*os.File, error) { + f, err := ioutil.TempFile(c.execBundlePath(sessionID), "exec-process-") + if err != nil { + return nil, err + } + + pspec := c.config.Spec.Process + pspec.Args = cmd + // We need to default this to false else it will inherit terminal as true + // from the container. + pspec.Terminal = false + if tty { + pspec.Terminal = true + } + if len(env) > 0 { + pspec.Env = append(pspec.Env, env...) + } + + if cwd != "" { + pspec.Cwd = cwd + + } + + overrides := c.getUserOverrides() + execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, user, overrides) + if err != nil { + return nil, err + } + + // If user was set, look it up in the container to get a UID to use on + // the host + if user != "" { + sgids := make([]uint32, 0, len(execUser.Sgids)) + for _, sgid := range execUser.Sgids { + sgids = append(sgids, uint32(sgid)) + } + processUser := spec.User{ + UID: uint32(execUser.Uid), + GID: uint32(execUser.Gid), + AdditionalGids: sgids, + } + + pspec.User = processUser + } + + hasHomeSet := false + for _, s := range pspec.Env { + if strings.HasPrefix(s, "HOME=") { + hasHomeSet = true + break + } + } + if !hasHomeSet { + pspec.Env = append(pspec.Env, fmt.Sprintf("HOME=%s", execUser.Home)) + } + + processJSON, err := json.Marshal(pspec) + if err != nil { + return nil, err + } + + if err := ioutil.WriteFile(f.Name(), processJSON, 0644); err != nil { + return nil, err + } + return f, nil +} + +// configureConmonEnv gets the environment values to add to conmon's exec struct +// TODO this may want to be less hardcoded/more configurable in the future +func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) ([]string, []*os.File, error) { + env := make([]string, 0, 6) + env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) + env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED"))) + env = 
append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID"))) + home, err := homeDir() + if err != nil { + return nil, nil, err + } + env = append(env, fmt.Sprintf("HOME=%s", home)) + + extraFiles := make([]*os.File, 0) + if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok { + env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify)) + } + if !r.sdNotify { + if listenfds, ok := os.LookupEnv("LISTEN_FDS"); ok { + env = append(env, fmt.Sprintf("LISTEN_FDS=%s", listenfds), "LISTEN_PID=1") + fds := activation.Files(false) + extraFiles = append(extraFiles, fds...) + } + } else { + logrus.Debug("disabling SD notify") + } + return env, extraFiles, nil +} + +// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI +func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath string) []string { + // set the conmon API version to be able to use the correct sync struct keys + args := []string{"--api-version", "1"} + if r.cgroupManager == SystemdCgroupsManager && !ctr.config.NoCgroups { + args = append(args, "-s") + } + args = append(args, "-c", ctr.ID()) + args = append(args, "-u", cuuid) + args = append(args, "-r", r.path) + args = append(args, "-b", bundlePath) + args = append(args, "-p", pidPath) + + var logDriver string + switch ctr.LogDriver() { + case JournaldLogging: + logDriver = JournaldLogging + case JSONLogging: + fallthrough + default: //nolint-stylecheck + // No case here should happen except JSONLogging, but keep this here in case the options are extended + logrus.Errorf("%s logging specified but not supported. 
Choosing k8s-file logging instead", ctr.LogDriver()) + fallthrough + case "": + // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod + // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough + fallthrough + case KubernetesLogging: + logDriver = fmt.Sprintf("%s:%s", KubernetesLogging, logPath) + } + + args = append(args, "-l", logDriver) + args = append(args, "--exit-dir", exitDir) + args = append(args, "--socket-dir-path", r.socketsDir) + if r.logSizeMax >= 0 { + args = append(args, "--log-size-max", fmt.Sprintf("%v", r.logSizeMax)) + } + + logLevel := logrus.GetLevel() + args = append(args, "--log-level", logLevel.String()) + + if logLevel == logrus.DebugLevel { + logrus.Debugf("%s messages will be logged to syslog", r.conmonPath) + args = append(args, "--syslog") + } + if ociLogPath != "" { + args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath)) + } + if ctr.config.NoCgroups { + logrus.Debugf("Running with no CGroups") + args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled") + } + return args +} + +// startCommandGivenSelinux starts a container ensuring to set the labels of +// the process to make sure SELinux doesn't block conmon communication, if SELinux is enabled +func startCommandGivenSelinux(cmd *exec.Cmd) error { + if !selinux.GetEnabled() { + return cmd.Start() + } + // Set the label of the conmon process to be level :s0 + // This will allow the container processes to talk to fifo-files + // passed into the container by conmon + var ( + plabel string + con selinux.Context + err error + ) + plabel, err = selinux.CurrentLabel() + if err != nil { + return errors.Wrapf(err, "Failed to get current SELinux label") + } + + con, err = selinux.NewContext(plabel) + if err != nil { + return errors.Wrapf(err, "Failed to get new context from SELinux label") + } + + 
runtime.LockOSThread() + if con["level"] != "s0" && con["level"] != "" { + con["level"] = "s0" + if err = label.SetProcessLabel(con.Get()); err != nil { + runtime.UnlockOSThread() + return err + } + } + err = cmd.Start() + // Ignore error returned from SetProcessLabel("") call, + // can't recover. + if labelErr := label.SetProcessLabel(""); labelErr != nil { + logrus.Errorf("unable to set process label: %q", err) + } + runtime.UnlockOSThread() + return err +} + +// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup +// it then signals for conmon to start by sending nonse data down the start fd +func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File, uuid string) error { + mustCreateCgroup := true + // If cgroup creation is disabled - just signal. + if ctr.config.NoCgroups { + mustCreateCgroup = false + } + + if rootless.IsRootless() { + ownsCgroup, err := cgroups.UserOwnsCurrentSystemdCgroup() + if err != nil { + return err + } + mustCreateCgroup = !ownsCgroup + } + + if mustCreateCgroup { + cgroupParent := ctr.CgroupParent() + if r.cgroupManager == SystemdCgroupsManager { + unitName := createUnitName("libpod-conmon", ctr.ID()) + + realCgroupParent := cgroupParent + splitParent := strings.Split(cgroupParent, "/") + if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { + realCgroupParent = splitParent[len(splitParent)-1] + } + + logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) + if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { + logrus.Warnf("Failed to add conmon to systemd sandbox cgroup: %v", err) + } + } else { + cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") + control, err := cgroups.New(cgroupPath, &spec.LinuxResources{}) + if err != nil { + logrus.Warnf("Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } else { + // we need to 
remove this defer and delete the cgroup once conmon exits + // maybe need a conmon monitor? + if err := control.AddPid(cmd.Process.Pid); err != nil { + logrus.Warnf("Failed to add conmon to cgroupfs sandbox cgroup: %v", err) + } + } + } + } + + /* We set the cgroup, now the child can start creating children */ + if err := writeConmonPipeData(startFd); err != nil { + return err + } + return nil +} + +// newPipe creates a unix socket pair for communication +func newPipe() (parent *os.File, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} + +// readConmonPidFile attempts to read conmon's pid from its pid file +func readConmonPidFile(pidFile string) (int, error) { + // Let's try reading the Conmon pid at the same time. + if pidFile != "" { + contents, err := ioutil.ReadFile(pidFile) + if err != nil { + return -1, err + } + // Convert it to an int + conmonPID, err := strconv.Atoi(string(contents)) + if err != nil { + return -1, err + } + return conmonPID, nil + } + return 0, nil +} + +// readConmonPipeData attempts to read a syncInfo struct from the pipe +func readConmonPipeData(pipe *os.File, ociLog string) (int, error) { + // syncInfo is used to return data from monitor process to daemon + type syncInfo struct { + Data int `json:"data"` + Message string `json:"message,omitempty"` + } + + // Wait to get container pid from conmon + type syncStruct struct { + si *syncInfo + err error + } + ch := make(chan syncStruct) + go func() { + var si *syncInfo + rdr := bufio.NewReader(pipe) + b, err := rdr.ReadBytes('\n') + if err != nil { + ch <- syncStruct{err: err} + } + if err := json.Unmarshal(b, &si); err != nil { + ch <- syncStruct{err: err} + return + } + ch <- syncStruct{si: si} + }() + + data := -1 + select { + case ss := <-ch: + if ss.err != nil { + if ociLog != "" { + 
ociLogData, err := ioutil.ReadFile(ociLog) + if err == nil { + var ociErr ociError + if err := json.Unmarshal(ociLogData, &ociErr); err == nil { + return -1, getOCIRuntimeError(ociErr.Msg) + } + } + } + return -1, errors.Wrapf(ss.err, "error reading container (probably exited) json message") + } + logrus.Debugf("Received: %d", ss.si.Data) + if ss.si.Data < 0 { + if ociLog != "" { + ociLogData, err := ioutil.ReadFile(ociLog) + if err == nil { + var ociErr ociError + if err := json.Unmarshal(ociLogData, &ociErr); err == nil { + return ss.si.Data, getOCIRuntimeError(ociErr.Msg) + } + } + } + // If we failed to parse the JSON errors, then print the output as it is + if ss.si.Message != "" { + return ss.si.Data, getOCIRuntimeError(ss.si.Message) + } + return ss.si.Data, errors.Wrapf(define.ErrInternal, "container create failed") + } + data = ss.si.Data + case <-time.After(ContainerCreateTimeout): + return -1, errors.Wrapf(define.ErrInternal, "container creation timeout") + } + return data, nil +} + +// writeConmonPipeData writes nonse data to a pipe +func writeConmonPipeData(pipe *os.File) error { + someData := []byte{0} + _, err := pipe.Write(someData) + return err +} + +// formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon +func formatRuntimeOpts(opts ...string) []string { + args := make([]string, 0, len(opts)*2) + for _, o := range opts { + args = append(args, "--runtime-opt", o) + } + return args +} + +// getConmonVersion returns a string representation of the conmon version. +func (r *ConmonOCIRuntime) getConmonVersion() (string, error) { + output, err := utils.ExecCmd(r.conmonPath, "--version") + if err != nil { + return "", err + } + return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil +} + +// getOCIRuntimeVersion returns a string representation of the OCI runtime's +// version. 
+func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) { + output, err := utils.ExecCmd(r.path, "--version") + if err != nil { + return "", err + } + return strings.TrimSuffix(output, "\n"), nil +} diff --git a/libpod/oci_conmon_unsupported.go b/libpod/oci_conmon_unsupported.go new file mode 100644 index 000000000..77b06eed3 --- /dev/null +++ b/libpod/oci_conmon_unsupported.go @@ -0,0 +1,130 @@ +// +build !linux + +package libpod + +import ( + "github.com/containers/libpod/libpod/define" +) + +const ( + osNotSupported = "Not supported on this OS" +) + +// ConmonOCIRuntime is not supported on this OS. +type ConmonOCIRuntime struct { +} + +// newConmonOCIRuntime is not supported on this OS. +func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeCfg *RuntimeConfig, supportsJSON, supportsNoCgroups bool) (OCIRuntime, error) { + return nil, define.ErrNotImplemented +} + +// Name is not supported on this OS. +func (r *ConmonOCIRuntime) Name() string { + return osNotSupported +} + +// Path is not supported on this OS. +func (r *ConmonOCIRuntime) Path() string { + return osNotSupported +} + +// CreateContainer is not supported on this OS. +func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) error { + return define.ErrNotImplemented +} + +// UpdateContainerStatus is not supported on this OS. +func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container, useRuntime bool) error { + return define.ErrNotImplemented +} + +// StartContainer is not supported on this OS. +func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { + return define.ErrNotImplemented +} + +// KillContainer is not supported on this OS. +func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error { + return define.ErrNotImplemented +} + +// StopContainer is not supported on this OS. 
+func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error { + return define.ErrNotImplemented +} + +// DeleteContainer is not supported on this OS. +func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error { + return define.ErrNotImplemented +} + +// PauseContainer is not supported on this OS. +func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error { + return define.ErrNotImplemented +} + +// UnpauseContainer is not supported on this OS. +func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error { + return define.ErrNotImplemented +} + +// ExecContainer is not supported on this OS. +func (r *ConmonOCIRuntime) ExecContainer(ctr *Container, sessionID string, options *ExecOptions) (int, chan error, error) { + return -1, nil, define.ErrNotImplemented +} + +// ExecStopContainer is not supported on this OS. +func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, timeout uint) error { + return define.ErrNotImplemented +} + +// CheckpointContainer is not supported on this OS. +func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error { + return define.ErrNotImplemented +} + +// SupportsCheckpoint is not supported on this OS. +func (r *ConmonOCIRuntime) SupportsCheckpoint() bool { + return false +} + +// SupportsJSONErrors is not supported on this OS. +func (r *ConmonOCIRuntime) SupportsJSONErrors() bool { + return false +} + +// SupportsNoCgroups is not supported on this OS. +func (r *ConmonOCIRuntime) SupportsNoCgroups() bool { + return false +} + +// AttachSocketPath is not supported on this OS. +func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) { + return "", define.ErrNotImplemented +} + +// ExecAttachSocketPath is not supported on this OS. 
+func (r *ConmonOCIRuntime) ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) { + return "", define.ErrNotImplemented +} + +// ExitFilePath is not supported on this OS. +func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) { + return "", define.ErrNotImplemented +} + +// RuntimeInfo is not supported on this OS. +func (r *ConmonOCIRuntime) RuntimeInfo() (map[string]interface{}, error) { + return nil, define.ErrNotImplemented +} + +// Package is not supported on this OS. +func (r *ConmonOCIRuntime) Package() string { + return osNotSupported +} + +// ConmonPackage is not supported on this OS. +func (r *ConmonOCIRuntime) ConmonPackage() string { + return osNotSupported +} diff --git a/libpod/oci_internal_linux.go b/libpod/oci_internal_linux.go deleted file mode 100644 index 437b7cf4d..000000000 --- a/libpod/oci_internal_linux.go +++ /dev/null @@ -1,556 +0,0 @@ -// +build linux - -package libpod - -import ( - "bufio" - "bytes" - "fmt" - "io/ioutil" - "os" - "os/exec" - "path/filepath" - "regexp" - "runtime" - "strconv" - "strings" - "syscall" - "time" - - "github.com/containers/libpod/libpod/define" - "github.com/containers/libpod/pkg/cgroups" - "github.com/containers/libpod/pkg/errorhandling" - "github.com/containers/libpod/pkg/lookup" - "github.com/containers/libpod/pkg/rootless" - "github.com/containers/libpod/pkg/util" - "github.com/containers/libpod/utils" - "github.com/coreos/go-systemd/activation" - spec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux" - "github.com/opencontainers/selinux/go-selinux/label" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" -) - -// createOCIContainer generates this container's main conmon instance and prepares it for starting -func (r *OCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) { - var stderrBuf bytes.Buffer - - runtimeDir, err := util.GetRuntimeDir() 
- if err != nil { - return err - } - - parentSyncPipe, childSyncPipe, err := newPipe() - if err != nil { - return errors.Wrapf(err, "error creating socket pair") - } - defer errorhandling.CloseQuiet(parentSyncPipe) - - childStartPipe, parentStartPipe, err := newPipe() - if err != nil { - return errors.Wrapf(err, "error creating socket pair for start pipe") - } - - defer errorhandling.CloseQuiet(parentStartPipe) - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = filepath.Join(ctr.state.RunDir, "oci-log") - } - args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog) - - if ctr.config.Spec.Process.Terminal { - args = append(args, "-t") - } else if ctr.config.Stdin { - args = append(args, "-i") - } - - if ctr.config.ConmonPidFile != "" { - args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile) - } - - if r.noPivot { - args = append(args, "--no-pivot") - } - - if len(ctr.config.ExitCommand) > 0 { - args = append(args, "--exit-command", ctr.config.ExitCommand[0]) - for _, arg := range ctr.config.ExitCommand[1:] { - args = append(args, []string{"--exit-command-arg", arg}...) - } - } - - if restoreOptions != nil { - args = append(args, "--restore", ctr.CheckpointPath()) - if restoreOptions.TCPEstablished { - args = append(args, "--runtime-opt", "--tcp-established") - } - } - - logrus.WithFields(logrus.Fields{ - "args": args, - }).Debugf("running conmon: %s", r.conmonPath) - - cmd := exec.Command(r.conmonPath, args...) 
- cmd.Dir = ctr.bundlePath() - cmd.SysProcAttr = &syscall.SysProcAttr{ - Setpgid: true, - } - // TODO this is probably a really bad idea for some uses - // Make this configurable - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if ctr.config.Spec.Process.Terminal { - cmd.Stderr = &stderrBuf - } - - // 0, 1 and 2 are stdin, stdout and stderr - conmonEnv, envFiles, err := r.configureConmonEnv(runtimeDir) - if err != nil { - return err - } - - cmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", 3), fmt.Sprintf("_OCI_STARTPIPE=%d", 4)) - cmd.Env = append(cmd.Env, conmonEnv...) - cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe) - cmd.ExtraFiles = append(cmd.ExtraFiles, envFiles...) - - if r.reservePorts && !ctr.config.NetMode.IsSlirp4netns() { - ports, err := bindPorts(ctr.config.PortMappings) - if err != nil { - return err - } - - // Leak the port we bound in the conmon process. These fd's won't be used - // by the container and conmon will keep the ports busy so that another - // process cannot use them. - cmd.ExtraFiles = append(cmd.ExtraFiles, ports...) 
- } - - if ctr.config.NetMode.IsSlirp4netns() { - if ctr.config.PostConfigureNetNS { - ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe() - if err != nil { - return errors.Wrapf(err, "failed to create rootless network sync pipe") - } - } else { - if ctr.rootlessSlirpSyncR != nil { - defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR) - } - if ctr.rootlessSlirpSyncW != nil { - defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW) - } - } - // Leak one end in conmon, the other one will be leaked into slirp4netns - cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW) - } - - err = startCommandGivenSelinux(cmd) - // regardless of whether we errored or not, we no longer need the children pipes - childSyncPipe.Close() - childStartPipe.Close() - if err != nil { - return err - } - if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe, ctr.ID()); err != nil { - return err - } - /* Wait for initial setup and fork, and reap child */ - err = cmd.Wait() - if err != nil { - return err - } - - pid, err := readConmonPipeData(parentSyncPipe, ociLog) - if err != nil { - if err2 := r.deleteContainer(ctr); err2 != nil { - logrus.Errorf("Error removing container %s from runtime after creation failed", ctr.ID()) - } - return err - } - ctr.state.PID = pid - - conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile) - if err != nil { - logrus.Warnf("error reading conmon pid file for container %s: %s", ctr.ID(), err.Error()) - } else if conmonPID > 0 { - // conmon not having a pid file is a valid state, so don't set it if we don't have it - logrus.Infof("Got Conmon PID as %d", conmonPID) - ctr.state.ConmonPID = conmonPID - } - - return nil -} - -// prepareProcessExec returns the path of the process.json used in runc exec -p -// caller is responsible to close the returned *os.File if needed. 
-func prepareProcessExec(c *Container, cmd, env []string, tty bool, cwd, user, sessionID string) (*os.File, error) { - f, err := ioutil.TempFile(c.execBundlePath(sessionID), "exec-process-") - if err != nil { - return nil, err - } - - pspec := c.config.Spec.Process - pspec.Args = cmd - // We need to default this to false else it will inherit terminal as true - // from the container. - pspec.Terminal = false - if tty { - pspec.Terminal = true - } - if len(env) > 0 { - pspec.Env = append(pspec.Env, env...) - } - - if cwd != "" { - pspec.Cwd = cwd - - } - - overrides := c.getUserOverrides() - execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, user, overrides) - if err != nil { - return nil, err - } - - // If user was set, look it up in the container to get a UID to use on - // the host - if user != "" { - sgids := make([]uint32, 0, len(execUser.Sgids)) - for _, sgid := range execUser.Sgids { - sgids = append(sgids, uint32(sgid)) - } - processUser := spec.User{ - UID: uint32(execUser.Uid), - GID: uint32(execUser.Gid), - AdditionalGids: sgids, - } - - pspec.User = processUser - } - - hasHomeSet := false - for _, s := range pspec.Env { - if strings.HasPrefix(s, "HOME=") { - hasHomeSet = true - break - } - } - if !hasHomeSet { - pspec.Env = append(pspec.Env, fmt.Sprintf("HOME=%s", execUser.Home)) - } - - processJSON, err := json.Marshal(pspec) - if err != nil { - return nil, err - } - - if err := ioutil.WriteFile(f.Name(), processJSON, 0644); err != nil { - return nil, err - } - return f, nil -} - -// configureConmonEnv gets the environment values to add to conmon's exec struct -// TODO this may want to be less hardcoded/more configurable in the future -func (r *OCIRuntime) configureConmonEnv(runtimeDir string) ([]string, []*os.File, error) { - env := make([]string, 0, 6) - env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) - env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED"))) - env = 
append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID"))) - home, err := homeDir() - if err != nil { - return nil, nil, err - } - env = append(env, fmt.Sprintf("HOME=%s", home)) - - extraFiles := make([]*os.File, 0) - if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok { - env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify)) - } - if !r.sdNotify { - if listenfds, ok := os.LookupEnv("LISTEN_FDS"); ok { - env = append(env, fmt.Sprintf("LISTEN_FDS=%s", listenfds), "LISTEN_PID=1") - fds := activation.Files(false) - extraFiles = append(extraFiles, fds...) - } - } else { - logrus.Debug("disabling SD notify") - } - return env, extraFiles, nil -} - -// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI -func (r *OCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath string) []string { - // set the conmon API version to be able to use the correct sync struct keys - args := []string{"--api-version", "1"} - if r.cgroupManager == SystemdCgroupsManager && !ctr.config.NoCgroups { - args = append(args, "-s") - } - args = append(args, "-c", ctr.ID()) - args = append(args, "-u", cuuid) - args = append(args, "-r", r.path) - args = append(args, "-b", bundlePath) - args = append(args, "-p", pidPath) - - var logDriver string - switch ctr.LogDriver() { - case JournaldLogging: - logDriver = JournaldLogging - case JSONLogging: - fallthrough - default: //nolint-stylecheck - // No case here should happen except JSONLogging, but keep this here in case the options are extended - logrus.Errorf("%s logging specified but not supported. 
Choosing k8s-file logging instead", ctr.LogDriver()) - fallthrough - case "": - // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod - // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough - fallthrough - case KubernetesLogging: - logDriver = fmt.Sprintf("%s:%s", KubernetesLogging, logPath) - } - - args = append(args, "-l", logDriver) - args = append(args, "--exit-dir", exitDir) - args = append(args, "--socket-dir-path", r.socketsDir) - if r.logSizeMax >= 0 { - args = append(args, "--log-size-max", fmt.Sprintf("%v", r.logSizeMax)) - } - - logLevel := logrus.GetLevel() - args = append(args, "--log-level", logLevel.String()) - - if logLevel == logrus.DebugLevel { - logrus.Debugf("%s messages will be logged to syslog", r.conmonPath) - args = append(args, "--syslog") - } - if ociLogPath != "" { - args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath)) - } - if ctr.config.NoCgroups { - logrus.Debugf("Running with no CGroups") - args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled") - } - return args -} - -// startCommandGivenSelinux starts a container ensuring to set the labels of -// the process to make sure SELinux doesn't block conmon communication, if SELinux is enabled -func startCommandGivenSelinux(cmd *exec.Cmd) error { - if !selinux.GetEnabled() { - return cmd.Start() - } - // Set the label of the conmon process to be level :s0 - // This will allow the container processes to talk to fifo-files - // passed into the container by conmon - var ( - plabel string - con selinux.Context - err error - ) - plabel, err = selinux.CurrentLabel() - if err != nil { - return errors.Wrapf(err, "Failed to get current SELinux label") - } - - con, err = selinux.NewContext(plabel) - if err != nil { - return errors.Wrapf(err, "Failed to get new context from SELinux label") - } - - 
runtime.LockOSThread() - if con["level"] != "s0" && con["level"] != "" { - con["level"] = "s0" - if err = label.SetProcessLabel(con.Get()); err != nil { - runtime.UnlockOSThread() - return err - } - } - err = cmd.Start() - // Ignore error returned from SetProcessLabel("") call, - // can't recover. - if labelErr := label.SetProcessLabel(""); labelErr != nil { - logrus.Errorf("unable to set process label: %q", err) - } - runtime.UnlockOSThread() - return err -} - -// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup -// it then signals for conmon to start by sending nonse data down the start fd -func (r *OCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File, uuid string) error { - mustCreateCgroup := true - // If cgroup creation is disabled - just signal. - if ctr.config.NoCgroups { - mustCreateCgroup = false - } - - if rootless.IsRootless() { - ownsCgroup, err := cgroups.UserOwnsCurrentSystemdCgroup() - if err != nil { - return err - } - mustCreateCgroup = !ownsCgroup - } - - if mustCreateCgroup { - cgroupParent := ctr.CgroupParent() - if r.cgroupManager == SystemdCgroupsManager { - unitName := createUnitName("libpod-conmon", ctr.ID()) - - realCgroupParent := cgroupParent - splitParent := strings.Split(cgroupParent, "/") - if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { - realCgroupParent = splitParent[len(splitParent)-1] - } - - logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) - if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { - logrus.Warnf("Failed to add conmon to systemd sandbox cgroup: %v", err) - } - } else { - cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") - control, err := cgroups.New(cgroupPath, &spec.LinuxResources{}) - if err != nil { - logrus.Warnf("Failed to add conmon to cgroupfs sandbox cgroup: %v", err) - } else { - // we need to remove 
this defer and delete the cgroup once conmon exits - // maybe need a conmon monitor? - if err := control.AddPid(cmd.Process.Pid); err != nil { - logrus.Warnf("Failed to add conmon to cgroupfs sandbox cgroup: %v", err) - } - } - } - } - - /* We set the cgroup, now the child can start creating children */ - if err := writeConmonPipeData(startFd); err != nil { - return err - } - return nil -} - -// newPipe creates a unix socket pair for communication -func newPipe() (parent *os.File, child *os.File, err error) { - fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) - if err != nil { - return nil, nil, err - } - return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil -} - -// readConmonPidFile attempts to read conmon's pid from its pid file -func readConmonPidFile(pidFile string) (int, error) { - // Let's try reading the Conmon pid at the same time. - if pidFile != "" { - contents, err := ioutil.ReadFile(pidFile) - if err != nil { - return -1, err - } - // Convert it to an int - conmonPID, err := strconv.Atoi(string(contents)) - if err != nil { - return -1, err - } - return conmonPID, nil - } - return 0, nil -} - -// readConmonPipeData attempts to read a syncInfo struct from the pipe -func readConmonPipeData(pipe *os.File, ociLog string) (int, error) { - // syncInfo is used to return data from monitor process to daemon - type syncInfo struct { - Data int `json:"data"` - Message string `json:"message,omitempty"` - } - - // Wait to get container pid from conmon - type syncStruct struct { - si *syncInfo - err error - } - ch := make(chan syncStruct) - go func() { - var si *syncInfo - rdr := bufio.NewReader(pipe) - b, err := rdr.ReadBytes('\n') - if err != nil { - ch <- syncStruct{err: err} - } - if err := json.Unmarshal(b, &si); err != nil { - ch <- syncStruct{err: err} - return - } - ch <- syncStruct{si: si} - }() - - data := -1 - select { - case ss := <-ch: - if ss.err != nil { - if ociLog != "" { - 
ociLogData, err := ioutil.ReadFile(ociLog) - if err == nil { - var ociErr ociError - if err := json.Unmarshal(ociLogData, &ociErr); err == nil { - return -1, getOCIRuntimeError(ociErr.Msg) - } - } - } - return -1, errors.Wrapf(ss.err, "error reading container (probably exited) json message") - } - logrus.Debugf("Received: %d", ss.si.Data) - if ss.si.Data < 0 { - if ociLog != "" { - ociLogData, err := ioutil.ReadFile(ociLog) - if err == nil { - var ociErr ociError - if err := json.Unmarshal(ociLogData, &ociErr); err == nil { - return ss.si.Data, getOCIRuntimeError(ociErr.Msg) - } - } - } - // If we failed to parse the JSON errors, then print the output as it is - if ss.si.Message != "" { - return ss.si.Data, getOCIRuntimeError(ss.si.Message) - } - return ss.si.Data, errors.Wrapf(define.ErrInternal, "container create failed") - } - data = ss.si.Data - case <-time.After(ContainerCreateTimeout): - return -1, errors.Wrapf(define.ErrInternal, "container creation timeout") - } - return data, nil -} - -func getOCIRuntimeError(runtimeMsg string) error { - r := strings.ToLower(runtimeMsg) - if match, _ := regexp.MatchString(".*permission denied.*|.*operation not permitted.*", r); match { - return errors.Wrapf(define.ErrOCIRuntimePermissionDenied, "%s", strings.Trim(runtimeMsg, "\n")) - } - if match, _ := regexp.MatchString(".*executable file not found in.*|.*no such file or directory.*", r); match { - return errors.Wrapf(define.ErrOCIRuntimeNotFound, "%s", strings.Trim(runtimeMsg, "\n")) - } - return errors.Wrapf(define.ErrOCIRuntime, "%s", strings.Trim(runtimeMsg, "\n")) -} - -// writeConmonPipeData writes nonse data to a pipe -func writeConmonPipeData(pipe *os.File) error { - someData := []byte{0} - _, err := pipe.Write(someData) - return err -} - -// formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon -func formatRuntimeOpts(opts ...string) []string { - args := make([]string, 0, len(opts)*2) - for _, o := range opts { - args = 
append(args, "--runtime-opt", o) - } - return args -} diff --git a/libpod/oci_linux.go b/libpod/oci_linux.go deleted file mode 100644 index 9ec074704..000000000 --- a/libpod/oci_linux.go +++ /dev/null @@ -1,503 +0,0 @@ -// +build linux - -package libpod - -import ( - "fmt" - "os" - "os/exec" - "path/filepath" - "runtime" - "strconv" - "strings" - "syscall" - "time" - - "github.com/containers/libpod/libpod/define" - "github.com/containers/libpod/pkg/errorhandling" - "github.com/containers/libpod/pkg/rootless" - "github.com/containers/libpod/pkg/util" - "github.com/containers/libpod/utils" - pmount "github.com/containers/storage/pkg/mount" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" - "k8s.io/client-go/tools/remotecommand" -) - -// makeAccessible changes the path permission and each parent directory to have --x--x--x -func makeAccessible(path string, uid, gid int) error { - for ; path != "/"; path = filepath.Dir(path) { - st, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - return nil - } - return err - } - if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { - continue - } - if st.Mode()&0111 != 0111 { - if err := os.Chmod(path, st.Mode()|0111); err != nil { - return err - } - } - } - return nil -} - -// CreateContainer creates a container in the OCI runtime -// TODO terminal support for container -// Presently just ignoring conmon opts related to it -func (r *OCIRuntime) createContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) { - if len(ctr.config.IDMappings.UIDMap) != 0 || len(ctr.config.IDMappings.GIDMap) != 0 { - for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.VolumePath} { - if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { - return err - } - } - - // if we are running a non privileged container, be sure to umount some kernel 
paths so they are not - // bind mounted inside the container at all. - if !ctr.config.Privileged && !rootless.IsRootless() { - ch := make(chan error) - go func() { - runtime.LockOSThread() - err := func() error { - fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) - if err != nil { - return err - } - defer errorhandling.CloseQuiet(fd) - - // create a new mountns on the current thread - if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { - return err - } - defer func() { - if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { - logrus.Errorf("unable to clone new namespace: %q", err) - } - }() - - // don't spread our mounts around. We are setting only /sys to be slave - // so that the cleanup process is still able to umount the storage and the - // changes are propagated to the host. - err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") - if err != nil { - return errors.Wrapf(err, "cannot make /sys slave") - } - - mounts, err := pmount.GetMounts() - if err != nil { - return err - } - for _, m := range mounts { - if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { - continue - } - err = unix.Unmount(m.Mountpoint, 0) - if err != nil && !os.IsNotExist(err) { - return errors.Wrapf(err, "cannot unmount %s", m.Mountpoint) - } - } - return r.createOCIContainer(ctr, restoreOptions) - }() - ch <- err - }() - err := <-ch - return err - } - } - return r.createOCIContainer(ctr, restoreOptions) -} - -func (r *OCIRuntime) pathPackage() string { - return packageVersion(r.path) -} - -func (r *OCIRuntime) conmonPackage() string { - return packageVersion(r.conmonPath) -} - -// execContainer executes a command in a running container -// TODO: Add --detach support -// TODO: Convert to use conmon -// TODO: add --pid-file and use that to generate exec session tracking -func (r *OCIRuntime) execContainer(c *Container, cmd, capAdd, env []string, tty bool, cwd, user, sessionID string, streams *AttachStreams, preserveFDs 
int, resize chan remotecommand.TerminalSize, detachKeys string) (int, chan error, error) { - if len(cmd) == 0 { - return -1, nil, errors.Wrapf(define.ErrInvalidArg, "must provide a command to execute") - } - - if sessionID == "" { - return -1, nil, errors.Wrapf(define.ErrEmptyID, "must provide a session ID for exec") - } - - // create sync pipe to receive the pid - parentSyncPipe, childSyncPipe, err := newPipe() - if err != nil { - return -1, nil, errors.Wrapf(err, "error creating socket pair") - } - - defer errorhandling.CloseQuiet(parentSyncPipe) - - // create start pipe to set the cgroup before running - // attachToExec is responsible for closing parentStartPipe - childStartPipe, parentStartPipe, err := newPipe() - if err != nil { - return -1, nil, errors.Wrapf(err, "error creating socket pair") - } - - // We want to make sure we close the parent{Start,Attach}Pipes if we fail - // but also don't want to close them after attach to exec is called - attachToExecCalled := false - - defer func() { - if !attachToExecCalled { - errorhandling.CloseQuiet(parentStartPipe) - } - }() - - // create the attach pipe to allow attach socket to be created before - // $RUNTIME exec starts running. 
This is to make sure we can capture all output - // from the process through that socket, rather than half reading the log, half attaching to the socket - // attachToExec is responsible for closing parentAttachPipe - parentAttachPipe, childAttachPipe, err := newPipe() - if err != nil { - return -1, nil, errors.Wrapf(err, "error creating socket pair") - } - - defer func() { - if !attachToExecCalled { - errorhandling.CloseQuiet(parentAttachPipe) - } - }() - - childrenClosed := false - defer func() { - if !childrenClosed { - errorhandling.CloseQuiet(childSyncPipe) - errorhandling.CloseQuiet(childAttachPipe) - errorhandling.CloseQuiet(childStartPipe) - } - }() - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return -1, nil, err - } - - processFile, err := prepareProcessExec(c, cmd, env, tty, cwd, user, sessionID) - if err != nil { - return -1, nil, err - } - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = c.execOCILog(sessionID) - } - args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog) - - if preserveFDs > 0 { - args = append(args, formatRuntimeOpts("--preserve-fds", strconv.Itoa(preserveFDs))...) - } - - for _, capability := range capAdd { - args = append(args, formatRuntimeOpts("--cap", capability)...) - } - - if tty { - args = append(args, "-t") - } - - // Append container ID and command - args = append(args, "-e") - // TODO make this optional when we can detach - args = append(args, "--exec-attach") - args = append(args, "--exec-process-spec", processFile.Name()) - - logrus.WithFields(logrus.Fields{ - "args": args, - }).Debugf("running conmon: %s", r.conmonPath) - execCmd := exec.Command(r.conmonPath, args...) 
- - if streams.AttachInput { - execCmd.Stdin = streams.InputStream - } - if streams.AttachOutput { - execCmd.Stdout = streams.OutputStream - } - if streams.AttachError { - execCmd.Stderr = streams.ErrorStream - } - - conmonEnv, extraFiles, err := r.configureConmonEnv(runtimeDir) - if err != nil { - return -1, nil, err - } - - if preserveFDs > 0 { - for fd := 3; fd < 3+preserveFDs; fd++ { - execCmd.ExtraFiles = append(execCmd.ExtraFiles, os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd))) - } - } - - // we don't want to step on users fds they asked to preserve - // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 - execCmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4), fmt.Sprintf("_OCI_ATTACHPIPE=%d", preserveFDs+5)) - execCmd.Env = append(execCmd.Env, conmonEnv...) - - execCmd.ExtraFiles = append(execCmd.ExtraFiles, childSyncPipe, childStartPipe, childAttachPipe) - execCmd.ExtraFiles = append(execCmd.ExtraFiles, extraFiles...) - execCmd.Dir = c.execBundlePath(sessionID) - execCmd.SysProcAttr = &syscall.SysProcAttr{ - Setpgid: true, - } - - err = startCommandGivenSelinux(execCmd) - - // We don't need children pipes on the parent side - errorhandling.CloseQuiet(childSyncPipe) - errorhandling.CloseQuiet(childAttachPipe) - errorhandling.CloseQuiet(childStartPipe) - childrenClosed = true - - if err != nil { - return -1, nil, errors.Wrapf(err, "cannot start container %s", c.ID()) - } - if err := r.moveConmonToCgroupAndSignal(c, execCmd, parentStartPipe, sessionID); err != nil { - return -1, nil, err - } - - if preserveFDs > 0 { - for fd := 3; fd < 3+preserveFDs; fd++ { - // These fds were passed down to the runtime. 
Close them - // and not interfere - if err := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)).Close(); err != nil { - logrus.Debugf("unable to close file fd-%d", fd) - } - } - } - - // TODO Only create if !detach - // Attach to the container before starting it - attachChan := make(chan error) - go func() { - // attachToExec is responsible for closing pipes - attachChan <- c.attachToExec(streams, detachKeys, resize, sessionID, parentStartPipe, parentAttachPipe) - close(attachChan) - }() - attachToExecCalled = true - - pid, err := readConmonPipeData(parentSyncPipe, ociLog) - - return pid, attachChan, err -} - -// Wait for a container which has been sent a signal to stop -func waitContainerStop(ctr *Container, timeout time.Duration) error { - done := make(chan struct{}) - chControl := make(chan struct{}) - go func() { - for { - select { - case <-chControl: - return - default: - // Check if the process is still around - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - close(done) - return - } - time.Sleep(100 * time.Millisecond) - } - } - }() - select { - case <-done: - return nil - case <-time.After(timeout): - close(chControl) - logrus.Debugf("container %s did not die within timeout %d", ctr.ID(), timeout) - return errors.Errorf("container %s did not die within timeout", ctr.ID()) - } -} - -// Wait for a set of given PIDs to stop -func waitPidsStop(pids []int, timeout time.Duration) error { - done := make(chan struct{}) - chControl := make(chan struct{}) - go func() { - for { - select { - case <-chControl: - return - default: - allClosed := true - for _, pid := range pids { - if err := unix.Kill(pid, 0); err != unix.ESRCH { - allClosed = false - break - } - } - if allClosed { - close(done) - return - } - time.Sleep(100 * time.Millisecond) - } - } - }() - select { - case <-done: - return nil - case <-time.After(timeout): - close(chControl) - return errors.Errorf("given PIDs did not die within timeout") - } -} - -// stopContainer stops a container, first 
using its given stop signal (or -// SIGTERM if no signal was specified), then using SIGKILL -// Timeout is given in seconds. If timeout is 0, the container will be -// immediately kill with SIGKILL -// Does not set finished time for container, assumes you will run updateStatus -// after to pull the exit code -func (r *OCIRuntime) stopContainer(ctr *Container, timeout uint) error { - logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) - - // Ping the container to see if it's alive - // If it's not, it's already stopped, return - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - stopSignal := ctr.config.StopSignal - if stopSignal == 0 { - stopSignal = uint(syscall.SIGTERM) - } - - if timeout > 0 { - if err := r.killContainer(ctr, stopSignal); err != nil { - // Is the container gone? - // If so, it probably died between the first check and - // our sending the signal - // The container is stopped, so exit cleanly - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - return err - } - - if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { - logrus.Warnf("Timed out stopping container %s, resorting to SIGKILL", ctr.ID()) - } else { - // No error, the container is dead - return nil - } - } - - var args []string - if rootless.IsRootless() || ctr.config.NoCgroups { - // we don't use --all for rootless containers as the OCI runtime might use - // the cgroups to determine the PIDs, but for rootless containers there is - // not any. - // Same logic for NoCgroups - we can't use cgroups as the user - // explicitly requested none be created. 
- args = []string{"kill", ctr.ID(), "KILL"} - } else { - args = []string{"kill", "--all", ctr.ID(), "KILL"} - } - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { - // Again, check if the container is gone. If it is, exit cleanly. - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - return errors.Wrapf(err, "error sending SIGKILL to container %s", ctr.ID()) - } - - // Give runtime a few seconds to make it happen - if err := waitContainerStop(ctr, killContainerTimeout); err != nil { - return err - } - - return nil -} - -// execStopContainer stops all active exec sessions in a container -// It will also stop all other processes in the container. It is only intended -// to be used to assist in cleanup when removing a container. -// SIGTERM is used by default to stop processes. If SIGTERM fails, SIGKILL will be used. -func (r *OCIRuntime) execStopContainer(ctr *Container, timeout uint) error { - // Do we have active exec sessions? 
- if len(ctr.state.ExecSessions) == 0 { - return nil - } - - // Get a list of active exec sessions - execSessions := []int{} - for _, session := range ctr.state.ExecSessions { - pid := session.PID - // Ping the PID with signal 0 to see if it still exists - if err := unix.Kill(pid, 0); err == unix.ESRCH { - continue - } - - execSessions = append(execSessions, pid) - } - - // All the sessions may be dead - // If they are, just return - if len(execSessions) == 0 { - return nil - } - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - - // If timeout is 0, just use SIGKILL - if timeout > 0 { - // Stop using SIGTERM by default - // Use SIGSTOP after a timeout - logrus.Debugf("Killing all processes in container %s with SIGTERM", ctr.ID()) - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", "--all", ctr.ID(), "TERM"); err != nil { - return errors.Wrapf(err, "error sending SIGTERM to container %s processes", ctr.ID()) - } - - // Wait for all processes to stop - if err := waitPidsStop(execSessions, time.Duration(timeout)*time.Second); err != nil { - logrus.Warnf("Timed out stopping container %s exec sessions", ctr.ID()) - } else { - // No error, all exec sessions are dead - return nil - } - } - - // Send SIGKILL - logrus.Debugf("Killing all processes in container %s with SIGKILL", ctr.ID()) - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", "--all", ctr.ID(), "KILL"); err != nil { - return errors.Wrapf(err, "error sending SIGKILL to container %s processes", ctr.ID()) - } - - // Give the processes a few seconds to go down - if err := waitPidsStop(execSessions, killContainerTimeout); err != nil { - return errors.Wrapf(err, "failed to kill container %s exec sessions", ctr.ID()) - } - - return nil -} diff --git a/libpod/oci_unsupported.go b/libpod/oci_unsupported.go deleted file mode 100644 index 
4a65d4d1d..000000000 --- a/libpod/oci_unsupported.go +++ /dev/null @@ -1,47 +0,0 @@ -// +build !linux - -package libpod - -import ( - "os" - "os/exec" - - "github.com/containers/libpod/libpod/define" - "k8s.io/client-go/tools/remotecommand" -) - -func (r *OCIRuntime) moveConmonToCgroup(ctr *Container, cgroupParent string, cmd *exec.Cmd) error { - return define.ErrOSNotSupported -} - -func newPipe() (parent *os.File, child *os.File, err error) { - return nil, nil, define.ErrNotImplemented -} - -func (r *OCIRuntime) createContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) { - return define.ErrNotImplemented -} - -func (r *OCIRuntime) pathPackage() string { - return "" -} - -func (r *OCIRuntime) conmonPackage() string { - return "" -} - -func (r *OCIRuntime) createOCIContainer(ctr *Container, cgroupParent string, restoreOptions *ContainerCheckpointOptions) (err error) { - return define.ErrOSNotSupported -} - -func (r *OCIRuntime) execStopContainer(ctr *Container, timeout uint) error { - return define.ErrOSNotSupported -} - -func (r *OCIRuntime) stopContainer(ctr *Container, timeout uint) error { - return define.ErrOSNotSupported -} - -func (r *OCIRuntime) execContainer(c *Container, cmd, capAdd, env []string, tty bool, cwd, user, sessionID string, streams *AttachStreams, preserveFDs int, resize chan remotecommand.TerminalSize, detachKeys string) (int, chan error, error) { - return -1, nil, define.ErrOSNotSupported -} diff --git a/libpod/oci_util.go b/libpod/oci_util.go new file mode 100644 index 000000000..cb85b153d --- /dev/null +++ b/libpod/oci_util.go @@ -0,0 +1,113 @@ +package libpod + +import ( + "fmt" + "net" + "os" + "regexp" + "strings" + "time" + + "github.com/containers/libpod/libpod/define" + "github.com/cri-o/ocicni/pkg/ocicni" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +const ( + // CgroupfsCgroupsManager represents cgroupfs native cgroup manager + CgroupfsCgroupsManager = "cgroupfs" + // 
SystemdCgroupsManager represents systemd native cgroup manager + SystemdCgroupsManager = "systemd" + + // ContainerCreateTimeout is the timeout before we decide we've failed + // to create a container. + // TODO: Make this generic - all OCI runtime operations should use the + // same timeout, this one. + // TODO: Consider dropping from 240 to 60 seconds. I don't think waiting + // 4 minutes versus 1 minute makes a real difference. + ContainerCreateTimeout = 240 * time.Second + + // Timeout before declaring that runtime has failed to kill a given + // container + killContainerTimeout = 5 * time.Second + // DefaultShmSize is the default shm size + DefaultShmSize = 64 * 1024 * 1024 + // NsRunDir is the default directory in which running network namespaces + // are stored + NsRunDir = "/var/run/netns" +) + +// ociError is used to parse the OCI runtime JSON log. It is not part of the +// OCI runtime specifications, it follows what runc does +type ociError struct { + Level string `json:"level,omitempty"` + Time string `json:"time,omitempty"` + Msg string `json:"msg,omitempty"` +} + +// Create systemd unit name for cgroup scopes +func createUnitName(prefix string, name string) string { + return fmt.Sprintf("%s-%s.scope", prefix, name) +} + +// Bind ports to keep them closed on the host +func bindPorts(ports []ocicni.PortMapping) ([]*os.File, error) { + var files []*os.File + notifySCTP := false + for _, i := range ports { + switch i.Protocol { + case "udp": + addr, err := net.ResolveUDPAddr("udp", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort)) + if err != nil { + return nil, errors.Wrapf(err, "cannot resolve the UDP address") + } + + server, err := net.ListenUDP("udp", addr) + if err != nil { + return nil, errors.Wrapf(err, "cannot listen on the UDP port") + } + f, err := server.File() + if err != nil { + return nil, errors.Wrapf(err, "cannot get file for UDP socket") + } + files = append(files, f) + + case "tcp": + addr, err := net.ResolveTCPAddr("tcp4", 
fmt.Sprintf("%s:%d", i.HostIP, i.HostPort)) + if err != nil { + return nil, errors.Wrapf(err, "cannot resolve the TCP address") + } + + server, err := net.ListenTCP("tcp4", addr) + if err != nil { + return nil, errors.Wrapf(err, "cannot listen on the TCP port") + } + f, err := server.File() + if err != nil { + return nil, errors.Wrapf(err, "cannot get file for TCP socket") + } + files = append(files, f) + case "sctp": + if !notifySCTP { + notifySCTP = true + logrus.Warnf("port reservation for SCTP is not supported") + } + default: + return nil, fmt.Errorf("unknown protocol %s", i.Protocol) + + } + } + return files, nil +} + +func getOCIRuntimeError(runtimeMsg string) error { + r := strings.ToLower(runtimeMsg) + if match, _ := regexp.MatchString(".*permission denied.*|.*operation not permitted.*", r); match { + return errors.Wrapf(define.ErrOCIRuntimePermissionDenied, "%s", strings.Trim(runtimeMsg, "\n")) + } + if match, _ := regexp.MatchString(".*executable file not found in.*|.*no such file or directory.*", r); match { + return errors.Wrapf(define.ErrOCIRuntimeNotFound, "%s", strings.Trim(runtimeMsg, "\n")) + } + return errors.Wrapf(define.ErrOCIRuntime, "%s", strings.Trim(runtimeMsg, "\n")) +} diff --git a/libpod/pod_api.go b/libpod/pod_api.go index 7c786b835..3a194f04b 100644 --- a/libpod/pod_api.go +++ b/libpod/pod_api.go @@ -123,7 +123,7 @@ func (p *Pod) StopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m if timeout > -1 { stopTimeout = uint(timeout) } - if err := ctr.stop(stopTimeout); err != nil { + if err := ctr.stop(stopTimeout, false); err != nil { ctr.lock.Unlock() ctrErrors[ctr.ID()] = err continue @@ -370,7 +370,7 @@ func (p *Pod) Kill(signal uint) (map[string]error, error) { continue } - if err := ctr.ociRuntime.killContainer(ctr, signal); err != nil { + if err := ctr.ociRuntime.KillContainer(ctr, signal, false); err != nil { ctr.lock.Unlock() ctrErrors[ctr.ID()] = err continue diff --git a/libpod/runtime.go b/libpod/runtime.go index 
cdb5670ba..e961145f5 100644 --- a/libpod/runtime.go +++ b/libpod/runtime.go @@ -99,8 +99,8 @@ type Runtime struct { store storage.Store storageService *storageService imageContext *types.SystemContext - defaultOCIRuntime *OCIRuntime - ociRuntimes map[string]*OCIRuntime + defaultOCIRuntime OCIRuntime + ociRuntimes map[string]OCIRuntime netPlugin ocicni.CNIPlugin conmonPath string imageRuntime *image.Runtime @@ -1053,7 +1053,7 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (err error) { } // Get us at least one working OCI runtime. - runtime.ociRuntimes = make(map[string]*OCIRuntime) + runtime.ociRuntimes = make(map[string]OCIRuntime) // Is the old runtime_path defined? if runtime.config.RuntimePath != nil { @@ -1072,7 +1072,7 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (err error) { json := supportsJSON[name] nocgroups := supportsNoCgroups[name] - ociRuntime, err := newOCIRuntime(name, runtime.config.RuntimePath, runtime.conmonPath, runtime.config, json, nocgroups) + ociRuntime, err := newConmonOCIRuntime(name, runtime.config.RuntimePath, runtime.conmonPath, runtime.config, json, nocgroups) if err != nil { return err } @@ -1086,7 +1086,7 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (err error) { json := supportsJSON[name] nocgroups := supportsNoCgroups[name] - ociRuntime, err := newOCIRuntime(name, paths, runtime.conmonPath, runtime.config, json, nocgroups) + ociRuntime, err := newConmonOCIRuntime(name, paths, runtime.conmonPath, runtime.config, json, nocgroups) if err != nil { // Don't fatally error. 
// This will allow us to ship configs including optional @@ -1109,7 +1109,7 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (err error) { json := supportsJSON[name] nocgroups := supportsNoCgroups[name] - ociRuntime, err := newOCIRuntime(name, []string{runtime.config.OCIRuntime}, runtime.conmonPath, runtime.config, json, nocgroups) + ociRuntime, err := newConmonOCIRuntime(name, []string{runtime.config.OCIRuntime}, runtime.conmonPath, runtime.config, json, nocgroups) if err != nil { return err } @@ -1474,6 +1474,11 @@ func (r *Runtime) SystemContext() *types.SystemContext { return r.imageContext } +// GetOCIRuntimePath retrieves the path of the default OCI runtime. +func (r *Runtime) GetOCIRuntimePath() string { + return r.defaultOCIRuntime.Path() +} + // Since runc does not currently support cgroupV2 // Change to default crun on first running of libpod.conf // TODO Once runc has support for cgroups, this function should be removed. diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index 78176a400..c1d7571e2 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -102,7 +102,7 @@ func (r *Runtime) initContainerVariables(rSpec *spec.Spec, config *ContainerConf ctr.config.StopTimeout = define.CtrRemoveTimeout - ctr.config.OCIRuntime = r.defaultOCIRuntime.name + ctr.config.OCIRuntime = r.defaultOCIRuntime.Name() // Set namespace based on current runtime namespace // Do so before options run so they can override it @@ -167,8 +167,8 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (c *Contai // Check NoCgroups support if ctr.config.NoCgroups { - if !ctr.ociRuntime.supportsNoCgroups { - return nil, errors.Wrapf(define.ErrInvalidArg, "requested OCI runtime %s is not compatible with NoCgroups", ctr.ociRuntime.name) + if !ctr.ociRuntime.SupportsNoCgroups() { + return nil, errors.Wrapf(define.ErrInvalidArg, "requested OCI runtime %s is not compatible with NoCgroups", ctr.ociRuntime.Name()) } } @@ -430,7 +430,7 @@ func (r 
*Runtime) removeContainer(ctx context.Context, c *Container, force bool, } if c.state.State == define.ContainerStatePaused { - if err := c.ociRuntime.killContainer(c, 9); err != nil { + if err := c.ociRuntime.KillContainer(c, 9, false); err != nil { return err } if err := c.unpause(); err != nil { @@ -444,15 +444,15 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force bool, // Check that the container's in a good state to be removed if c.state.State == define.ContainerStateRunning { - if err := c.stop(c.StopTimeout()); err != nil { + if err := c.stop(c.StopTimeout(), true); err != nil { return errors.Wrapf(err, "cannot remove container %s as it could not be stopped", c.ID()) } } // Check that all of our exec sessions have finished - if len(c.state.ExecSessions) != 0 { - if err := c.ociRuntime.execStopContainer(c, c.StopTimeout()); err != nil { - return err + for _, session := range c.state.ExecSessions { + if err := c.ociRuntime.ExecStopContainer(c, session.ID, c.StopTimeout()); err != nil { + return errors.Wrapf(err, "error stopping exec session %s of container %s", session.ID, c.ID()) } } |