summary | refs | log | tree | commit | diff
path: root/libpod
diff options
context:
space:
mode:
Diffstat (limited to 'libpod')
-rw-r--r-- libpod/container.go 2
-rw-r--r-- libpod/container_api.go 39
-rw-r--r-- libpod/container_commit.go 4
-rw-r--r-- libpod/container_internal.go 78
-rw-r--r-- libpod/container_internal_linux.go 53
-rw-r--r-- libpod/container_internal_unsupported.go 4
-rw-r--r-- libpod/healthcheck.go 2
-rw-r--r-- libpod/info.go 45
-rw-r--r-- libpod/networking_linux.go 2
-rw-r--r-- libpod/oci.go 557
-rw-r--r-- libpod/oci_attach_linux.go 12
-rw-r--r-- libpod/oci_conmon_linux.go 1421
-rw-r--r-- libpod/oci_conmon_unsupported.go 130
-rw-r--r-- libpod/oci_internal_linux.go 556
-rw-r--r-- libpod/oci_linux.go 503
-rw-r--r-- libpod/oci_unsupported.go 47
-rw-r--r-- libpod/oci_util.go 113
-rw-r--r-- libpod/options.go 22
-rw-r--r-- libpod/pod_api.go 4
-rw-r--r-- libpod/runtime.go 21
-rw-r--r-- libpod/runtime_ctr.go 24
-rw-r--r-- libpod/runtime_migrate.go 26
22 files changed, 2008 insertions, 1657 deletions
diff --git a/libpod/container.go b/libpod/container.go
index f36ddbd3f..7be73b3c3 100644
--- a/libpod/container.go
+++ b/libpod/container.go
@@ -129,7 +129,7 @@ type Container struct {
valid bool
lock lock.Locker
runtime *Runtime
- ociRuntime *OCIRuntime
+ ociRuntime OCIRuntime
rootlessSlirpSyncR *os.File
rootlessSlirpSyncW *os.File
diff --git a/libpod/container_api.go b/libpod/container_api.go
index 4f0d5301c..04c796410 100644
--- a/libpod/container_api.go
+++ b/libpod/container_api.go
@@ -187,7 +187,7 @@ func (c *Container) StopWithTimeout(timeout uint) error {
return define.ErrCtrStopped
}
- return c.stop(timeout)
+ return c.stop(timeout, false)
}
// Kill sends a signal to a container
@@ -205,13 +205,15 @@ func (c *Container) Kill(signal uint) error {
return errors.Wrapf(define.ErrCtrStateInvalid, "can only kill running containers. %s is in state %s", c.ID(), c.state.State.String())
}
- defer c.newContainerEvent(events.Kill)
- if err := c.ociRuntime.killContainer(c, signal); err != nil {
+ // Hardcode all = false, we only use all when removing.
+ if err := c.ociRuntime.KillContainer(c, signal, false); err != nil {
return err
}
c.state.StoppedByUser = true
+ c.newContainerEvent(events.Kill)
+
return c.save()
}
@@ -221,7 +223,7 @@ func (c *Container) Kill(signal uint) error {
// Sometimes, the $RUNTIME exec call errors, and if that is the case, the exit code is the exit code of the call.
// Otherwise, the exit code will be the exit code of the executed call inside of the container.
// TODO investigate allowing exec without attaching
-func (c *Container) Exec(tty, privileged bool, env, cmd []string, user, workDir string, streams *AttachStreams, preserveFDs int, resize chan remotecommand.TerminalSize, detachKeys string) (int, error) {
+func (c *Container) Exec(tty, privileged bool, env map[string]string, cmd []string, user, workDir string, streams *AttachStreams, preserveFDs uint, resize chan remotecommand.TerminalSize, detachKeys string) (int, error) {
var capList []string
if !c.batched {
c.lock.Lock()
@@ -278,7 +280,19 @@ func (c *Container) Exec(tty, privileged bool, env, cmd []string, user, workDir
user = c.config.User
}
- pid, attachChan, err := c.ociRuntime.execContainer(c, cmd, capList, env, tty, workDir, user, sessionID, streams, preserveFDs, resize, detachKeys)
+ opts := new(ExecOptions)
+ opts.Cmd = cmd
+ opts.CapAdd = capList
+ opts.Env = env
+ opts.Terminal = tty
+ opts.Cwd = workDir
+ opts.User = user
+ opts.Streams = streams
+ opts.PreserveFDs = preserveFDs
+ opts.Resize = resize
+ opts.DetachKeys = detachKeys
+
+ pid, attachChan, err := c.ociRuntime.ExecContainer(c, sessionID, opts)
if err != nil {
ec := define.ExecErrorCodeGeneric
// Conmon will pass a non-zero exit code from the runtime as a pid here.
@@ -524,7 +538,10 @@ func (c *Container) WaitWithInterval(waitTimeout time.Duration) (int32, error) {
return -1, define.ErrCtrRemoved
}
- exitFile := c.exitFilePath()
+ exitFile, err := c.exitFilePath()
+ if err != nil {
+ return -1, err
+ }
chWait := make(chan error, 1)
defer close(chWait)
@@ -639,7 +656,7 @@ func (c *Container) Sync() error {
(c.state.State != define.ContainerStateConfigured) &&
(c.state.State != define.ContainerStateExited) {
oldState := c.state.State
- if err := c.ociRuntime.updateContainerStatus(c, true); err != nil {
+ if err := c.ociRuntime.UpdateContainerStatus(c, true); err != nil {
return err
}
// Only save back to DB if state changed
@@ -687,7 +704,7 @@ func (c *Container) Refresh(ctx context.Context) error {
// Next, if the container is running, stop it
if c.state.State == define.ContainerStateRunning {
- if err := c.stop(c.config.StopTimeout); err != nil {
+ if err := c.stop(c.config.StopTimeout, false); err != nil {
return err
}
}
@@ -696,8 +713,10 @@ func (c *Container) Refresh(ctx context.Context) error {
if len(c.state.ExecSessions) > 0 {
logrus.Infof("Killing %d exec sessions in container %s. They will not be restored after refresh.",
len(c.state.ExecSessions), c.ID())
- if err := c.ociRuntime.execStopContainer(c, c.config.StopTimeout); err != nil {
- return err
+ }
+ for _, session := range c.state.ExecSessions {
+ if err := c.ociRuntime.ExecStopContainer(c, session.ID, c.StopTimeout()); err != nil {
+ return errors.Wrapf(err, "error stopping exec session %s of container %s", session.ID, c.ID())
}
}
diff --git a/libpod/container_commit.go b/libpod/container_commit.go
index 570d406b7..d5afe0da7 100644
--- a/libpod/container_commit.go
+++ b/libpod/container_commit.go
@@ -50,11 +50,11 @@ func (c *Container) Commit(ctx context.Context, destImage string, options Contai
}
if c.state.State == define.ContainerStateRunning && options.Pause {
- if err := c.ociRuntime.pauseContainer(c); err != nil {
+ if err := c.pause(); err != nil {
return nil, errors.Wrapf(err, "error pausing container %q", c.ID())
}
defer func() {
- if err := c.ociRuntime.unpauseContainer(c); err != nil {
+ if err := c.unpause(); err != nil {
logrus.Errorf("error unpausing container %q: %v", c.ID(), err)
}
}()
diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index ac921d737..e7f541c52 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -131,13 +131,13 @@ func (c *Container) CheckpointPath() string {
}
// AttachSocketPath retrieves the path of the container's attach socket
-func (c *Container) AttachSocketPath() string {
- return filepath.Join(c.ociRuntime.socketsDir, c.ID(), "attach")
+func (c *Container) AttachSocketPath() (string, error) {
+ return c.ociRuntime.AttachSocketPath(c)
}
// exitFilePath gets the path to the container's exit file
-func (c *Container) exitFilePath() string {
- return filepath.Join(c.ociRuntime.exitsDir, c.ID())
+func (c *Container) exitFilePath() (string, error) {
+ return c.ociRuntime.ExitFilePath(c)
}
// create a bundle path and associated files for an exec session
@@ -167,12 +167,8 @@ func (c *Container) cleanupExecBundle(sessionID string) error {
if err := os.RemoveAll(c.execBundlePath(sessionID)); err != nil && !os.IsNotExist(err) {
return err
}
- // Clean up the sockets dir. Issue #3962
- // Also ignore if it doesn't exist for some reason; hence the conditional return below
- if err := os.RemoveAll(filepath.Join(c.ociRuntime.socketsDir, sessionID)); err != nil && !os.IsNotExist(err) {
- return err
- }
- return nil
+
+ return c.ociRuntime.ExecContainerCleanup(c, sessionID)
}
// the path to a containers exec session bundle
@@ -191,8 +187,8 @@ func (c *Container) execLogPath(sessionID string) string {
}
// the socket conmon creates for an exec session
-func (c *Container) execAttachSocketPath(sessionID string) string {
- return filepath.Join(c.ociRuntime.socketsDir, sessionID, "attach")
+func (c *Container) execAttachSocketPath(sessionID string) (string, error) {
+ return c.ociRuntime.ExecAttachSocketPath(c, sessionID)
}
// execExitFileDir gets the path to the container's exit file
@@ -202,7 +198,7 @@ func (c *Container) execExitFileDir(sessionID string) string {
// execOCILog returns the file path for the exec sessions oci log
func (c *Container) execOCILog(sessionID string) string {
- if !c.ociRuntime.supportsJSON {
+ if !c.ociRuntime.SupportsJSONErrors() {
return ""
}
return filepath.Join(c.execBundlePath(sessionID), "oci-log")
@@ -233,12 +229,15 @@ func (c *Container) readExecExitCode(sessionID string) (int, error) {
// Wait for the container's exit file to appear.
// When it does, update our state based on it.
func (c *Container) waitForExitFileAndSync() error {
- exitFile := c.exitFilePath()
+ exitFile, err := c.exitFilePath()
+ if err != nil {
+ return err
+ }
chWait := make(chan error)
defer close(chWait)
- _, err := WaitForFile(exitFile, chWait, time.Second*5)
+ _, err = WaitForFile(exitFile, chWait, time.Second*5)
if err != nil {
// Exit file did not appear
// Reset our state
@@ -253,7 +252,7 @@ func (c *Container) waitForExitFileAndSync() error {
return err
}
- if err := c.ociRuntime.updateContainerStatus(c, false); err != nil {
+ if err := c.ociRuntime.UpdateContainerStatus(c, false); err != nil {
return err
}
@@ -388,7 +387,7 @@ func (c *Container) syncContainer() error {
(c.state.State != define.ContainerStateExited) {
oldState := c.state.State
// TODO: optionally replace this with a stat for the exit file
- if err := c.ociRuntime.updateContainerStatus(c, false); err != nil {
+ if err := c.ociRuntime.UpdateContainerStatus(c, false); err != nil {
return err
}
// Only save back to DB if state changed
@@ -649,7 +648,10 @@ func (c *Container) removeConmonFiles() error {
}
// Remove the exit file so we don't leak memory in tmpfs
- exitFile := filepath.Join(c.ociRuntime.exitsDir, c.ID())
+ exitFile, err := c.exitFilePath()
+ if err != nil {
+ return err
+ }
if err := os.Remove(exitFile); err != nil && !os.IsNotExist(err) {
return errors.Wrapf(err, "error removing container %s exit file", c.ID())
}
@@ -938,9 +940,13 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error {
}
// With the spec complete, do an OCI create
- if err := c.ociRuntime.createContainer(c, nil); err != nil {
+ if err := c.ociRuntime.CreateContainer(c, nil); err != nil {
+ // Fedora 31 is carrying a patch to display improved error
+ // messages to better handle the V2 transition. This is NOT
+ // upstream in any OCI runtime.
+ // TODO: Remove once runc supports cgroupsv2
if strings.Contains(err.Error(), "this version of runc doesn't work on cgroups v2") {
- logrus.Errorf("oci runtime %q does not support CGroups V2: use system migrate to mitigate", c.ociRuntime.name)
+ logrus.Errorf("oci runtime %q does not support CGroups V2: use system migrate to mitigate", c.ociRuntime.Name())
}
return err
}
@@ -1088,7 +1094,7 @@ func (c *Container) start() error {
logrus.Debugf("Starting container %s with command %v", c.ID(), c.config.Spec.Process.Args)
}
- if err := c.ociRuntime.startContainer(c); err != nil {
+ if err := c.ociRuntime.StartContainer(c); err != nil {
return err
}
logrus.Debugf("Started container %s", c.ID())
@@ -1110,10 +1116,28 @@ func (c *Container) start() error {
}
// Internal, non-locking function to stop container
-func (c *Container) stop(timeout uint) error {
+func (c *Container) stop(timeout uint, all bool) error {
logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
- if err := c.ociRuntime.stopContainer(c, timeout); err != nil {
+ // We can't use --all if CGroups aren't present.
+ // Rootless containers with CGroups v1 and NoCgroups are both cases
+ // where this can happen.
+ if all {
+ if c.config.NoCgroups {
+ all = false
+ } else if rootless.IsRootless() {
+ // Only do this check if we need to
+ unified, err := cgroups.IsCgroup2UnifiedMode()
+ if err != nil {
+ return err
+ }
+ if !unified {
+ all = false
+ }
+ }
+ }
+
+ if err := c.ociRuntime.StopContainer(c, timeout, all); err != nil {
return err
}
@@ -1150,7 +1174,7 @@ func (c *Container) pause() error {
}
}
- if err := c.ociRuntime.pauseContainer(c); err != nil {
+ if err := c.ociRuntime.PauseContainer(c); err != nil {
return err
}
@@ -1167,7 +1191,7 @@ func (c *Container) unpause() error {
return errors.Wrapf(define.ErrNoCgroups, "cannot unpause without using CGroups")
}
- if err := c.ociRuntime.unpauseContainer(c); err != nil {
+ if err := c.ociRuntime.UnpauseContainer(c); err != nil {
return err
}
@@ -1188,7 +1212,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (err e
if c.state.State == define.ContainerStateRunning {
conmonPID := c.state.ConmonPID
- if err := c.stop(timeout); err != nil {
+ if err := c.stop(timeout, false); err != nil {
return err
}
// Old versions of conmon have a bug where they create the exit file before
@@ -1475,7 +1499,7 @@ func (c *Container) delete(ctx context.Context) (err error) {
span.SetTag("struct", "container")
defer span.Finish()
- if err := c.ociRuntime.deleteContainer(c); err != nil {
+ if err := c.ociRuntime.DeleteContainer(c); err != nil {
return errors.Wrapf(err, "error removing container %s from runtime", c.ID())
}
diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go
index 2636fdb6c..b7d353327 100644
--- a/libpod/container_internal_linux.go
+++ b/libpod/container_internal_linux.go
@@ -419,27 +419,11 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) {
g.AddProcessEnv("container", "libpod")
}
- unified, err := cgroups.IsCgroup2UnifiedMode()
+ cgroupPath, err := c.getOCICgroupPath()
if err != nil {
return nil, err
}
- if (rootless.IsRootless() && !unified) || c.config.NoCgroups {
- g.SetLinuxCgroupsPath("")
- } else if c.runtime.config.CgroupManager == SystemdCgroupsManager {
- // When runc is set to use Systemd as a cgroup manager, it
- // expects cgroups to be passed as follows:
- // slice:prefix:name
- systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID())
- logrus.Debugf("Setting CGroups for container %s to %s", c.ID(), systemdCgroups)
- g.SetLinuxCgroupsPath(systemdCgroups)
- } else {
- cgroupPath, err := c.CGroupPath()
- if err != nil {
- return nil, err
- }
- logrus.Debugf("Setting CGroup path for container %s to %s", c.ID(), cgroupPath)
- g.SetLinuxCgroupsPath(cgroupPath)
- }
+ g.SetLinuxCgroupsPath(cgroupPath)
// Mounts need to be sorted so paths will not cover other paths
mounts := sortMounts(g.Mounts())
@@ -659,7 +643,7 @@ func (c *Container) checkpointRestoreSupported() (err error) {
if !criu.CheckForCriu() {
return errors.Errorf("Checkpoint/Restore requires at least CRIU %d", criu.MinCriuVersion)
}
- if !c.ociRuntime.featureCheckCheckpointing() {
+ if !c.ociRuntime.SupportsCheckpoint() {
return errors.Errorf("Configured runtime does not support checkpoint/restore")
}
return nil
@@ -695,7 +679,7 @@ func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointO
return err
}
- if err := c.ociRuntime.checkpointContainer(c, options); err != nil {
+ if err := c.ociRuntime.CheckpointContainer(c, options); err != nil {
return err
}
@@ -923,7 +907,7 @@ func (c *Container) restore(ctx context.Context, options ContainerCheckpointOpti
}
}
- if err := c.ociRuntime.createContainer(c, &options); err != nil {
+ if err := c.ociRuntime.CreateContainer(c, &options); err != nil {
return err
}
@@ -1332,3 +1316,30 @@ func (c *Container) refreshCNI() error {
podNetwork := c.runtime.getPodNetwork(c.ID(), c.config.Name, "", c.config.Networks, c.config.PortMappings, c.config.StaticIP)
return c.runtime.netPlugin.TearDownPod(podNetwork)
}
+
+// Get cgroup path in a format suitable for the OCI spec
+func (c *Container) getOCICgroupPath() (string, error) {
+ unified, err := cgroups.IsCgroup2UnifiedMode()
+ if err != nil {
+ return "", err
+ }
+ if (rootless.IsRootless() && !unified) || c.config.NoCgroups {
+ return "", nil
+ } else if c.runtime.config.CgroupManager == SystemdCgroupsManager {
+ // When runc is set to use Systemd as a cgroup manager, it
+ // expects cgroups to be passed as follows:
+ // slice:prefix:name
+ systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID())
+ logrus.Debugf("Setting CGroups for container %s to %s", c.ID(), systemdCgroups)
+ return systemdCgroups, nil
+ } else if c.runtime.config.CgroupManager == CgroupfsCgroupsManager {
+ cgroupPath, err := c.CGroupPath()
+ if err != nil {
+ return "", err
+ }
+ logrus.Debugf("Setting CGroup path for container %s to %s", c.ID(), cgroupPath)
+ return cgroupPath, nil
+ } else {
+ return "", errors.Wrapf(define.ErrInvalidArg, "invalid cgroup manager %s requested", c.runtime.config.CgroupManager)
+ }
+}
diff --git a/libpod/container_internal_unsupported.go b/libpod/container_internal_unsupported.go
index 05a587c59..4abaa6362 100644
--- a/libpod/container_internal_unsupported.go
+++ b/libpod/container_internal_unsupported.go
@@ -44,3 +44,7 @@ func (c *Container) copyOwnerAndPerms(source, dest string) error {
func (c *Container) refreshCNI() error {
return define.ErrNotImplemented
}
+
+func (c *Container) getOCICgroupPath() (string, error) {
+ return "", define.ErrNotImplemented
+}
diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go
index 0338828e4..68ffc2349 100644
--- a/libpod/healthcheck.go
+++ b/libpod/healthcheck.go
@@ -141,7 +141,7 @@ func (c *Container) runHealthCheck() (HealthCheckStatus, error) {
logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
timeStart := time.Now()
hcResult := HealthCheckSuccess
- _, hcErr := c.Exec(false, false, []string{}, newCommand, "", "", streams, 0, nil, "")
+ _, hcErr := c.Exec(false, false, map[string]string{}, newCommand, "", "", streams, 0, nil, "")
if hcErr != nil {
errCause := errors.Cause(hcErr)
hcResult = HealthCheckFailure
diff --git a/libpod/info.go b/libpod/info.go
index 6caa87038..e5c075d97 100644
--- a/libpod/info.go
+++ b/libpod/info.go
@@ -15,7 +15,6 @@ import (
"github.com/containers/buildah"
"github.com/containers/libpod/pkg/cgroups"
"github.com/containers/libpod/pkg/rootless"
- "github.com/containers/libpod/utils"
"github.com/containers/storage"
"github.com/containers/storage/pkg/system"
"github.com/pkg/errors"
@@ -48,14 +47,7 @@ func (r *Runtime) hostInfo() (map[string]interface{}, error) {
info["MemFree"] = mi.MemFree
info["SwapTotal"] = mi.SwapTotal
info["SwapFree"] = mi.SwapFree
- conmonVersion, _ := r.GetConmonVersion()
- ociruntimeVersion, _ := r.GetOCIRuntimeVersion()
hostDistributionInfo := r.GetHostDistributionInfo()
- info["Conmon"] = map[string]interface{}{
- "path": r.conmonPath,
- "package": r.defaultOCIRuntime.conmonPackage(),
- "version": conmonVersion,
- }
if rootless.IsRootless() {
if path, err := exec.LookPath("slirp4netns"); err == nil {
logrus.Warnf("Failed to retrieve program version for %s: %v", path, err)
@@ -82,11 +74,6 @@ func (r *Runtime) hostInfo() (map[string]interface{}, error) {
idmappings["gidmap"] = gidmappings
info["IDMappings"] = idmappings
}
- info["OCIRuntime"] = map[string]interface{}{
- "path": r.defaultOCIRuntime.path,
- "package": r.defaultOCIRuntime.pathPackage(),
- "version": ociruntimeVersion,
- }
info["Distribution"] = map[string]interface{}{
"distribution": hostDistributionInfo["Distribution"],
"version": hostDistributionInfo["Version"],
@@ -98,6 +85,15 @@ func (r *Runtime) hostInfo() (map[string]interface{}, error) {
}
info["kernel"] = kv
+ runtimeInfo, err := r.defaultOCIRuntime.RuntimeInfo()
+ if err != nil {
+ logrus.Errorf("Error getting info on OCI runtime %s: %v", r.defaultOCIRuntime.Name(), err)
+ } else {
+ for k, v := range runtimeInfo {
+ info[k] = v
+ }
+ }
+
up, err := readUptime()
if err != nil {
return nil, errors.Wrapf(err, "error reading up time")
@@ -228,29 +224,6 @@ func readUptime() (string, error) {
return string(f[0]), nil
}
-// GetConmonVersion returns a string representation of the conmon version
-func (r *Runtime) GetConmonVersion() (string, error) {
- output, err := utils.ExecCmd(r.conmonPath, "--version")
- if err != nil {
- return "", err
- }
- return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil
-}
-
-// GetOCIRuntimePath returns the path to the OCI Runtime Path the runtime is using
-func (r *Runtime) GetOCIRuntimePath() string {
- return r.defaultOCIRuntime.path
-}
-
-// GetOCIRuntimeVersion returns a string representation of the oci runtimes version
-func (r *Runtime) GetOCIRuntimeVersion() (string, error) {
- output, err := utils.ExecCmd(r.GetOCIRuntimePath(), "--version")
- if err != nil {
- return "", err
- }
- return strings.TrimSuffix(output, "\n"), nil
-}
-
// GetHostDistributionInfo returns a map containing the host's distribution and version
func (r *Runtime) GetHostDistributionInfo() map[string]string {
dist := make(map[string]string)
diff --git a/libpod/networking_linux.go b/libpod/networking_linux.go
index 61ab57d65..8181cbc8a 100644
--- a/libpod/networking_linux.go
+++ b/libpod/networking_linux.go
@@ -157,7 +157,7 @@ func (r *Runtime) setupRootlessNetNS(ctr *Container) (err error) {
defer errorhandling.CloseQuiet(syncW)
havePortMapping := len(ctr.Config().PortMappings) > 0
- apiSocket := filepath.Join(ctr.ociRuntime.tmpDir, fmt.Sprintf("%s.net", ctr.config.ID))
+ apiSocket := filepath.Join(ctr.runtime.config.TmpDir, fmt.Sprintf("%s.net", ctr.config.ID))
cmdArgs := []string{}
if havePortMapping {
diff --git a/libpod/oci.go b/libpod/oci.go
index 9879fa90e..37d04349f 100644
--- a/libpod/oci.go
+++ b/libpod/oci.go
@@ -1,441 +1,132 @@
package libpod
import (
- "bytes"
- "fmt"
- "io/ioutil"
- "net"
- "os"
- "os/exec"
- "path/filepath"
- "strings"
- "time"
-
- "github.com/containers/libpod/libpod/define"
- "github.com/containers/libpod/pkg/util"
- "github.com/cri-o/ocicni/pkg/ocicni"
- spec "github.com/opencontainers/runtime-spec/specs-go"
- "github.com/opencontainers/selinux/go-selinux/label"
- "github.com/pkg/errors"
- "github.com/sirupsen/logrus"
-
- // TODO import these functions into libpod and remove the import
- // Trying to keep libpod from depending on CRI-O code
- "github.com/containers/libpod/utils"
-)
-
-// OCI code is undergoing heavy rewrite
-
-const (
- // CgroupfsCgroupsManager represents cgroupfs native cgroup manager
- CgroupfsCgroupsManager = "cgroupfs"
- // SystemdCgroupsManager represents systemd native cgroup manager
- SystemdCgroupsManager = "systemd"
-
- // ContainerCreateTimeout represents the value of container creating timeout
- ContainerCreateTimeout = 240 * time.Second
-
- // Timeout before declaring that runtime has failed to kill a given
- // container
- killContainerTimeout = 5 * time.Second
- // DefaultShmSize is the default shm size
- DefaultShmSize = 64 * 1024 * 1024
- // NsRunDir is the default directory in which running network namespaces
- // are stored
- NsRunDir = "/var/run/netns"
+ "k8s.io/client-go/tools/remotecommand"
)
-// OCIRuntime represents an OCI-compatible runtime that libpod can call into
-// to perform container operations
-type OCIRuntime struct {
- name string
- path string
- conmonPath string
- conmonEnv []string
- cgroupManager string
- tmpDir string
- exitsDir string
- socketsDir string
- logSizeMax int64
- noPivot bool
- reservePorts bool
- supportsJSON bool
- supportsNoCgroups bool
- sdNotify bool
-}
-
-// ociError is used to parse the OCI runtime JSON log. It is not part of the
-// OCI runtime specifications, it follows what runc does
-type ociError struct {
- Level string `json:"level,omitempty"`
- Time string `json:"time,omitempty"`
- Msg string `json:"msg,omitempty"`
-}
-
-// Make a new OCI runtime with provided options.
-// The first path that points to a valid executable will be used.
-func newOCIRuntime(name string, paths []string, conmonPath string, runtimeCfg *RuntimeConfig, supportsJSON, supportsNoCgroups bool) (*OCIRuntime, error) {
- if name == "" {
- return nil, errors.Wrapf(define.ErrInvalidArg, "the OCI runtime must be provided a non-empty name")
- }
-
- runtime := new(OCIRuntime)
- runtime.name = name
- runtime.conmonPath = conmonPath
-
- runtime.conmonEnv = runtimeCfg.ConmonEnvVars
- runtime.cgroupManager = runtimeCfg.CgroupManager
- runtime.tmpDir = runtimeCfg.TmpDir
- runtime.logSizeMax = runtimeCfg.MaxLogSize
- runtime.noPivot = runtimeCfg.NoPivotRoot
- runtime.reservePorts = runtimeCfg.EnablePortReservation
- runtime.sdNotify = runtimeCfg.SDNotify
-
- // TODO: probe OCI runtime for feature and enable automatically if
- // available.
- runtime.supportsJSON = supportsJSON
- runtime.supportsNoCgroups = supportsNoCgroups
-
- foundPath := false
- for _, path := range paths {
- stat, err := os.Stat(path)
- if err != nil {
- if os.IsNotExist(err) {
- continue
- }
- return nil, errors.Wrapf(err, "cannot stat %s", path)
- }
- if !stat.Mode().IsRegular() {
- continue
- }
- foundPath = true
- runtime.path = path
- logrus.Debugf("using runtime %q", path)
- break
- }
-
- // Search the $PATH as last fallback
- if !foundPath {
- if foundRuntime, err := exec.LookPath(name); err == nil {
- foundPath = true
- runtime.path = foundRuntime
- logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime)
- }
- }
-
- if !foundPath {
- return nil, errors.Wrapf(define.ErrInvalidArg, "no valid executable found for OCI runtime %s", name)
- }
-
- runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits")
- runtime.socketsDir = filepath.Join(runtime.tmpDir, "socket")
-
- if runtime.cgroupManager != CgroupfsCgroupsManager && runtime.cgroupManager != SystemdCgroupsManager {
- return nil, errors.Wrapf(define.ErrInvalidArg, "invalid cgroup manager specified: %s", runtime.cgroupManager)
- }
-
- // Create the exit files and attach sockets directories
- if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil {
- // The directory is allowed to exist
- if !os.IsExist(err) {
- return nil, errors.Wrapf(err, "error creating OCI runtime exit files directory %s",
- runtime.exitsDir)
- }
- }
- if err := os.MkdirAll(runtime.socketsDir, 0750); err != nil {
- // The directory is allowed to exist
- if !os.IsExist(err) {
- return nil, errors.Wrapf(err, "error creating OCI runtime attach sockets directory %s",
- runtime.socketsDir)
- }
- }
-
- return runtime, nil
-}
-
-// Create systemd unit name for cgroup scopes
-func createUnitName(prefix string, name string) string {
- return fmt.Sprintf("%s-%s.scope", prefix, name)
-}
-
-func bindPorts(ports []ocicni.PortMapping) ([]*os.File, error) {
- var files []*os.File
- notifySCTP := false
- for _, i := range ports {
- switch i.Protocol {
- case "udp":
- addr, err := net.ResolveUDPAddr("udp", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort))
- if err != nil {
- return nil, errors.Wrapf(err, "cannot resolve the UDP address")
- }
-
- server, err := net.ListenUDP("udp", addr)
- if err != nil {
- return nil, errors.Wrapf(err, "cannot listen on the UDP port")
- }
- f, err := server.File()
- if err != nil {
- return nil, errors.Wrapf(err, "cannot get file for UDP socket")
- }
- files = append(files, f)
-
- case "tcp":
- addr, err := net.ResolveTCPAddr("tcp4", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort))
- if err != nil {
- return nil, errors.Wrapf(err, "cannot resolve the TCP address")
- }
-
- server, err := net.ListenTCP("tcp4", addr)
- if err != nil {
- return nil, errors.Wrapf(err, "cannot listen on the TCP port")
- }
- f, err := server.File()
- if err != nil {
- return nil, errors.Wrapf(err, "cannot get file for TCP socket")
- }
- files = append(files, f)
- case "sctp":
- if !notifySCTP {
- notifySCTP = true
- logrus.Warnf("port reservation for SCTP is not supported")
- }
- default:
- return nil, fmt.Errorf("unknown protocol %s", i.Protocol)
-
- }
- }
- return files, nil
-}
-
-// updateContainerStatus retrieves the current status of the container from the
-// runtime. It updates the container's state but does not save it.
-// If useRunc is false, we will not directly hit runc to see the container's
-// status, but will instead only check for the existence of the conmon exit file
-// and update state to stopped if it exists.
-func (r *OCIRuntime) updateContainerStatus(ctr *Container, useRuntime bool) error {
- exitFile := ctr.exitFilePath()
-
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
-
- // If not using the OCI runtime, we don't need to do most of this.
- if !useRuntime {
- // If the container's not running, nothing to do.
- if ctr.state.State != define.ContainerStateRunning && ctr.state.State != define.ContainerStatePaused {
- return nil
- }
-
- // Check for the exit file conmon makes
- info, err := os.Stat(exitFile)
- if err != nil {
- if os.IsNotExist(err) {
- // Container is still running, no error
- return nil
- }
-
- return errors.Wrapf(err, "error running stat on container %s exit file", ctr.ID())
- }
-
- // Alright, it exists. Transition to Stopped state.
- ctr.state.State = define.ContainerStateStopped
- ctr.state.PID = 0
- ctr.state.ConmonPID = 0
-
- // Read the exit file to get our stopped time and exit code.
- return ctr.handleExitFile(exitFile, info)
- }
-
- // Store old state so we know if we were already stopped
- oldState := ctr.state.State
-
- state := new(spec.State)
-
- cmd := exec.Command(r.path, "state", ctr.ID())
- cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir))
-
- outPipe, err := cmd.StdoutPipe()
- if err != nil {
- return errors.Wrapf(err, "getting stdout pipe")
- }
- errPipe, err := cmd.StderrPipe()
- if err != nil {
- return errors.Wrapf(err, "getting stderr pipe")
- }
-
- if err := cmd.Start(); err != nil {
- out, err2 := ioutil.ReadAll(errPipe)
- if err2 != nil {
- return errors.Wrapf(err, "error getting container %s state", ctr.ID())
- }
- if strings.Contains(string(out), "does not exist") {
- if err := ctr.removeConmonFiles(); err != nil {
- logrus.Debugf("unable to remove conmon files for container %s", ctr.ID())
- }
- ctr.state.ExitCode = -1
- ctr.state.FinishedTime = time.Now()
- ctr.state.State = define.ContainerStateExited
- return nil
- }
- return errors.Wrapf(err, "error getting container %s state. stderr/out: %s", ctr.ID(), out)
- }
- defer func() {
- _ = cmd.Wait()
- }()
-
- if err := errPipe.Close(); err != nil {
- return err
- }
- out, err := ioutil.ReadAll(outPipe)
- if err != nil {
- return errors.Wrapf(err, "error reading stdout: %s", ctr.ID())
- }
- if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil {
- return errors.Wrapf(err, "error decoding container status for container %s", ctr.ID())
- }
- ctr.state.PID = state.Pid
-
- switch state.Status {
- case "created":
- ctr.state.State = define.ContainerStateCreated
- case "paused":
- ctr.state.State = define.ContainerStatePaused
- case "running":
- ctr.state.State = define.ContainerStateRunning
- case "stopped":
- ctr.state.State = define.ContainerStateStopped
- default:
- return errors.Wrapf(define.ErrInternal, "unrecognized status returned by runtime for container %s: %s",
- ctr.ID(), state.Status)
- }
-
- // Only grab exit status if we were not already stopped
- // If we were, it should already be in the database
- if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped {
- var fi os.FileInfo
- chWait := make(chan error)
- defer close(chWait)
-
- _, err := WaitForFile(exitFile, chWait, time.Second*5)
- if err == nil {
- fi, err = os.Stat(exitFile)
- }
- if err != nil {
- ctr.state.ExitCode = -1
- ctr.state.FinishedTime = time.Now()
- logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err)
- return nil
- }
-
- return ctr.handleExitFile(exitFile, fi)
- }
-
- return nil
-}
-
-// startContainer starts the given container
-// Sets time the container was started, but does not save it.
-func (r *OCIRuntime) startContainer(ctr *Container) error {
- // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers?
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok {
- env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify))
- }
- if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "start", ctr.ID()); err != nil {
- return err
- }
-
- ctr.state.StartedTime = time.Now()
-
- return nil
-}
-
-// killContainer sends the given signal to the given container
-func (r *OCIRuntime) killContainer(ctr *Container, signal uint) error {
- logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID())
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", ctr.ID(), fmt.Sprintf("%d", signal)); err != nil {
- return errors.Wrapf(err, "error sending signal to container %s", ctr.ID())
- }
-
- return nil
-}
-
-// deleteContainer deletes a container from the OCI runtime
-func (r *OCIRuntime) deleteContainer(ctr *Container) error {
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "delete", "--force", ctr.ID())
-}
-
-// pauseContainer pauses the given container
-func (r *OCIRuntime) pauseContainer(ctr *Container) error {
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "pause", ctr.ID())
-}
-
-// unpauseContainer unpauses the given container
-func (r *OCIRuntime) unpauseContainer(ctr *Container) error {
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "resume", ctr.ID())
-}
-
-// checkpointContainer checkpoints the given container
-func (r *OCIRuntime) checkpointContainer(ctr *Container, options ContainerCheckpointOptions) error {
- if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil {
- return err
- }
- // imagePath is used by CRIU to store the actual checkpoint files
- imagePath := ctr.CheckpointPath()
- // workPath will be used to store dump.log and stats-dump
- workPath := ctr.bundlePath()
- logrus.Debugf("Writing checkpoint to %s", imagePath)
- logrus.Debugf("Writing checkpoint logs to %s", workPath)
- args := []string{}
- args = append(args, "checkpoint")
- args = append(args, "--image-path")
- args = append(args, imagePath)
- args = append(args, "--work-path")
- args = append(args, workPath)
- if options.KeepRunning {
- args = append(args, "--leave-running")
- }
- if options.TCPEstablished {
- args = append(args, "--tcp-established")
- }
- args = append(args, ctr.ID())
- return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...)
+// OCIRuntime is an implementation of an OCI runtime.
+// The OCI runtime implementation is expected to be a fairly thin wrapper around
+// the actual runtime, and is not expected to include things like state
+// management logic - e.g., we do not expect it to determine on its own that
+// calling 'UnpauseContainer()' on a container that is not paused is an error.
+// The code calling the OCIRuntime will manage this.
+// TODO: May want to move the Attach() code under this umbrella. It's highly OCI
+// runtime dependent.
+// TODO: May want to move the conmon cleanup code here too - it depends on
+// Conmon being in use.
+type OCIRuntime interface {
+ // Name returns the name of the runtime.
+ Name() string
+ // Path returns the path to the runtime executable.
+ Path() string
+
+ // CreateContainer creates the container in the OCI runtime.
+ CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) error
+ // UpdateContainerStatus updates the status of the given container.
+ // It includes a switch for whether to perform a hard query of the
+ // runtime. If unset, the exit file (if supported by the implementation)
+ // will be used.
+ UpdateContainerStatus(ctr *Container, useRuntime bool) error
+ // StartContainer starts the given container.
+ StartContainer(ctr *Container) error
+ // KillContainer sends the given signal to the given container.
+ // If all is set, all processes in the container will be signalled;
+ // otherwise, only init will be signalled.
+ KillContainer(ctr *Container, signal uint, all bool) error
+ // StopContainer stops the given container.
+ // The container's stop signal (or SIGTERM if unspecified) will be sent
+ // first.
+ // After the given timeout, SIGKILL will be sent.
+ // If the given timeout is 0, SIGKILL will be sent immediately, and the
+ // stop signal will be omitted.
+ // If all is set, we will attempt to use the --all flag with `kill` in
+ // the OCI runtime to kill all processes in the container, including
+ // exec sessions. This is only supported if the container has cgroups.
+ StopContainer(ctr *Container, timeout uint, all bool) error
+ // DeleteContainer deletes the given container from the OCI runtime.
+ DeleteContainer(ctr *Container) error
+ // PauseContainer pauses the given container.
+ PauseContainer(ctr *Container) error
+ // UnpauseContainer unpauses the given container.
+ UnpauseContainer(ctr *Container) error
+
+ // ExecContainer executes a command in a running container.
+ // Returns an int (exit code), error channel (errors from attach), and
+ // error (errors that occurred attempting to start the exec session).
+ ExecContainer(ctr *Container, sessionID string, options *ExecOptions) (int, chan error, error)
+ // ExecStopContainer stops a given exec session in a running container.
+ // SIGTERM will be sent initially, then SIGKILL after the given timeout.
+ // If timeout is 0, SIGKILL will be sent immediately, and SIGTERM will
+ // be omitted.
+ ExecStopContainer(ctr *Container, sessionID string, timeout uint) error
+ // ExecContainerCleanup cleans up after an exec session exits.
+ // It removes any files left by the exec session that are no longer
+ // needed, including the attach socket.
+ ExecContainerCleanup(ctr *Container, sessionID string) error
+
+ // CheckpointContainer checkpoints the given container.
+ // Some OCI runtimes may not support this - if SupportsCheckpoint()
+ // returns false, this is not implemented, and will always return an
+ // error.
+ CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error
+
+ // SupportsCheckpoint returns whether this OCI runtime
+ // implementation supports the CheckpointContainer() operation.
+ SupportsCheckpoint() bool
+ // SupportsJSONErrors is whether the runtime can return JSON-formatted
+ // error messages.
+ SupportsJSONErrors() bool
+ // SupportsNoCgroups is whether the runtime supports running containers
+ // without cgroups.
+ SupportsNoCgroups() bool
+
+ // AttachSocketPath is the path to the socket to attach to a given
+ // container.
+ // TODO: If we move Attach code in here, this should be made internal.
+ // We don't want to force all runtimes to share the same attach
+ // implementation.
+ AttachSocketPath(ctr *Container) (string, error)
+ // ExecAttachSocketPath is the path to the socket to attach to a given
+ // exec session in the given container.
+ // TODO: Probably should be made internal.
+ ExecAttachSocketPath(ctr *Container, sessionID string) (string, error)
+ // ExitFilePath is the path to a container's exit file.
+ // All runtime implementations must create an exit file when containers
+ // exit, containing the exit code of the container (as a string).
+ // This is the path to that file for a given container.
+ ExitFilePath(ctr *Container) (string, error)
+
+ // RuntimeInfo returns verbose information about the runtime.
+ RuntimeInfo() (map[string]interface{}, error)
}
-func (r *OCIRuntime) featureCheckCheckpointing() bool {
- // Check if the runtime implements checkpointing. Currently only
- // runc's checkpoint/restore implementation is supported.
- cmd := exec.Command(r.path, "checkpoint", "-h")
- if err := cmd.Start(); err != nil {
- return false
- }
- if err := cmd.Wait(); err == nil {
- return true
- }
- return false
+// ExecOptions are options passed into ExecContainer. They control the command
+// that will be executed and how the exec will proceed.
+type ExecOptions struct {
+ // Cmd is the command to execute.
+ Cmd []string
+ // CapAdd is a set of capabilities to add to the executed command.
+ CapAdd []string
+ // Env is a set of environment variables to add to the container.
+ Env map[string]string
+ // Terminal is whether to create a new TTY for the exec session.
+ Terminal bool
+ // Cwd is the working directory for the executed command. If unset, the
+ // working directory of the container will be used.
+ Cwd string
+ // User is the user the command will be executed as. If unset, the user
+ // the container was run as will be used.
+ User string
+ // Streams are the streams that will be attached to the container.
+ Streams *AttachStreams
+ // PreserveFDs is a number of additional file descriptors (in addition
+ // to 0, 1, 2) that will be passed to the executed process. The total FDs
+ // passed will be 3 + PreserveFDs.
+ PreserveFDs uint
+ // Resize is a channel where terminal resize events are sent to be
+ // handled.
+ Resize chan remotecommand.TerminalSize
+ // DetachKeys is a set of keys that, when pressed in sequence, will
+ // detach from the container.
+ DetachKeys string
}
diff --git a/libpod/oci_attach_linux.go b/libpod/oci_attach_linux.go
index 6cada0801..a383f6eab 100644
--- a/libpod/oci_attach_linux.go
+++ b/libpod/oci_attach_linux.go
@@ -47,7 +47,11 @@ func (c *Container) attach(streams *AttachStreams, keys string, resize <-chan re
registerResizeFunc(resize, c.bundlePath())
- socketPath := buildSocketPath(c.AttachSocketPath())
+ attachSock, err := c.AttachSocketPath()
+ if err != nil {
+ return err
+ }
+ socketPath := buildSocketPath(attachSock)
conn, err := net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: socketPath, Net: "unixpacket"})
if err != nil {
@@ -108,7 +112,11 @@ func (c *Container) attachToExec(streams *AttachStreams, keys string, resize <-c
logrus.Debugf("Attaching to container %s exec session %s", c.ID(), sessionID)
// set up the socket path, such that it is the correct length and location for exec
- socketPath := buildSocketPath(c.execAttachSocketPath(sessionID))
+ sockPath, err := c.execAttachSocketPath(sessionID)
+ if err != nil {
+ return err
+ }
+ socketPath := buildSocketPath(sockPath)
// 2: read from attachFd that the parent process has set up the console socket
if _, err := readConmonPipeData(attachFd, ""); err != nil {
diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go
new file mode 100644
index 000000000..f29758a69
--- /dev/null
+++ b/libpod/oci_conmon_linux.go
@@ -0,0 +1,1421 @@
+// +build linux
+
+package libpod
+
+import (
+ "bufio"
+ "bytes"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "runtime"
+ "strconv"
+ "strings"
+ "syscall"
+ "time"
+
+ "github.com/containers/libpod/libpod/define"
+ "github.com/containers/libpod/pkg/cgroups"
+ "github.com/containers/libpod/pkg/errorhandling"
+ "github.com/containers/libpod/pkg/lookup"
+ "github.com/containers/libpod/pkg/rootless"
+ "github.com/containers/libpod/pkg/util"
+ "github.com/containers/libpod/utils"
+ pmount "github.com/containers/storage/pkg/mount"
+ "github.com/coreos/go-systemd/activation"
+ spec "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/opencontainers/selinux/go-selinux"
+ "github.com/opencontainers/selinux/go-selinux/label"
+ "github.com/pkg/errors"
+ "github.com/sirupsen/logrus"
+ "golang.org/x/sys/unix"
+)
+
+// ConmonOCIRuntime is an OCI runtime managed by Conmon.
+// It is constructed by newConmonOCIRuntime, which fills the fields below from
+// its arguments and the supplied RuntimeConfig.
+// TODO: Make all calls to OCI runtime have a timeout.
+type ConmonOCIRuntime struct {
+ // name is the runtime's name, as returned by Name().
+ name string
+ // path is the OCI runtime executable invoked for runtime commands
+ // (state, start, kill, delete, pause, resume).
+ path string
+ // conmonPath is the conmon executable launched for exec sessions.
+ conmonPath string
+ // conmonEnv is copied from RuntimeConfig.ConmonEnvVars and forms the
+ // base of the environment passed to conmon.
+ conmonEnv []string
+ // cgroupManager must be CgroupfsCgroupsManager or SystemdCgroupsManager.
+ cgroupManager string
+ // tmpDir is copied from RuntimeConfig.TmpDir; exitsDir and socketsDir
+ // are created beneath it.
+ tmpDir string
+ // exitsDir is tmpDir/exits.
+ exitsDir string
+ // socketsDir is tmpDir/socket.
+ socketsDir string
+ // logSizeMax is copied from RuntimeConfig.MaxLogSize.
+ logSizeMax int64
+ // noPivot is copied from RuntimeConfig.NoPivotRoot.
+ noPivot bool
+ // reservePorts is copied from RuntimeConfig.EnablePortReservation.
+ reservePorts bool
+ // supportsJSON records whether the runtime was declared to support
+ // JSON-formatted error output.
+ supportsJSON bool
+ // supportsNoCgroups records whether the runtime was declared to
+ // support running containers without cgroups.
+ supportsNoCgroups bool
+ // sdNotify is copied from RuntimeConfig.SDNotify.
+ sdNotify bool
+}
+
+// newConmonOCIRuntime builds a Conmon-based OCI runtime from the given options.
+// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or
+// any runtime with a runc-compatible CLI.
+// The first entry of paths that is an existing regular file is used as the
+// runtime executable; if none qualifies, name is looked up in $PATH.
+// An error is returned when name is empty, when a candidate path cannot be
+// stat'ed for a reason other than not existing, when no executable is found,
+// when the configured cgroup manager is not recognised, or when the exit file
+// or attach socket directories cannot be created.
+// Deliberately private. Someone should not be able to construct this outside of
+// libpod.
+func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeCfg *RuntimeConfig, supportsJSON, supportsNoCgroups bool) (OCIRuntime, error) {
+	if name == "" {
+		return nil, errors.Wrapf(define.ErrInvalidArg, "the OCI runtime must be provided a non-empty name")
+	}
+
+	// TODO: probe OCI runtime for feature and enable automatically if
+	// available.
+	ociRuntime := &ConmonOCIRuntime{
+		name:              name,
+		conmonPath:        conmonPath,
+		conmonEnv:         runtimeCfg.ConmonEnvVars,
+		cgroupManager:     runtimeCfg.CgroupManager,
+		tmpDir:            runtimeCfg.TmpDir,
+		logSizeMax:        runtimeCfg.MaxLogSize,
+		noPivot:           runtimeCfg.NoPivotRoot,
+		reservePorts:      runtimeCfg.EnablePortReservation,
+		sdNotify:          runtimeCfg.SDNotify,
+		supportsJSON:      supportsJSON,
+		supportsNoCgroups: supportsNoCgroups,
+	}
+
+	// Walk the configured candidates in order; missing files and
+	// non-regular files are skipped, any other stat failure is fatal.
+	for _, candidate := range paths {
+		info, err := os.Stat(candidate)
+		switch {
+		case err == nil:
+		case os.IsNotExist(err):
+			continue
+		default:
+			return nil, errors.Wrapf(err, "cannot stat %s", candidate)
+		}
+		if !info.Mode().IsRegular() {
+			continue
+		}
+		ociRuntime.path = candidate
+		logrus.Debugf("using runtime %q", candidate)
+		break
+	}
+
+	// Search the $PATH as last fallback
+	if ociRuntime.path == "" {
+		fromPath, err := exec.LookPath(name)
+		if err != nil {
+			return nil, errors.Wrapf(define.ErrInvalidArg, "no valid executable found for OCI runtime %s", name)
+		}
+		ociRuntime.path = fromPath
+		logrus.Debugf("using runtime %q from $PATH: %q", name, fromPath)
+	}
+
+	ociRuntime.exitsDir = filepath.Join(ociRuntime.tmpDir, "exits")
+	ociRuntime.socketsDir = filepath.Join(ociRuntime.tmpDir, "socket")
+
+	switch ociRuntime.cgroupManager {
+	case CgroupfsCgroupsManager, SystemdCgroupsManager:
+	default:
+		return nil, errors.Wrapf(define.ErrInvalidArg, "invalid cgroup manager specified: %s", ociRuntime.cgroupManager)
+	}
+
+	// Create the exit files and attach sockets directories.
+	// The directories are allowed to exist already.
+	if err := os.MkdirAll(ociRuntime.exitsDir, 0750); err != nil && !os.IsExist(err) {
+		return nil, errors.Wrapf(err, "error creating OCI runtime exit files directory %s",
+			ociRuntime.exitsDir)
+	}
+	if err := os.MkdirAll(ociRuntime.socketsDir, 0750); err != nil && !os.IsExist(err) {
+		return nil, errors.Wrapf(err, "error creating OCI runtime attach sockets directory %s",
+			ociRuntime.socketsDir)
+	}
+
+	return ociRuntime, nil
+}
+
+// Name returns the name of the runtime being wrapped by Conmon, as it was
+// passed to newConmonOCIRuntime.
+func (r *ConmonOCIRuntime) Name() string {
+ return r.name
+}
+
+// Path returns the path of the OCI runtime being wrapped by Conmon.
+// This is the executable selected by newConmonOCIRuntime and invoked for
+// runtime commands.
+func (r *ConmonOCIRuntime) Path() string {
+ return r.path
+}
+
+// CreateContainer creates a container in the OCI runtime.
+// When the container has UID or GID mappings configured, the container's run
+// directory, the runtime's tmp directory, the static directory, the mountpoint
+// and the volume path are first made accessible to the container's root
+// UID/GID. For such containers that are also non-privileged and not rootless,
+// creation happens on a dedicated OS thread in a private mount namespace with
+// everything under /sys/kernel unmounted.
+// In every other case createOCIContainer is called directly.
+func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) {
+ if len(ctr.config.IDMappings.UIDMap) != 0 || len(ctr.config.IDMappings.GIDMap) != 0 {
+ for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.VolumePath} {
+ if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil {
+ return err
+ }
+ }
+
+ // if we are running a non privileged container, be sure to umount some kernel paths so they are not
+ // bind mounted inside the container at all.
+ if !ctr.config.Privileged && !rootless.IsRootless() {
+ // The work runs in its own goroutine, locked to its OS thread, because
+ // the unshare below changes the mount namespace of the calling thread only.
+ // The result is handed back over ch.
+ ch := make(chan error)
+ go func() {
+ runtime.LockOSThread()
+ err := func() error {
+ // Keep a handle on this thread's current mount namespace so it
+ // can be re-entered once the container has been created.
+ fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid()))
+ if err != nil {
+ return err
+ }
+ defer errorhandling.CloseQuiet(fd)
+
+ // create a new mountns on the current thread
+ if err = unix.Unshare(unix.CLONE_NEWNS); err != nil {
+ return err
+ }
+ // NOTE(review): this Setns re-enters the namespace saved in fd; the
+ // logged message talks about cloning a new namespace, which looks
+ // misleading — confirm and reword.
+ defer func() {
+ if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil {
+ logrus.Errorf("unable to clone new namespace: %q", err)
+ }
+ }()
+
+ // don't spread our mounts around. We are setting only /sys to be slave
+ // so that the cleanup process is still able to umount the storage and the
+ // changes are propagated to the host.
+ err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "")
+ if err != nil {
+ return errors.Wrapf(err, "cannot make /sys slave")
+ }
+
+ mounts, err := pmount.GetMounts()
+ if err != nil {
+ return err
+ }
+ // Unmount every mount whose mountpoint starts with /sys/kernel; a
+ // mountpoint that no longer exists is not treated as an error.
+ for _, m := range mounts {
+ if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") {
+ continue
+ }
+ err = unix.Unmount(m.Mountpoint, 0)
+ if err != nil && !os.IsNotExist(err) {
+ return errors.Wrapf(err, "cannot unmount %s", m.Mountpoint)
+ }
+ }
+ return r.createOCIContainer(ctr, restoreOptions)
+ }()
+ ch <- err
+ }()
+ // Block until the goroutine reports the outcome of the creation.
+ err := <-ch
+ return err
+ }
+ }
+ return r.createOCIContainer(ctr, restoreOptions)
+}
+
+// UpdateContainerStatus retrieves the current status of the container from the
+// runtime. It updates the container's state but does not save it.
+// If useRuntime is false, we will not directly hit runc to see the container's
+// status, but will instead only check for the existence of the conmon exit file
+// and update state to stopped if it exists.
+// If useRuntime is true, `<runtime> state <id>` is executed and its JSON output
+// decides the new state; when the container newly transitions to stopped, the
+// exit file is waited for (up to five seconds) to obtain the exit code.
+// Returns an error if the exit file path or runtime directory cannot be
+// determined, if the runtime cannot be queried or its output decoded, or if
+// the runtime reports an unrecognized status.
+func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container, useRuntime bool) error {
+ exitFile, err := ctr.exitFilePath()
+ if err != nil {
+ return err
+ }
+
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return err
+ }
+
+ // If not using the OCI runtime, we don't need to do most of this.
+ if !useRuntime {
+ // If the container's not running, nothing to do.
+ if ctr.state.State != define.ContainerStateRunning && ctr.state.State != define.ContainerStatePaused {
+ return nil
+ }
+
+ // Check for the exit file conmon makes
+ info, err := os.Stat(exitFile)
+ if err != nil {
+ if os.IsNotExist(err) {
+ // Container is still running, no error
+ return nil
+ }
+
+ return errors.Wrapf(err, "error running stat on container %s exit file", ctr.ID())
+ }
+
+ // Alright, it exists. Transition to Stopped state.
+ ctr.state.State = define.ContainerStateStopped
+ ctr.state.PID = 0
+ ctr.state.ConmonPID = 0
+
+ // Read the exit file to get our stopped time and exit code.
+ return ctr.handleExitFile(exitFile, info)
+ }
+
+ // Store old state so we know if we were already stopped
+ oldState := ctr.state.State
+
+ state := new(spec.State)
+
+ // The runtime is run with only XDG_RUNTIME_DIR in its environment.
+ cmd := exec.Command(r.path, "state", ctr.ID())
+ cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir))
+
+ outPipe, err := cmd.StdoutPipe()
+ if err != nil {
+ return errors.Wrapf(err, "getting stdout pipe")
+ }
+ errPipe, err := cmd.StderrPipe()
+ if err != nil {
+ return errors.Wrapf(err, "getting stderr pipe")
+ }
+
+ // NOTE(review): stderr is only inspected when Start itself fails, i.e.
+ // when the process could not be launched — confirm the "does not exist"
+ // branch is reachable in that situation.
+ if err := cmd.Start(); err != nil {
+ out, err2 := ioutil.ReadAll(errPipe)
+ if err2 != nil {
+ return errors.Wrapf(err, "error getting container %s state", ctr.ID())
+ }
+ if strings.Contains(string(out), "does not exist") {
+ if err := ctr.removeConmonFiles(); err != nil {
+ logrus.Debugf("unable to remove conmon files for container %s", ctr.ID())
+ }
+ // The runtime no longer knows the container: mark it exited with an
+ // unknown (-1) exit code and the current time as its finish time.
+ ctr.state.ExitCode = -1
+ ctr.state.FinishedTime = time.Now()
+ ctr.state.State = define.ContainerStateExited
+ return nil
+ }
+ return errors.Wrapf(err, "error getting container %s state. stderr/out: %s", ctr.ID(), out)
+ }
+ // Reap the runtime process on return; its exit status is deliberately ignored.
+ defer func() {
+ _ = cmd.Wait()
+ }()
+
+ // NOTE(review): stderr is closed here without being read, so anything the
+ // runtime writes to it after a successful Start is discarded.
+ if err := errPipe.Close(); err != nil {
+ return err
+ }
+ out, err := ioutil.ReadAll(outPipe)
+ if err != nil {
+ return errors.Wrapf(err, "error reading stdout: %s", ctr.ID())
+ }
+ if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil {
+ return errors.Wrapf(err, "error decoding container status for container %s", ctr.ID())
+ }
+ ctr.state.PID = state.Pid
+
+ // Map the runtime's status string onto libpod's container states.
+ switch state.Status {
+ case "created":
+ ctr.state.State = define.ContainerStateCreated
+ case "paused":
+ ctr.state.State = define.ContainerStatePaused
+ case "running":
+ ctr.state.State = define.ContainerStateRunning
+ case "stopped":
+ ctr.state.State = define.ContainerStateStopped
+ default:
+ return errors.Wrapf(define.ErrInternal, "unrecognized status returned by runtime for container %s: %s",
+ ctr.ID(), state.Status)
+ }
+
+ // Only grab exit status if we were not already stopped
+ // If we were, it should already be in the database
+ if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped {
+ var fi os.FileInfo
+ chWait := make(chan error)
+ defer close(chWait)
+
+ _, err := WaitForFile(exitFile, chWait, time.Second*5)
+ if err == nil {
+ fi, err = os.Stat(exitFile)
+ }
+ // A missing exit file is logged but not returned as an error: the
+ // container is recorded as finished now with exit code -1.
+ if err != nil {
+ ctr.state.ExitCode = -1
+ ctr.state.FinishedTime = time.Now()
+ logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err)
+ return nil
+ }
+
+ return ctr.handleExitFile(exitFile, fi)
+ }
+
+ return nil
+}
+
+// StartContainer asks the OCI runtime to start the given container.
+// On success the container's start time is set in its in-memory state; the
+// state is not saved here.
+func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error {
+	// TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers?
+	xdgRuntimeDir, dirErr := util.GetRuntimeDir()
+	if dirErr != nil {
+		return dirErr
+	}
+
+	runtimeEnv := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", xdgRuntimeDir)}
+	// Forward NOTIFY_SOCKET to the runtime only when it is set for us.
+	if sock, present := os.LookupEnv("NOTIFY_SOCKET"); present {
+		runtimeEnv = append(runtimeEnv, fmt.Sprintf("NOTIFY_SOCKET=%s", sock))
+	}
+
+	startErr := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, runtimeEnv, r.path, "start", ctr.ID())
+	if startErr != nil {
+		return startErr
+	}
+
+	ctr.state.StartedTime = time.Now()
+	return nil
+}
+
+// KillContainer delivers the given signal to the given container through the
+// OCI runtime's `kill` command.
+// If all is set, `--all` is passed so the signal goes to all PIDs in the
+// container. All is only supported if the container created cgroups.
+func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error {
+	logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID())
+
+	xdgRuntimeDir, dirErr := util.GetRuntimeDir()
+	if dirErr != nil {
+		return dirErr
+	}
+
+	// Assemble: kill [--all] <container ID> <signal number>
+	killArgs := make([]string, 0, 4)
+	killArgs = append(killArgs, "kill")
+	if all {
+		killArgs = append(killArgs, "--all")
+	}
+	killArgs = append(killArgs, ctr.ID(), fmt.Sprintf("%d", signal))
+
+	runtimeEnv := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", xdgRuntimeDir)}
+	killErr := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, runtimeEnv, r.path, killArgs...)
+	if killErr != nil {
+		return errors.Wrapf(killErr, "error sending signal to container %s", ctr.ID())
+	}
+
+	return nil
+}
+
+// StopContainer stops a container, first using its given stop signal (or
+// SIGTERM if no signal was specified), then using SIGKILL.
+// Timeout is given in seconds. If timeout is 0, the stop signal is skipped and
+// the container will be immediately killed with SIGKILL.
+// If all is set, signals are sent with the runtime's `kill --all`.
+// A failure to deliver a signal is ignored only when the container's PID no
+// longer exists; if the process is still alive, the delivery error is returned.
+// Does not set finished time for container, assumes you will run updateStatus
+// after to pull the exit code.
+func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error {
+	logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID)
+
+	// Ping the container to see if it's alive
+	// If it's not, it's already stopped, return
+	if err := unix.Kill(ctr.state.PID, 0); err == unix.ESRCH {
+		return nil
+	}
+
+	stopSignal := ctr.config.StopSignal
+	if stopSignal == 0 {
+		stopSignal = uint(syscall.SIGTERM)
+	}
+
+	if timeout > 0 {
+		if err := r.KillContainer(ctr, stopSignal, all); err != nil {
+			// Is the container gone?
+			// If so, it probably died between the first check and
+			// our sending the signal
+			// The container is stopped, so exit cleanly
+			if pingErr := unix.Kill(ctr.state.PID, 0); pingErr == unix.ESRCH {
+				return nil
+			}
+
+			// Still alive: report why the signal could not be sent.
+			// The probe result must not replace this error, as a
+			// successful probe would turn the failure into nil.
+			return err
+		}
+
+		if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err == nil {
+			// No error, the container is dead
+			return nil
+		}
+		logrus.Warnf("Timed out stopping container %s, resorting to SIGKILL", ctr.ID())
+	}
+
+	if err := r.KillContainer(ctr, uint(syscall.SIGKILL), all); err != nil {
+		// Again, check if the container is gone. If it is, exit cleanly.
+		if pingErr := unix.Kill(ctr.state.PID, 0); pingErr == unix.ESRCH {
+			return nil
+		}
+
+		return errors.Wrapf(err, "error sending SIGKILL to container %s", ctr.ID())
+	}
+
+	// Give runtime a few seconds to make it happen
+	return waitContainerStop(ctr, killContainerTimeout)
+}
+
+// DeleteContainer removes a container from the OCI runtime by running the
+// runtime's `delete --force` command on it.
+func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error {
+	xdgRuntimeDir, dirErr := util.GetRuntimeDir()
+	if dirErr != nil {
+		return dirErr
+	}
+
+	runtimeEnv := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", xdgRuntimeDir)}
+	deleteArgs := []string{"delete", "--force", ctr.ID()}
+	return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, runtimeEnv, r.path, deleteArgs...)
+}
+
+// PauseContainer pauses the given container by running the OCI runtime's
+// `pause` command on it.
+func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error {
+	xdgRuntimeDir, dirErr := util.GetRuntimeDir()
+	if dirErr != nil {
+		return dirErr
+	}
+
+	runtimeEnv := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", xdgRuntimeDir)}
+	pauseArgs := []string{"pause", ctr.ID()}
+	return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, runtimeEnv, r.path, pauseArgs...)
+}
+
+// UnpauseContainer unpauses the given container by running the OCI runtime's
+// `resume` command on it.
+func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error {
+	xdgRuntimeDir, dirErr := util.GetRuntimeDir()
+	if dirErr != nil {
+		return dirErr
+	}
+
+	runtimeEnv := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", xdgRuntimeDir)}
+	resumeArgs := []string{"resume", ctr.ID()}
+	return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, runtimeEnv, r.path, resumeArgs...)
+}
+
+// ExecContainer executes a command in a running container by launching conmon
+// for a new exec session.
+// It returns the value read from conmon's sync pipe (the PID, or -1 on the
+// early error paths below), a channel that receives the result of attaching to
+// the session and is then closed, and any error hit while starting the session.
+// options must be non-nil and carry a non-empty Cmd; sessionID must be non-empty.
+// TODO: Split into Create/Start/Attach/Wait
+func (r *ConmonOCIRuntime) ExecContainer(c *Container, sessionID string, options *ExecOptions) (int, chan error, error) {
+ if options == nil {
+ return -1, nil, errors.Wrapf(define.ErrInvalidArg, "must provide an ExecOptions struct to ExecContainer")
+ }
+ if len(options.Cmd) == 0 {
+ return -1, nil, errors.Wrapf(define.ErrInvalidArg, "must provide a command to execute")
+ }
+
+ if sessionID == "" {
+ return -1, nil, errors.Wrapf(define.ErrEmptyID, "must provide a session ID for exec")
+ }
+
+ // create sync pipe to receive the pid
+ parentSyncPipe, childSyncPipe, err := newPipe()
+ if err != nil {
+ return -1, nil, errors.Wrapf(err, "error creating socket pair")
+ }
+
+ defer errorhandling.CloseQuiet(parentSyncPipe)
+
+ // create start pipe to set the cgroup before running
+ // attachToExec is responsible for closing parentStartPipe
+ childStartPipe, parentStartPipe, err := newPipe()
+ if err != nil {
+ return -1, nil, errors.Wrapf(err, "error creating socket pair")
+ }
+
+ // We want to make sure we close the parent{Start,Attach}Pipes if we fail
+ // but also don't want to close them after attach to exec is called
+ attachToExecCalled := false
+
+ defer func() {
+ if !attachToExecCalled {
+ errorhandling.CloseQuiet(parentStartPipe)
+ }
+ }()
+
+ // create the attach pipe to allow attach socket to be created before
+ // $RUNTIME exec starts running. This is to make sure we can capture all output
+ // from the process through that socket, rather than half reading the log, half attaching to the socket
+ // attachToExec is responsible for closing parentAttachPipe
+ parentAttachPipe, childAttachPipe, err := newPipe()
+ if err != nil {
+ return -1, nil, errors.Wrapf(err, "error creating socket pair")
+ }
+
+ defer func() {
+ if !attachToExecCalled {
+ errorhandling.CloseQuiet(parentAttachPipe)
+ }
+ }()
+
+ // The child ends are closed explicitly right after conmon is started;
+ // this deferred cleanup only runs for returns that happen before that.
+ childrenClosed := false
+ defer func() {
+ if !childrenClosed {
+ errorhandling.CloseQuiet(childSyncPipe)
+ errorhandling.CloseQuiet(childAttachPipe)
+ errorhandling.CloseQuiet(childStartPipe)
+ }
+ }()
+
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return -1, nil, err
+ }
+
+ // Flatten the requested environment map into KEY=VALUE strings.
+ finalEnv := make([]string, 0, len(options.Env))
+ for k, v := range options.Env {
+ finalEnv = append(finalEnv, fmt.Sprintf("%s=%s", k, v))
+ }
+
+ processFile, err := prepareProcessExec(c, options.Cmd, finalEnv, options.Terminal, options.Cwd, options.User, sessionID)
+ if err != nil {
+ return -1, nil, err
+ }
+
+ // An OCI log path is only passed along when not logging at debug level
+ // and the runtime was declared to support JSON errors.
+ var ociLog string
+ if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON {
+ ociLog = c.execOCILog(sessionID)
+ }
+ args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog)
+
+ if options.PreserveFDs > 0 {
+ args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", options.PreserveFDs))...)
+ }
+
+ for _, capability := range options.CapAdd {
+ args = append(args, formatRuntimeOpts("--cap", capability)...)
+ }
+
+ if options.Terminal {
+ args = append(args, "-t")
+ }
+
+ // Append container ID and command
+ args = append(args, "-e")
+ // TODO make this optional when we can detach
+ args = append(args, "--exec-attach")
+ args = append(args, "--exec-process-spec", processFile.Name())
+
+ logrus.WithFields(logrus.Fields{
+ "args": args,
+ }).Debugf("running conmon: %s", r.conmonPath)
+ execCmd := exec.Command(r.conmonPath, args...)
+
+ // Wire up only the standard streams the caller asked to attach.
+ if options.Streams != nil {
+ if options.Streams.AttachInput {
+ execCmd.Stdin = options.Streams.InputStream
+ }
+ if options.Streams.AttachOutput {
+ execCmd.Stdout = options.Streams.OutputStream
+ }
+ if options.Streams.AttachError {
+ execCmd.Stderr = options.Streams.ErrorStream
+ }
+ }
+
+ conmonEnv, extraFiles, err := r.configureConmonEnv(runtimeDir)
+ if err != nil {
+ return -1, nil, err
+ }
+
+ // Pass our own descriptors 3..3+PreserveFDs-1 through to conmon first, so
+ // they keep the same numbers in the child.
+ if options.PreserveFDs > 0 {
+ for fd := 3; fd < int(3+options.PreserveFDs); fd++ {
+ execCmd.ExtraFiles = append(execCmd.ExtraFiles, os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)))
+ }
+ }
+
+ // we don't want to step on users fds they asked to preserve
+ // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3
+ // NOTE(review): append onto r.conmonEnv may write into its backing array
+ // when that slice has spare capacity — confirm this cannot leak entries
+ // between calls.
+ execCmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", options.PreserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", options.PreserveFDs+4), fmt.Sprintf("_OCI_ATTACHPIPE=%d", options.PreserveFDs+5))
+ execCmd.Env = append(execCmd.Env, conmonEnv...)
+
+ // The order here must match the fd numbers advertised in the
+ // _OCI_*PIPE variables above: sync, start, attach.
+ execCmd.ExtraFiles = append(execCmd.ExtraFiles, childSyncPipe, childStartPipe, childAttachPipe)
+ execCmd.ExtraFiles = append(execCmd.ExtraFiles, extraFiles...)
+ execCmd.Dir = c.execBundlePath(sessionID)
+ execCmd.SysProcAttr = &syscall.SysProcAttr{
+ Setpgid: true,
+ }
+
+ err = startCommandGivenSelinux(execCmd)
+
+ // We don't need children pipes on the parent side
+ errorhandling.CloseQuiet(childSyncPipe)
+ errorhandling.CloseQuiet(childAttachPipe)
+ errorhandling.CloseQuiet(childStartPipe)
+ childrenClosed = true
+
+ if err != nil {
+ return -1, nil, errors.Wrapf(err, "cannot start container %s", c.ID())
+ }
+ if err := r.moveConmonToCgroupAndSignal(c, execCmd, parentStartPipe, sessionID); err != nil {
+ return -1, nil, err
+ }
+
+ if options.PreserveFDs > 0 {
+ for fd := 3; fd < int(3+options.PreserveFDs); fd++ {
+ // These fds were passed down to the runtime. Close them
+ // and not interfere
+ if err := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)).Close(); err != nil {
+ logrus.Debugf("unable to close file fd-%d", fd)
+ }
+ }
+ }
+
+ // TODO Only create if !detach
+ // Attach to the container before starting it
+ attachChan := make(chan error)
+ go func() {
+ // attachToExec is responsible for closing pipes
+ attachChan <- c.attachToExec(options.Streams, options.DetachKeys, options.Resize, sessionID, parentStartPipe, parentAttachPipe)
+ close(attachChan)
+ }()
+ // From here on the parent start/attach pipes are handed to attachToExec,
+ // so the deferred cleanups above must leave them alone.
+ attachToExecCalled = true
+
+ pid, err := readConmonPipeData(parentSyncPipe, ociLog)
+
+ return pid, attachChan, err
+}
+
+// ExecStopContainer stops a given exec session in a running container.
+// SIGTERM is sent to the session's PID first; if the PID is still alive after
+// timeout seconds, SIGKILL is sent. If timeout is 0, SIGKILL is sent
+// immediately and SIGTERM is omitted.
+// Returns nil as soon as the session's PID is found to no longer exist.
+// Returns an error if no session with the given ID is recorded in the
+// container's state, if a signal cannot be delivered for a reason other than
+// the PID being gone, or if the PID is still alive killContainerTimeout after
+// SIGKILL was sent.
+func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, timeout uint) error {
+	session, ok := ctr.state.ExecSessions[sessionID]
+	if !ok {
+		// TODO This should probably be a separate error
+		return errors.Wrapf(define.ErrInvalidArg, "no exec session with ID %s found in container %s", sessionID, ctr.ID())
+	}
+
+	logrus.Debugf("Going to stop container %s exec session %s", ctr.ID(), sessionID)
+
+	// Is the session dead?
+	// Ping the PID with signal 0 to see if it still exists.
+	if err := unix.Kill(session.PID, 0); err != nil {
+		if err == unix.ESRCH {
+			return nil
+		}
+		return errors.Wrapf(err, "error pinging container %s exec session %s PID %d with signal 0", ctr.ID(), sessionID, session.PID)
+	}
+
+	if timeout > 0 {
+		// Use SIGTERM by default, then SIGKILL after timeout.
+		logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGTERM", sessionID, session.PID, ctr.ID())
+		if err := unix.Kill(session.PID, unix.SIGTERM); err != nil {
+			if err == unix.ESRCH {
+				return nil
+			}
+			return errors.Wrapf(err, "error killing container %s exec session %s PID %d with SIGTERM", ctr.ID(), sessionID, session.PID)
+		}
+
+		// Wait for the PID to stop
+		if err := waitPidStop(session.PID, time.Duration(timeout)*time.Second); err == nil {
+			// No error, container is dead
+			return nil
+		}
+		logrus.Warnf("Timed out waiting for container %s exec session %s to stop, resorting to SIGKILL", ctr.ID(), sessionID)
+	}
+
+	// SIGTERM did not work (or was skipped). On to SIGKILL.
+	logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, session.PID, ctr.ID())
+	if err := unix.Kill(session.PID, unix.SIGKILL); err != nil {
+		if err == unix.ESRCH {
+			return nil
+		}
+		return errors.Wrapf(err, "error killing container %s exec session %s PID %d with SIGKILL", ctr.ID(), sessionID, session.PID)
+	}
+
+	// Wait for the PID to stop.
+	// killContainerTimeout is already a duration (it is passed as-is to
+	// waitContainerStop in StopContainer), so it must not be scaled again.
+	if err := waitPidStop(session.PID, killContainerTimeout); err != nil {
+		return errors.Wrapf(err, "timed out waiting for container %s exec session %s PID %d to stop after SIGKILL", ctr.ID(), sessionID, session.PID)
+	}
+
+	return nil
+}
+
+// ExecContainerCleanup removes the files created when a command is run via
+// ExecContainer: the per-session directory under the runtime's sockets
+// directory, which holds the exec session's attach socket (issue #3962).
+// A directory that is already missing is not treated as an error.
+func (r *ConmonOCIRuntime) ExecContainerCleanup(ctr *Container, sessionID string) error {
+	sessionSocketDir := filepath.Join(r.socketsDir, sessionID)
+	err := os.RemoveAll(sessionSocketDir)
+	if err == nil || os.IsNotExist(err) {
+		return nil
+	}
+	return err
+}
+
+// CheckpointContainer checkpoints the given container by invoking the OCI
+// runtime's "checkpoint" subcommand with the process's standard streams
+// attached. options.KeepRunning adds --leave-running and
+// options.TCPEstablished adds --tcp-established.
+func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error {
+	if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil {
+		return err
+	}
+
+	// CRIU stores the actual checkpoint files under the image path, while
+	// dump.log and stats-dump are written to the work path.
+	checkpointDir := ctr.CheckpointPath()
+	workDir := ctr.bundlePath()
+	logrus.Debugf("Writing checkpoint to %s", checkpointDir)
+	logrus.Debugf("Writing checkpoint logs to %s", workDir)
+
+	args := []string{
+		"checkpoint",
+		"--image-path", checkpointDir,
+		"--work-path", workDir,
+	}
+	if options.KeepRunning {
+		args = append(args, "--leave-running")
+	}
+	if options.TCPEstablished {
+		args = append(args, "--tcp-established")
+	}
+	// The container ID is the final positional argument.
+	args = append(args, ctr.ID())
+
+	return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...)
+}
+
+// SupportsCheckpoint checks if the OCI runtime supports checkpointing
+// containers. It does so by running "<runtime> checkpoint -h" and reporting
+// whether the command could be started and exited successfully. Currently
+// only runc's checkpoint/restore implementation is supported.
+func (r *ConmonOCIRuntime) SupportsCheckpoint() bool {
+	probe := exec.Command(r.path, "checkpoint", "-h")
+	// Run starts the command and waits for it; any failure means "unsupported".
+	return probe.Run() == nil
+}
+
+// SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error
+// messages.
+// It returns the supportsJSON value stored on the runtime; nothing is probed
+// at call time.
+func (r *ConmonOCIRuntime) SupportsJSONErrors() bool {
+ return r.supportsJSON
+}
+
+// SupportsNoCgroups checks if the OCI runtime supports running containers
+// without cgroups (the --cgroup-manager=disabled flag).
+// It returns the supportsNoCgroups value stored on the runtime; nothing is
+// probed at call time.
+func (r *ConmonOCIRuntime) SupportsNoCgroups() bool {
+ return r.supportsNoCgroups
+}
+
+// AttachSocketPath returns the path to a single container's attach socket:
+// <socketsDir>/<container ID>/attach. A nil container is rejected with an
+// error wrapping define.ErrInvalidArg.
+func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) {
+	if ctr != nil {
+		socketPath := filepath.Join(r.socketsDir, ctr.ID(), "attach")
+		return socketPath, nil
+	}
+
+	return "", errors.Wrapf(define.ErrInvalidArg, "must provide a valid container to get attach socket path")
+}
+
+// ExecAttachSocketPath returns the path to a container's exec session attach
+// socket: <socketsDir>/<session ID>/attach. The container argument is not
+// used and therefore not validated; an empty session ID is rejected with an
+// error wrapping define.ErrInvalidArg.
+func (r *ConmonOCIRuntime) ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) {
+	if sessionID != "" {
+		socketPath := filepath.Join(r.socketsDir, sessionID, "attach")
+		return socketPath, nil
+	}
+
+	return "", errors.Wrapf(define.ErrInvalidArg, "must provide a valid session ID to get attach socket path")
+}
+
+// ExitFilePath returns the path to a container's exit file:
+// <exitsDir>/<container ID>. A nil container is rejected with an error
+// wrapping define.ErrInvalidArg.
+func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) {
+	if ctr != nil {
+		exitFile := filepath.Join(r.exitsDir, ctr.ID())
+		return exitFile, nil
+	}
+	return "", errors.Wrapf(define.ErrInvalidArg, "must provide a valid container to get exit file path")
+}
+
+// RuntimeInfo provides information on the runtime. The returned map has two
+// entries, "Conmon" and "OCIRuntime", each a map carrying the binary's
+// "path", "package" and "version". An error is returned if either version
+// cannot be determined.
+func (r *ConmonOCIRuntime) RuntimeInfo() (map[string]interface{}, error) {
+	ociPackage := packageVersion(r.path)
+	conmonPkg := packageVersion(r.conmonPath)
+
+	ociVersion, err := r.getOCIRuntimeVersion()
+	if err != nil {
+		return nil, errors.Wrapf(err, "error getting version of OCI runtime %s", r.name)
+	}
+
+	conmonVer, err := r.getConmonVersion()
+	if err != nil {
+		return nil, errors.Wrapf(err, "error getting conmon version")
+	}
+
+	return map[string]interface{}{
+		"Conmon": map[string]interface{}{
+			"path":    r.conmonPath,
+			"package": conmonPkg,
+			"version": conmonVer,
+		},
+		"OCIRuntime": map[string]interface{}{
+			"path":    r.path,
+			"package": ociPackage,
+			"version": ociVersion,
+		},
+	}, nil
+}
+
+// makeAccessible changes the permission of path and of each of its parent
+// directories so that they have --x--x--x.
+//
+// Starting at path and walking upwards, every entry that is not owned by
+// uid:gid and is missing any of the execute/search bits gets those bits
+// added. Entries owned by uid:gid are left untouched. The walk stops
+// successfully at "/" (which is never modified), at the first entry that
+// does not exist, or once the top of a relative path has been processed.
+// Any other stat or chmod failure is returned.
+func makeAccessible(path string, uid, gid int) error {
+	for path != "/" {
+		st, err := os.Stat(path)
+		if err != nil {
+			if os.IsNotExist(err) {
+				return nil
+			}
+			return err
+		}
+
+		sys, ok := st.Sys().(*syscall.Stat_t)
+		owned := ok && int(sys.Uid) == uid && int(sys.Gid) == gid
+		if !owned && st.Mode()&0111 != 0111 {
+			if err := os.Chmod(path, st.Mode()|0111); err != nil {
+				return err
+			}
+		}
+
+		// filepath.Dir is a fixed point on "." (and on "/"), so a relative
+		// path would never reach "/". Stop once the parent no longer
+		// changes instead of spinning forever.
+		parent := filepath.Dir(path)
+		if parent == path {
+			break
+		}
+		path = parent
+	}
+	return nil
+}
+
+// Wait for a container which has been sent a signal to stop.
+// This waits on the PID recorded in ctr.state.PID and returns whatever
+// waitPidStop returns for that PID and timeout.
+func waitContainerStop(ctr *Container, timeout time.Duration) error {
+ return waitPidStop(ctr.state.PID, timeout)
+}
+
+// Wait for a given PID to stop.
+//
+// A background goroutine pings the PID with signal 0 every 100ms until the
+// kernel reports ESRCH (no such process). Other ping errors are logged and
+// polling continues. Returns nil once the process is gone, or an error if it
+// is still around when timeout elapses, in which case the poller is told to
+// stop.
+func waitPidStop(pid int, timeout time.Duration) error {
+	exited := make(chan struct{})
+	stopPolling := make(chan struct{})
+
+	go func() {
+		for {
+			// Bail out if the waiter has already given up.
+			select {
+			case <-stopPolling:
+				return
+			default:
+			}
+
+			err := unix.Kill(pid, 0)
+			if err == unix.ESRCH {
+				close(exited)
+				return
+			}
+			if err != nil {
+				logrus.Errorf("Error pinging PID %d with signal 0: %v", pid, err)
+			}
+			time.Sleep(100 * time.Millisecond)
+		}
+	}()
+
+	select {
+	case <-time.After(timeout):
+		close(stopPolling)
+		return errors.Errorf("given PIDs did not die within timeout")
+	case <-exited:
+		return nil
+	}
+}
+
+// createOCIContainer generates this container's main conmon instance and prepares it for starting.
+//
+// It builds the conmon command line (shared arguments plus terminal/stdin,
+// conmon pidfile, no-pivot, exit-command and restore options), starts conmon
+// with a sync socket and a start socket passed as extra file descriptors,
+// moves conmon into its cgroup and signals it to proceed, waits for the
+// started conmon command to exit, and finally reads the container PID that
+// conmon reports over the sync socket into ctr.state.PID. If a conmon pid
+// file is configured, the conmon PID is read into ctr.state.ConmonPID.
+//
+// When restoreOptions is non-nil, conmon is started with --restore pointing
+// at the container's checkpoint path.
+//
+// NOTE(review): the early returns between newPipe() and
+// startCommandGivenSelinux() do not close childSyncPipe/childStartPipe -
+// looks like a descriptor leak on those error paths; confirm.
+func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) {
+ // NOTE(review): stderrBuf is only ever written to (as cmd.Stderr for
+ // terminal containers); nothing in this function reads it back.
+ var stderrBuf bytes.Buffer
+
+ runtimeDir, err := util.GetRuntimeDir()
+ if err != nil {
+ return err
+ }
+
+ // Sync socket pair: the parent end is read below to obtain the container PID.
+ parentSyncPipe, childSyncPipe, err := newPipe()
+ if err != nil {
+ return errors.Wrapf(err, "error creating socket pair")
+ }
+ defer errorhandling.CloseQuiet(parentSyncPipe)
+
+ // Start socket pair: note the swapped order of the results compared to the
+ // sync pair above. The parent end is written to once conmon may proceed.
+ childStartPipe, parentStartPipe, err := newPipe()
+ if err != nil {
+ return errors.Wrapf(err, "error creating socket pair for start pipe")
+ }
+
+ defer errorhandling.CloseQuiet(parentStartPipe)
+
+ // The OCI runtime's JSON log is only requested when not running at debug
+ // level and the runtime is flagged as supporting JSON errors.
+ var ociLog string
+ if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON {
+ ociLog = filepath.Join(ctr.state.RunDir, "oci-log")
+ }
+ args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog)
+
+ // A terminal takes precedence over plain stdin forwarding.
+ if ctr.config.Spec.Process.Terminal {
+ args = append(args, "-t")
+ } else if ctr.config.Stdin {
+ args = append(args, "-i")
+ }
+
+ if ctr.config.ConmonPidFile != "" {
+ args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile)
+ }
+
+ if r.noPivot {
+ args = append(args, "--no-pivot")
+ }
+
+ // The first element of ExitCommand is the command itself, every further
+ // element is passed as a separate --exit-command-arg.
+ if len(ctr.config.ExitCommand) > 0 {
+ args = append(args, "--exit-command", ctr.config.ExitCommand[0])
+ for _, arg := range ctr.config.ExitCommand[1:] {
+ args = append(args, []string{"--exit-command-arg", arg}...)
+ }
+ }
+
+ if restoreOptions != nil {
+ args = append(args, "--restore", ctr.CheckpointPath())
+ if restoreOptions.TCPEstablished {
+ args = append(args, "--runtime-opt", "--tcp-established")
+ }
+ }
+
+ logrus.WithFields(logrus.Fields{
+ "args": args,
+ }).Debugf("running conmon: %s", r.conmonPath)
+
+ cmd := exec.Command(r.conmonPath, args...)
+ cmd.Dir = ctr.bundlePath()
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ Setpgid: true,
+ }
+ // TODO this is probably a really bad idea for some uses
+ // Make this configurable
+ cmd.Stdin = os.Stdin
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ if ctr.config.Spec.Process.Terminal {
+ cmd.Stderr = &stderrBuf
+ }
+
+ // 0, 1 and 2 are stdin, stdout and stderr
+ conmonEnv, envFiles, err := r.configureConmonEnv(runtimeDir)
+ if err != nil {
+ return err
+ }
+
+ // The fd numbers advertised here must match the position of the pipes in
+ // cmd.ExtraFiles below: the first extra file becomes fd 3, the second fd 4.
+ // NOTE(review): appending directly to r.conmonEnv may write into its
+ // backing array if it has spare capacity - confirm this cannot be shared
+ // between concurrent callers.
+ cmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", 3), fmt.Sprintf("_OCI_STARTPIPE=%d", 4))
+ cmd.Env = append(cmd.Env, conmonEnv...)
+ cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe)
+ cmd.ExtraFiles = append(cmd.ExtraFiles, envFiles...)
+
+ if r.reservePorts && !ctr.config.NetMode.IsSlirp4netns() {
+ ports, err := bindPorts(ctr.config.PortMappings)
+ if err != nil {
+ return err
+ }
+
+ // Leak the port we bound in the conmon process. These fd's won't be used
+ // by the container and conmon will keep the ports busy so that another
+ // process cannot use them.
+ cmd.ExtraFiles = append(cmd.ExtraFiles, ports...)
+ }
+
+ if ctr.config.NetMode.IsSlirp4netns() {
+ if ctr.config.PostConfigureNetNS {
+ // Create a fresh sync pipe for the rootless network setup.
+ ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe()
+ if err != nil {
+ return errors.Wrapf(err, "failed to create rootless network sync pipe")
+ }
+ } else {
+ // Reuse whatever pipe ends are already stored on the container and
+ // close them when this function returns.
+ if ctr.rootlessSlirpSyncR != nil {
+ defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR)
+ }
+ if ctr.rootlessSlirpSyncW != nil {
+ defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW)
+ }
+ }
+ // Leak one end in conmon, the other one will be leaked into slirp4netns
+ cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW)
+ }
+
+ err = startCommandGivenSelinux(cmd)
+ // regardless of whether we errored or not, we no longer need the children pipes
+ childSyncPipe.Close()
+ childStartPipe.Close()
+ if err != nil {
+ return err
+ }
+ if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe, ctr.ID()); err != nil {
+ return err
+ }
+ /* Wait for initial setup and fork, and reap child */
+ err = cmd.Wait()
+ if err != nil {
+ return err
+ }
+
+ // conmon reports the container PID (or a failure) over the sync socket.
+ pid, err := readConmonPipeData(parentSyncPipe, ociLog)
+ if err != nil {
+ // Best-effort cleanup of the half-created container; the original
+ // error is what gets returned.
+ // NOTE(review): err2 is not included in the log message below.
+ if err2 := r.DeleteContainer(ctr); err2 != nil {
+ logrus.Errorf("Error removing container %s from runtime after creation failed", ctr.ID())
+ }
+ return err
+ }
+ ctr.state.PID = pid
+
+ // Failing to read the conmon pid file only produces a warning.
+ conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile)
+ if err != nil {
+ logrus.Warnf("error reading conmon pid file for container %s: %s", ctr.ID(), err.Error())
+ } else if conmonPID > 0 {
+ // conmon not having a pid file is a valid state, so don't set it if we don't have it
+ logrus.Infof("Got Conmon PID as %d", conmonPID)
+ ctr.state.ConmonPID = conmonPID
+ }
+
+ return nil
+}
+
+// prepareProcessExec writes the process specification used for
+// "runc exec -p" to a temporary "exec-process-*" file inside the exec
+// session's bundle directory and returns that file.
+//
+// The specification starts as a private copy of the container's process spec
+// and is then adjusted: Args is replaced by cmd, Terminal is set to tty
+// (never inherited from the container), env is appended to the environment,
+// cwd (if non-empty) replaces the working directory, and user (if non-empty)
+// is resolved inside the container's mountpoint to a UID/GID/additional GIDs.
+// HOME is appended from the resolved user if the environment does not
+// already set it.
+//
+// The caller is responsible for closing the returned *os.File if needed. On
+// error no file is returned and the temporary file is closed and removed.
+func prepareProcessExec(c *Container, cmd, env []string, tty bool, cwd, user, sessionID string) (*os.File, error) {
+	f, err := ioutil.TempFile(c.execBundlePath(sessionID), "exec-process-")
+	if err != nil {
+		return nil, err
+	}
+	// Do not leave the temporary file open or on disk if anything below fails.
+	succeeded := false
+	defer func() {
+		if succeeded {
+			return
+		}
+		errorhandling.CloseQuiet(f)
+		if rmErr := os.Remove(f.Name()); rmErr != nil && !os.IsNotExist(rmErr) {
+			logrus.Debugf("unable to remove exec process file %s: %v", f.Name(), rmErr)
+		}
+	}()
+
+	// Work on a deep copy: modifying c.config.Spec.Process in place would
+	// change the container's own configuration (args, terminal, env, cwd,
+	// user) for everything that looks at it after this exec session.
+	pspec := new(spec.Process)
+	baseJSON, err := json.Marshal(c.config.Spec.Process)
+	if err != nil {
+		return nil, err
+	}
+	if err := json.Unmarshal(baseJSON, pspec); err != nil {
+		return nil, err
+	}
+
+	pspec.Args = cmd
+	// Terminal must follow the exec request, else it would inherit terminal
+	// as true from the container.
+	pspec.Terminal = tty
+	if len(env) > 0 {
+		pspec.Env = append(pspec.Env, env...)
+	}
+	if cwd != "" {
+		pspec.Cwd = cwd
+	}
+
+	overrides := c.getUserOverrides()
+	execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, user, overrides)
+	if err != nil {
+		return nil, err
+	}
+
+	// If user was set, look it up in the container to get a UID to use on
+	// the host
+	if user != "" {
+		sgids := make([]uint32, 0, len(execUser.Sgids))
+		for _, sgid := range execUser.Sgids {
+			sgids = append(sgids, uint32(sgid))
+		}
+		pspec.User = spec.User{
+			UID:            uint32(execUser.Uid),
+			GID:            uint32(execUser.Gid),
+			AdditionalGids: sgids,
+		}
+	}
+
+	hasHomeSet := false
+	for _, s := range pspec.Env {
+		if strings.HasPrefix(s, "HOME=") {
+			hasHomeSet = true
+			break
+		}
+	}
+	if !hasHomeSet {
+		pspec.Env = append(pspec.Env, fmt.Sprintf("HOME=%s", execUser.Home))
+	}
+
+	processJSON, err := json.Marshal(pspec)
+	if err != nil {
+		return nil, err
+	}
+
+	if err := ioutil.WriteFile(f.Name(), processJSON, 0644); err != nil {
+		return nil, err
+	}
+
+	succeeded = true
+	return f, nil
+}
+
+// configureConmonEnv gets the environment values to add to conmon's exec
+// struct, together with any extra files that must be passed along with them.
+//
+// The environment always carries XDG_RUNTIME_DIR, the two rootless
+// _CONTAINERS_* variables copied from this process, and HOME. NOTIFY_SOCKET
+// is forwarded when set. Unless the runtime's sdNotify flag is set,
+// LISTEN_FDS (with LISTEN_PID=1) and the socket-activation files are
+// forwarded as well when LISTEN_FDS is present.
+// TODO this may want to be less hardcoded/more configurable in the future
+func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) ([]string, []*os.File, error) {
+	home, err := homeDir()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	env := []string{
+		fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir),
+		fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED")),
+		fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID")),
+		fmt.Sprintf("HOME=%s", home),
+	}
+	if notifySocket, found := os.LookupEnv("NOTIFY_SOCKET"); found {
+		env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notifySocket))
+	}
+
+	files := []*os.File{}
+	if r.sdNotify {
+		logrus.Debug("disabling SD notify")
+		return env, files, nil
+	}
+
+	if listenFDs, found := os.LookupEnv("LISTEN_FDS"); found {
+		env = append(env, fmt.Sprintf("LISTEN_FDS=%s", listenFDs), "LISTEN_PID=1")
+		files = append(files, activation.Files(false)...)
+	}
+	return env, files, nil
+}
+
+// sharedConmonArgs takes the arguments common to exec and create/restore and
+// formats them for the conmon CLI.
+//
+// The log driver passed to conmon is journald only when the container asks
+// for it; every other value ends up as "<k8s-file driver>:<logPath>". An
+// empty or k8s-file driver does so silently, anything else (including the
+// JSON driver) after logging an error.
+func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath string) []string {
+	// set the conmon API version to be able to use the correct sync struct keys
+	args := []string{"--api-version", "1"}
+	if r.cgroupManager == SystemdCgroupsManager && !ctr.config.NoCgroups {
+		args = append(args, "-s")
+	}
+	args = append(args,
+		"-c", ctr.ID(),
+		"-u", cuuid,
+		"-r", r.path,
+		"-b", bundlePath,
+		"-p", pidPath,
+	)
+
+	k8sFileDriver := fmt.Sprintf("%s:%s", KubernetesLogging, logPath)
+	var logDriver string
+	switch ctr.LogDriver() {
+	case JournaldLogging:
+		logDriver = JournaldLogging
+	case KubernetesLogging, "":
+		// An empty driver means either the user specified `--log-driver ""`
+		// or the value came from another place in libpod; neither is worth
+		// reporting, so quietly use k8s-file.
+		logDriver = k8sFileDriver
+	default:
+		// Only JSONLogging is expected here, but any future driver lands
+		// here too.
+		logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver())
+		logDriver = k8sFileDriver
+	}
+
+	args = append(args,
+		"-l", logDriver,
+		"--exit-dir", exitDir,
+		"--socket-dir-path", r.socketsDir,
+	)
+	if r.logSizeMax >= 0 {
+		args = append(args, "--log-size-max", fmt.Sprintf("%v", r.logSizeMax))
+	}
+
+	level := logrus.GetLevel()
+	args = append(args, "--log-level", level.String())
+	if level == logrus.DebugLevel {
+		logrus.Debugf("%s messages will be logged to syslog", r.conmonPath)
+		args = append(args, "--syslog")
+	}
+
+	if ociLogPath != "" {
+		args = append(args,
+			"--runtime-arg", "--log-format=json",
+			"--runtime-arg", "--log",
+			fmt.Sprintf("--runtime-arg=%s", ociLogPath),
+		)
+	}
+	if ctr.config.NoCgroups {
+		logrus.Debugf("Running with no CGroups")
+		args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled")
+	}
+	return args
+}
+
+// startCommandGivenSelinux starts a container ensuring to set the labels of
+// the process to make sure SELinux doesn't block conmon communication, if SELinux is enabled
+func startCommandGivenSelinux(cmd *exec.Cmd) error {
+ if !selinux.GetEnabled() {
+ return cmd.Start()
+ }
+ // Set the label of the conmon process to be level :s0
+ // This will allow the container processes to talk to fifo-files
+ // passed into the container by conmon
+ var (
+ plabel string
+ con selinux.Context
+ err error
+ )
+ plabel, err = selinux.CurrentLabel()
+ if err != nil {
+ return errors.Wrapf(err, "Failed to get current SELinux label")
+ }
+
+ con, err = selinux.NewContext(plabel)
+ if err != nil {
+ return errors.Wrapf(err, "Failed to get new context from SELinux label")
+ }
+
+ runtime.LockOSThread()
+ if con["level"] != "s0" && con["level"] != "" {
+ con["level"] = "s0"
+ if err = label.SetProcessLabel(con.Get()); err != nil {
+ runtime.UnlockOSThread()
+ return err
+ }
+ }
+ err = cmd.Start()
+ // Ignore error returned from SetProcessLabel("") call,
+ // can't recover.
+ if labelErr := label.SetProcessLabel(""); labelErr != nil {
+ logrus.Errorf("unable to set process label: %q", err)
+ }
+ runtime.UnlockOSThread()
+ return err
+}
+
+// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the
+// conmon process to that cgroup. It then signals conmon to start by sending
+// nonsense data down the start fd.
+//
+// The move is skipped when the container is configured with NoCgroups; for
+// rootless callers the decision is instead based on whether the user owns
+// the current systemd cgroup. Failures to move conmon are only logged as
+// warnings; the start signal is sent regardless.
+func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File, uuid string) error {
+	// If cgroup creation is disabled - just signal.
+	needsCgroup := !ctr.config.NoCgroups
+
+	if rootless.IsRootless() {
+		userOwnsCgroup, err := cgroups.UserOwnsCurrentSystemdCgroup()
+		if err != nil {
+			return err
+		}
+		needsCgroup = !userOwnsCgroup
+	}
+
+	if needsCgroup {
+		parent := ctr.CgroupParent()
+		if r.cgroupManager != SystemdCgroupsManager {
+			conmonCgroup := filepath.Join(ctr.config.CgroupParent, "conmon")
+			control, err := cgroups.New(conmonCgroup, &spec.LinuxResources{})
+			if err != nil {
+				logrus.Warnf("Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
+			} else if err := control.AddPid(cmd.Process.Pid); err != nil {
+				// TODO: the cgroup should be deleted once conmon exits;
+				// maybe need a conmon monitor?
+				logrus.Warnf("Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
+			}
+		} else {
+			unit := createUnitName("libpod-conmon", ctr.ID())
+
+			// For a nested "a/b.slice" parent only the last path element
+			// is used as the slice name.
+			slice := parent
+			if sep := strings.LastIndex(parent, "/"); sep >= 0 && strings.HasSuffix(parent, ".slice") {
+				slice = parent[sep+1:]
+			}
+
+			logrus.Infof("Running conmon under slice %s and unitName %s", slice, unit)
+			if err := utils.RunUnderSystemdScope(cmd.Process.Pid, slice, unit); err != nil {
+				logrus.Warnf("Failed to add conmon to systemd sandbox cgroup: %v", err)
+			}
+		}
+	}
+
+	/* We set the cgroup, now the child can start creating children */
+	return writeConmonPipeData(startFd)
+}
+
+// newPipe creates a unix socket pair for communication. Both ends are
+// close-on-exec SOCK_SEQPACKET sockets; the second descriptor of the pair is
+// returned as the parent end and the first as the child end.
+func newPipe() (parent *os.File, child *os.File, err error) {
+	pair, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
+	if err != nil {
+		return nil, nil, err
+	}
+	parent = os.NewFile(uintptr(pair[1]), "parent")
+	child = os.NewFile(uintptr(pair[0]), "child")
+	return parent, child, nil
+}
+
+// readConmonPidFile attempts to read conmon's pid from its pid file.
+//
+// An empty pidFile means no pid file was configured and yields (0, nil).
+// Otherwise the file's contents are parsed as a decimal integer, ignoring
+// surrounding whitespace such as a trailing newline. If the file cannot be
+// read or does not contain a number, -1 and the error are returned.
+func readConmonPidFile(pidFile string) (int, error) {
+	if pidFile == "" {
+		return 0, nil
+	}
+
+	contents, err := ioutil.ReadFile(pidFile)
+	if err != nil {
+		return -1, err
+	}
+
+	// Convert it to an int. Pid files commonly end in a newline, which
+	// strconv.Atoi rejects, so trim whitespace first.
+	conmonPID, err := strconv.Atoi(strings.TrimSpace(string(contents)))
+	if err != nil {
+		return -1, err
+	}
+	return conmonPID, nil
+}
+
+// readConmonPipeData attempts to read a syncInfo struct from the pipe.
+//
+// One newline-terminated JSON message is read from pipe in a background
+// goroutine while this function waits for at most ContainerCreateTimeout.
+// On success the message's non-negative "data" value is returned.
+//
+// Failures are reported as follows:
+//   - read or JSON decode error: -1 and, if ociLog is set and holds a
+//     parseable OCI runtime error, that runtime error; otherwise the wrapped
+//     read/decode error.
+//   - negative "data": that value and, in order of preference, the runtime
+//     error from ociLog, the runtime error built from the message's
+//     "message" field, or a generic define.ErrInternal.
+//   - timeout: -1 and a define.ErrInternal.
+func readConmonPipeData(pipe *os.File, ociLog string) (int, error) {
+	// syncInfo is used to return data from monitor process to daemon
+	type syncInfo struct {
+		Data    int    `json:"data"`
+		Message string `json:"message,omitempty"`
+	}
+
+	// Wait to get container pid from conmon
+	type syncStruct struct {
+		si  *syncInfo
+		err error
+	}
+
+	// Buffered so the reader can always deliver its single result and exit,
+	// even when nobody is receiving any more because of the timeout below.
+	ch := make(chan syncStruct, 1)
+	go func() {
+		var si *syncInfo
+		rdr := bufio.NewReader(pipe)
+		b, err := rdr.ReadBytes('\n')
+		if err != nil {
+			ch <- syncStruct{err: err}
+			// Exactly one result is ever sent; do not go on to decode
+			// the partial data.
+			return
+		}
+		if err := json.Unmarshal(b, &si); err != nil {
+			ch <- syncStruct{err: err}
+			return
+		}
+		ch <- syncStruct{si: si}
+	}()
+
+	// ociLogError tries to turn the contents of the OCI runtime's JSON log
+	// into an error. found is false when there is no log or it cannot be
+	// read or parsed.
+	ociLogError := func() (found bool, err error) {
+		if ociLog == "" {
+			return false, nil
+		}
+		ociLogData, readErr := ioutil.ReadFile(ociLog)
+		if readErr != nil {
+			return false, nil
+		}
+		var ociErr ociError
+		if jsonErr := json.Unmarshal(ociLogData, &ociErr); jsonErr != nil {
+			return false, nil
+		}
+		return true, getOCIRuntimeError(ociErr.Msg)
+	}
+
+	select {
+	case ss := <-ch:
+		if ss.err != nil {
+			if found, runtimeErr := ociLogError(); found {
+				return -1, runtimeErr
+			}
+			return -1, errors.Wrapf(ss.err, "error reading container (probably exited) json message")
+		}
+		logrus.Debugf("Received: %d", ss.si.Data)
+		if ss.si.Data < 0 {
+			if found, runtimeErr := ociLogError(); found {
+				return ss.si.Data, runtimeErr
+			}
+			// If we failed to parse the JSON errors, then print the output as it is
+			if ss.si.Message != "" {
+				return ss.si.Data, getOCIRuntimeError(ss.si.Message)
+			}
+			return ss.si.Data, errors.Wrapf(define.ErrInternal, "container create failed")
+		}
+		return ss.si.Data, nil
+	case <-time.After(ContainerCreateTimeout):
+		return -1, errors.Wrapf(define.ErrInternal, "container creation timeout")
+	}
+}
+
+// writeConmonPipeData writes nonsense data (a single zero byte) to a pipe
+// and returns the error from that write, if any.
+func writeConmonPipeData(pipe *os.File) error {
+	_, err := pipe.Write([]byte{0})
+	return err
+}
+
+// formatRuntimeOpts prepends each of the opts passed to it with
+// --runtime-opt for passing to conmon. The result holds two entries per
+// option, in the order the options were given.
+func formatRuntimeOpts(opts ...string) []string {
+	formatted := make([]string, len(opts)*2)
+	for i, opt := range opts {
+		formatted[2*i] = "--runtime-opt"
+		formatted[2*i+1] = opt
+	}
+	return formatted
+}
+
+// getConmonVersion returns a string representation of the conmon version:
+// the output of "<conmon> --version" with its first newline replaced by ", "
+// and one trailing newline removed.
+func (r *ConmonOCIRuntime) getConmonVersion() (string, error) {
+	out, err := utils.ExecCmd(r.conmonPath, "--version")
+	if err != nil {
+		return "", err
+	}
+	joined := strings.Replace(out, "\n", ", ", 1)
+	return strings.TrimSuffix(joined, "\n"), nil
+}
+
+// getOCIRuntimeVersion returns a string representation of the OCI runtime's
+// version: the output of "<runtime> --version" with one trailing newline
+// removed.
+func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) {
+	out, err := utils.ExecCmd(r.path, "--version")
+	if err != nil {
+		return "", err
+	}
+	version := strings.TrimSuffix(out, "\n")
+	return version, nil
+}
diff --git a/libpod/oci_conmon_unsupported.go b/libpod/oci_conmon_unsupported.go
new file mode 100644
index 000000000..77b06eed3
--- /dev/null
+++ b/libpod/oci_conmon_unsupported.go
@@ -0,0 +1,130 @@
+// +build !linux
+
+package libpod
+
+import (
+ "github.com/containers/libpod/libpod/define"
+)
+
+const (
+ // osNotSupported is the placeholder string returned by the string-valued
+ // stub methods in this file.
+ osNotSupported = "Not supported on this OS"
+)
+
+// ConmonOCIRuntime is not supported on this OS.
+// It is an empty placeholder type; every method below is a stub that
+// returns define.ErrNotImplemented, false, or the osNotSupported string.
+type ConmonOCIRuntime struct {
+}
+
+// newConmonOCIRuntime is not supported on this OS.
+// It ignores all of its arguments and always returns a nil runtime together
+// with define.ErrNotImplemented.
+func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeCfg *RuntimeConfig, supportsJSON, supportsNoCgroups bool) (OCIRuntime, error) {
+ return nil, define.ErrNotImplemented
+}
+
+// Name is not supported on this OS.
+// It returns the osNotSupported placeholder string.
+func (r *ConmonOCIRuntime) Name() string {
+ return osNotSupported
+}
+
+// Path is not supported on this OS.
+// It returns the osNotSupported placeholder string.
+func (r *ConmonOCIRuntime) Path() string {
+ return osNotSupported
+}
+
+// CreateContainer is not supported on this OS.
+// It always returns define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) error {
+ return define.ErrNotImplemented
+}
+
+// UpdateContainerStatus is not supported on this OS.
+// It always returns define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container, useRuntime bool) error {
+ return define.ErrNotImplemented
+}
+
+// StartContainer is not supported on this OS.
+// It always returns define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error {
+ return define.ErrNotImplemented
+}
+
+// KillContainer is not supported on this OS.
+// It always returns define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error {
+ return define.ErrNotImplemented
+}
+
+// StopContainer is not supported on this OS.
+// It always returns define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error {
+ return define.ErrNotImplemented
+}
+
+// DeleteContainer is not supported on this OS.
+// It always returns define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error {
+ return define.ErrNotImplemented
+}
+
+// PauseContainer is not supported on this OS.
+// It always returns define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error {
+ return define.ErrNotImplemented
+}
+
+// UnpauseContainer is not supported on this OS.
+// It always returns define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error {
+ return define.ErrNotImplemented
+}
+
+// ExecContainer is not supported on this OS.
+// It always returns -1, a nil channel and define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) ExecContainer(ctr *Container, sessionID string, options *ExecOptions) (int, chan error, error) {
+ return -1, nil, define.ErrNotImplemented
+}
+
+// ExecStopContainer is not supported on this OS.
+// It always returns define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, timeout uint) error {
+ return define.ErrNotImplemented
+}
+
+// CheckpointContainer is not supported on this OS.
+// It always returns define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error {
+ return define.ErrNotImplemented
+}
+
+// SupportsCheckpoint is not supported on this OS.
+// It always reports false.
+func (r *ConmonOCIRuntime) SupportsCheckpoint() bool {
+ return false
+}
+
+// SupportsJSONErrors is not supported on this OS.
+// It always reports false.
+func (r *ConmonOCIRuntime) SupportsJSONErrors() bool {
+ return false
+}
+
+// SupportsNoCgroups is not supported on this OS.
+// It always reports false.
+func (r *ConmonOCIRuntime) SupportsNoCgroups() bool {
+ return false
+}
+
+// AttachSocketPath is not supported on this OS.
+// It always returns an empty path and define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) {
+ return "", define.ErrNotImplemented
+}
+
+// ExecAttachSocketPath is not supported on this OS.
+// It always returns an empty path and define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) {
+ return "", define.ErrNotImplemented
+}
+
+// ExitFilePath is not supported on this OS.
+// It always returns an empty path and define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) {
+ return "", define.ErrNotImplemented
+}
+
+// RuntimeInfo is not supported on this OS.
+// It always returns a nil map and define.ErrNotImplemented.
+func (r *ConmonOCIRuntime) RuntimeInfo() (map[string]interface{}, error) {
+ return nil, define.ErrNotImplemented
+}
+
+// Package is not supported on this OS.
+// It returns the osNotSupported placeholder string.
+func (r *ConmonOCIRuntime) Package() string {
+ return osNotSupported
+}
+
+// ConmonPackage is not supported on this OS.
+// It returns the osNotSupported placeholder string.
+func (r *ConmonOCIRuntime) ConmonPackage() string {
+ return osNotSupported
+}
diff --git a/libpod/oci_internal_linux.go b/libpod/oci_internal_linux.go
deleted file mode 100644
index 437b7cf4d..000000000
--- a/libpod/oci_internal_linux.go
+++ /dev/null
@@ -1,556 +0,0 @@
-// +build linux
-
-package libpod
-
-import (
- "bufio"
- "bytes"
- "fmt"
- "io/ioutil"
- "os"
- "os/exec"
- "path/filepath"
- "regexp"
- "runtime"
- "strconv"
- "strings"
- "syscall"
- "time"
-
- "github.com/containers/libpod/libpod/define"
- "github.com/containers/libpod/pkg/cgroups"
- "github.com/containers/libpod/pkg/errorhandling"
- "github.com/containers/libpod/pkg/lookup"
- "github.com/containers/libpod/pkg/rootless"
- "github.com/containers/libpod/pkg/util"
- "github.com/containers/libpod/utils"
- "github.com/coreos/go-systemd/activation"
- spec "github.com/opencontainers/runtime-spec/specs-go"
- "github.com/opencontainers/selinux/go-selinux"
- "github.com/opencontainers/selinux/go-selinux/label"
- "github.com/pkg/errors"
- "github.com/sirupsen/logrus"
- "golang.org/x/sys/unix"
-)
-
-// createOCIContainer generates this container's main conmon instance and prepares it for starting
-func (r *OCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) {
- var stderrBuf bytes.Buffer
-
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
-
- parentSyncPipe, childSyncPipe, err := newPipe()
- if err != nil {
- return errors.Wrapf(err, "error creating socket pair")
- }
- defer errorhandling.CloseQuiet(parentSyncPipe)
-
- childStartPipe, parentStartPipe, err := newPipe()
- if err != nil {
- return errors.Wrapf(err, "error creating socket pair for start pipe")
- }
-
- defer errorhandling.CloseQuiet(parentStartPipe)
-
- var ociLog string
- if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON {
- ociLog = filepath.Join(ctr.state.RunDir, "oci-log")
- }
- args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog)
-
- if ctr.config.Spec.Process.Terminal {
- args = append(args, "-t")
- } else if ctr.config.Stdin {
- args = append(args, "-i")
- }
-
- if ctr.config.ConmonPidFile != "" {
- args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile)
- }
-
- if r.noPivot {
- args = append(args, "--no-pivot")
- }
-
- if len(ctr.config.ExitCommand) > 0 {
- args = append(args, "--exit-command", ctr.config.ExitCommand[0])
- for _, arg := range ctr.config.ExitCommand[1:] {
- args = append(args, []string{"--exit-command-arg", arg}...)
- }
- }
-
- if restoreOptions != nil {
- args = append(args, "--restore", ctr.CheckpointPath())
- if restoreOptions.TCPEstablished {
- args = append(args, "--runtime-opt", "--tcp-established")
- }
- }
-
- logrus.WithFields(logrus.Fields{
- "args": args,
- }).Debugf("running conmon: %s", r.conmonPath)
-
- cmd := exec.Command(r.conmonPath, args...)
- cmd.Dir = ctr.bundlePath()
- cmd.SysProcAttr = &syscall.SysProcAttr{
- Setpgid: true,
- }
- // TODO this is probably a really bad idea for some uses
- // Make this configurable
- cmd.Stdin = os.Stdin
- cmd.Stdout = os.Stdout
- cmd.Stderr = os.Stderr
- if ctr.config.Spec.Process.Terminal {
- cmd.Stderr = &stderrBuf
- }
-
- // 0, 1 and 2 are stdin, stdout and stderr
- conmonEnv, envFiles, err := r.configureConmonEnv(runtimeDir)
- if err != nil {
- return err
- }
-
- cmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", 3), fmt.Sprintf("_OCI_STARTPIPE=%d", 4))
- cmd.Env = append(cmd.Env, conmonEnv...)
- cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe)
- cmd.ExtraFiles = append(cmd.ExtraFiles, envFiles...)
-
- if r.reservePorts && !ctr.config.NetMode.IsSlirp4netns() {
- ports, err := bindPorts(ctr.config.PortMappings)
- if err != nil {
- return err
- }
-
- // Leak the port we bound in the conmon process. These fd's won't be used
- // by the container and conmon will keep the ports busy so that another
- // process cannot use them.
- cmd.ExtraFiles = append(cmd.ExtraFiles, ports...)
- }
-
- if ctr.config.NetMode.IsSlirp4netns() {
- if ctr.config.PostConfigureNetNS {
- ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe()
- if err != nil {
- return errors.Wrapf(err, "failed to create rootless network sync pipe")
- }
- } else {
- if ctr.rootlessSlirpSyncR != nil {
- defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR)
- }
- if ctr.rootlessSlirpSyncW != nil {
- defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW)
- }
- }
- // Leak one end in conmon, the other one will be leaked into slirp4netns
- cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW)
- }
-
- err = startCommandGivenSelinux(cmd)
- // regardless of whether we errored or not, we no longer need the children pipes
- childSyncPipe.Close()
- childStartPipe.Close()
- if err != nil {
- return err
- }
- if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe, ctr.ID()); err != nil {
- return err
- }
- /* Wait for initial setup and fork, and reap child */
- err = cmd.Wait()
- if err != nil {
- return err
- }
-
- pid, err := readConmonPipeData(parentSyncPipe, ociLog)
- if err != nil {
- if err2 := r.deleteContainer(ctr); err2 != nil {
- logrus.Errorf("Error removing container %s from runtime after creation failed", ctr.ID())
- }
- return err
- }
- ctr.state.PID = pid
-
- conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile)
- if err != nil {
- logrus.Warnf("error reading conmon pid file for container %s: %s", ctr.ID(), err.Error())
- } else if conmonPID > 0 {
- // conmon not having a pid file is a valid state, so don't set it if we don't have it
- logrus.Infof("Got Conmon PID as %d", conmonPID)
- ctr.state.ConmonPID = conmonPID
- }
-
- return nil
-}
-
-// prepareProcessExec returns the path of the process.json used in runc exec -p
-// caller is responsible to close the returned *os.File if needed.
-func prepareProcessExec(c *Container, cmd, env []string, tty bool, cwd, user, sessionID string) (*os.File, error) {
- f, err := ioutil.TempFile(c.execBundlePath(sessionID), "exec-process-")
- if err != nil {
- return nil, err
- }
-
- pspec := c.config.Spec.Process
- pspec.Args = cmd
- // We need to default this to false else it will inherit terminal as true
- // from the container.
- pspec.Terminal = false
- if tty {
- pspec.Terminal = true
- }
- if len(env) > 0 {
- pspec.Env = append(pspec.Env, env...)
- }
-
- if cwd != "" {
- pspec.Cwd = cwd
-
- }
-
- overrides := c.getUserOverrides()
- execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, user, overrides)
- if err != nil {
- return nil, err
- }
-
- // If user was set, look it up in the container to get a UID to use on
- // the host
- if user != "" {
- sgids := make([]uint32, 0, len(execUser.Sgids))
- for _, sgid := range execUser.Sgids {
- sgids = append(sgids, uint32(sgid))
- }
- processUser := spec.User{
- UID: uint32(execUser.Uid),
- GID: uint32(execUser.Gid),
- AdditionalGids: sgids,
- }
-
- pspec.User = processUser
- }
-
- hasHomeSet := false
- for _, s := range pspec.Env {
- if strings.HasPrefix(s, "HOME=") {
- hasHomeSet = true
- break
- }
- }
- if !hasHomeSet {
- pspec.Env = append(pspec.Env, fmt.Sprintf("HOME=%s", execUser.Home))
- }
-
- processJSON, err := json.Marshal(pspec)
- if err != nil {
- return nil, err
- }
-
- if err := ioutil.WriteFile(f.Name(), processJSON, 0644); err != nil {
- return nil, err
- }
- return f, nil
-}
-
-// configureConmonEnv gets the environment values to add to conmon's exec struct
-// TODO this may want to be less hardcoded/more configurable in the future
-func (r *OCIRuntime) configureConmonEnv(runtimeDir string) ([]string, []*os.File, error) {
- env := make([]string, 0, 6)
- env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir))
- env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED")))
- env = append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID")))
- home, err := homeDir()
- if err != nil {
- return nil, nil, err
- }
- env = append(env, fmt.Sprintf("HOME=%s", home))
-
- extraFiles := make([]*os.File, 0)
- if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok {
- env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify))
- }
- if !r.sdNotify {
- if listenfds, ok := os.LookupEnv("LISTEN_FDS"); ok {
- env = append(env, fmt.Sprintf("LISTEN_FDS=%s", listenfds), "LISTEN_PID=1")
- fds := activation.Files(false)
- extraFiles = append(extraFiles, fds...)
- }
- } else {
- logrus.Debug("disabling SD notify")
- }
- return env, extraFiles, nil
-}
-
-// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI
-func (r *OCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath string) []string {
- // set the conmon API version to be able to use the correct sync struct keys
- args := []string{"--api-version", "1"}
- if r.cgroupManager == SystemdCgroupsManager && !ctr.config.NoCgroups {
- args = append(args, "-s")
- }
- args = append(args, "-c", ctr.ID())
- args = append(args, "-u", cuuid)
- args = append(args, "-r", r.path)
- args = append(args, "-b", bundlePath)
- args = append(args, "-p", pidPath)
-
- var logDriver string
- switch ctr.LogDriver() {
- case JournaldLogging:
- logDriver = JournaldLogging
- case JSONLogging:
- fallthrough
- default: //nolint-stylecheck
- // No case here should happen except JSONLogging, but keep this here in case the options are extended
- logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver())
- fallthrough
- case "":
- // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod
- // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough
- fallthrough
- case KubernetesLogging:
- logDriver = fmt.Sprintf("%s:%s", KubernetesLogging, logPath)
- }
-
- args = append(args, "-l", logDriver)
- args = append(args, "--exit-dir", exitDir)
- args = append(args, "--socket-dir-path", r.socketsDir)
- if r.logSizeMax >= 0 {
- args = append(args, "--log-size-max", fmt.Sprintf("%v", r.logSizeMax))
- }
-
- logLevel := logrus.GetLevel()
- args = append(args, "--log-level", logLevel.String())
-
- if logLevel == logrus.DebugLevel {
- logrus.Debugf("%s messages will be logged to syslog", r.conmonPath)
- args = append(args, "--syslog")
- }
- if ociLogPath != "" {
- args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath))
- }
- if ctr.config.NoCgroups {
- logrus.Debugf("Running with no CGroups")
- args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled")
- }
- return args
-}
-
-// startCommandGivenSelinux starts a container ensuring to set the labels of
-// the process to make sure SELinux doesn't block conmon communication, if SELinux is enabled
-func startCommandGivenSelinux(cmd *exec.Cmd) error {
- if !selinux.GetEnabled() {
- return cmd.Start()
- }
- // Set the label of the conmon process to be level :s0
- // This will allow the container processes to talk to fifo-files
- // passed into the container by conmon
- var (
- plabel string
- con selinux.Context
- err error
- )
- plabel, err = selinux.CurrentLabel()
- if err != nil {
- return errors.Wrapf(err, "Failed to get current SELinux label")
- }
-
- con, err = selinux.NewContext(plabel)
- if err != nil {
- return errors.Wrapf(err, "Failed to get new context from SELinux label")
- }
-
- runtime.LockOSThread()
- if con["level"] != "s0" && con["level"] != "" {
- con["level"] = "s0"
- if err = label.SetProcessLabel(con.Get()); err != nil {
- runtime.UnlockOSThread()
- return err
- }
- }
- err = cmd.Start()
- // Ignore error returned from SetProcessLabel("") call,
- // can't recover.
- if labelErr := label.SetProcessLabel(""); labelErr != nil {
- logrus.Errorf("unable to set process label: %q", err)
- }
- runtime.UnlockOSThread()
- return err
-}
-
-// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup
-// it then signals for conmon to start by sending nonse data down the start fd
-func (r *OCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File, uuid string) error {
- mustCreateCgroup := true
- // If cgroup creation is disabled - just signal.
- if ctr.config.NoCgroups {
- mustCreateCgroup = false
- }
-
- if rootless.IsRootless() {
- ownsCgroup, err := cgroups.UserOwnsCurrentSystemdCgroup()
- if err != nil {
- return err
- }
- mustCreateCgroup = !ownsCgroup
- }
-
- if mustCreateCgroup {
- cgroupParent := ctr.CgroupParent()
- if r.cgroupManager == SystemdCgroupsManager {
- unitName := createUnitName("libpod-conmon", ctr.ID())
-
- realCgroupParent := cgroupParent
- splitParent := strings.Split(cgroupParent, "/")
- if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 {
- realCgroupParent = splitParent[len(splitParent)-1]
- }
-
- logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName)
- if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil {
- logrus.Warnf("Failed to add conmon to systemd sandbox cgroup: %v", err)
- }
- } else {
- cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon")
- control, err := cgroups.New(cgroupPath, &spec.LinuxResources{})
- if err != nil {
- logrus.Warnf("Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
- } else {
- // we need to remove this defer and delete the cgroup once conmon exits
- // maybe need a conmon monitor?
- if err := control.AddPid(cmd.Process.Pid); err != nil {
- logrus.Warnf("Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
- }
- }
- }
- }
-
- /* We set the cgroup, now the child can start creating children */
- if err := writeConmonPipeData(startFd); err != nil {
- return err
- }
- return nil
-}
-
-// newPipe creates a unix socket pair for communication
-func newPipe() (parent *os.File, child *os.File, err error) {
- fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
- if err != nil {
- return nil, nil, err
- }
- return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
-}
-
-// readConmonPidFile attempts to read conmon's pid from its pid file
-func readConmonPidFile(pidFile string) (int, error) {
- // Let's try reading the Conmon pid at the same time.
- if pidFile != "" {
- contents, err := ioutil.ReadFile(pidFile)
- if err != nil {
- return -1, err
- }
- // Convert it to an int
- conmonPID, err := strconv.Atoi(string(contents))
- if err != nil {
- return -1, err
- }
- return conmonPID, nil
- }
- return 0, nil
-}
-
-// readConmonPipeData attempts to read a syncInfo struct from the pipe
-func readConmonPipeData(pipe *os.File, ociLog string) (int, error) {
- // syncInfo is used to return data from monitor process to daemon
- type syncInfo struct {
- Data int `json:"data"`
- Message string `json:"message,omitempty"`
- }
-
- // Wait to get container pid from conmon
- type syncStruct struct {
- si *syncInfo
- err error
- }
- ch := make(chan syncStruct)
- go func() {
- var si *syncInfo
- rdr := bufio.NewReader(pipe)
- b, err := rdr.ReadBytes('\n')
- if err != nil {
- ch <- syncStruct{err: err}
- }
- if err := json.Unmarshal(b, &si); err != nil {
- ch <- syncStruct{err: err}
- return
- }
- ch <- syncStruct{si: si}
- }()
-
- data := -1
- select {
- case ss := <-ch:
- if ss.err != nil {
- if ociLog != "" {
- ociLogData, err := ioutil.ReadFile(ociLog)
- if err == nil {
- var ociErr ociError
- if err := json.Unmarshal(ociLogData, &ociErr); err == nil {
- return -1, getOCIRuntimeError(ociErr.Msg)
- }
- }
- }
- return -1, errors.Wrapf(ss.err, "error reading container (probably exited) json message")
- }
- logrus.Debugf("Received: %d", ss.si.Data)
- if ss.si.Data < 0 {
- if ociLog != "" {
- ociLogData, err := ioutil.ReadFile(ociLog)
- if err == nil {
- var ociErr ociError
- if err := json.Unmarshal(ociLogData, &ociErr); err == nil {
- return ss.si.Data, getOCIRuntimeError(ociErr.Msg)
- }
- }
- }
- // If we failed to parse the JSON errors, then print the output as it is
- if ss.si.Message != "" {
- return ss.si.Data, getOCIRuntimeError(ss.si.Message)
- }
- return ss.si.Data, errors.Wrapf(define.ErrInternal, "container create failed")
- }
- data = ss.si.Data
- case <-time.After(ContainerCreateTimeout):
- return -1, errors.Wrapf(define.ErrInternal, "container creation timeout")
- }
- return data, nil
-}
-
-func getOCIRuntimeError(runtimeMsg string) error {
- r := strings.ToLower(runtimeMsg)
- if match, _ := regexp.MatchString(".*permission denied.*|.*operation not permitted.*", r); match {
- return errors.Wrapf(define.ErrOCIRuntimePermissionDenied, "%s", strings.Trim(runtimeMsg, "\n"))
- }
- if match, _ := regexp.MatchString(".*executable file not found in.*|.*no such file or directory.*", r); match {
- return errors.Wrapf(define.ErrOCIRuntimeNotFound, "%s", strings.Trim(runtimeMsg, "\n"))
- }
- return errors.Wrapf(define.ErrOCIRuntime, "%s", strings.Trim(runtimeMsg, "\n"))
-}
-
-// writeConmonPipeData writes nonse data to a pipe
-func writeConmonPipeData(pipe *os.File) error {
- someData := []byte{0}
- _, err := pipe.Write(someData)
- return err
-}
-
-// formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon
-func formatRuntimeOpts(opts ...string) []string {
- args := make([]string, 0, len(opts)*2)
- for _, o := range opts {
- args = append(args, "--runtime-opt", o)
- }
- return args
-}
diff --git a/libpod/oci_linux.go b/libpod/oci_linux.go
deleted file mode 100644
index 9ec074704..000000000
--- a/libpod/oci_linux.go
+++ /dev/null
@@ -1,503 +0,0 @@
-// +build linux
-
-package libpod
-
-import (
- "fmt"
- "os"
- "os/exec"
- "path/filepath"
- "runtime"
- "strconv"
- "strings"
- "syscall"
- "time"
-
- "github.com/containers/libpod/libpod/define"
- "github.com/containers/libpod/pkg/errorhandling"
- "github.com/containers/libpod/pkg/rootless"
- "github.com/containers/libpod/pkg/util"
- "github.com/containers/libpod/utils"
- pmount "github.com/containers/storage/pkg/mount"
- "github.com/pkg/errors"
- "github.com/sirupsen/logrus"
- "golang.org/x/sys/unix"
- "k8s.io/client-go/tools/remotecommand"
-)
-
-// makeAccessible changes the path permission and each parent directory to have --x--x--x
-func makeAccessible(path string, uid, gid int) error {
- for ; path != "/"; path = filepath.Dir(path) {
- st, err := os.Stat(path)
- if err != nil {
- if os.IsNotExist(err) {
- return nil
- }
- return err
- }
- if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid {
- continue
- }
- if st.Mode()&0111 != 0111 {
- if err := os.Chmod(path, st.Mode()|0111); err != nil {
- return err
- }
- }
- }
- return nil
-}
-
-// CreateContainer creates a container in the OCI runtime
-// TODO terminal support for container
-// Presently just ignoring conmon opts related to it
-func (r *OCIRuntime) createContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) {
- if len(ctr.config.IDMappings.UIDMap) != 0 || len(ctr.config.IDMappings.GIDMap) != 0 {
- for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.VolumePath} {
- if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil {
- return err
- }
- }
-
- // if we are running a non privileged container, be sure to umount some kernel paths so they are not
- // bind mounted inside the container at all.
- if !ctr.config.Privileged && !rootless.IsRootless() {
- ch := make(chan error)
- go func() {
- runtime.LockOSThread()
- err := func() error {
- fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid()))
- if err != nil {
- return err
- }
- defer errorhandling.CloseQuiet(fd)
-
- // create a new mountns on the current thread
- if err = unix.Unshare(unix.CLONE_NEWNS); err != nil {
- return err
- }
- defer func() {
- if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil {
- logrus.Errorf("unable to clone new namespace: %q", err)
- }
- }()
-
- // don't spread our mounts around. We are setting only /sys to be slave
- // so that the cleanup process is still able to umount the storage and the
- // changes are propagated to the host.
- err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "")
- if err != nil {
- return errors.Wrapf(err, "cannot make /sys slave")
- }
-
- mounts, err := pmount.GetMounts()
- if err != nil {
- return err
- }
- for _, m := range mounts {
- if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") {
- continue
- }
- err = unix.Unmount(m.Mountpoint, 0)
- if err != nil && !os.IsNotExist(err) {
- return errors.Wrapf(err, "cannot unmount %s", m.Mountpoint)
- }
- }
- return r.createOCIContainer(ctr, restoreOptions)
- }()
- ch <- err
- }()
- err := <-ch
- return err
- }
- }
- return r.createOCIContainer(ctr, restoreOptions)
-}
-
-func (r *OCIRuntime) pathPackage() string {
- return packageVersion(r.path)
-}
-
-func (r *OCIRuntime) conmonPackage() string {
- return packageVersion(r.conmonPath)
-}
-
-// execContainer executes a command in a running container
-// TODO: Add --detach support
-// TODO: Convert to use conmon
-// TODO: add --pid-file and use that to generate exec session tracking
-func (r *OCIRuntime) execContainer(c *Container, cmd, capAdd, env []string, tty bool, cwd, user, sessionID string, streams *AttachStreams, preserveFDs int, resize chan remotecommand.TerminalSize, detachKeys string) (int, chan error, error) {
- if len(cmd) == 0 {
- return -1, nil, errors.Wrapf(define.ErrInvalidArg, "must provide a command to execute")
- }
-
- if sessionID == "" {
- return -1, nil, errors.Wrapf(define.ErrEmptyID, "must provide a session ID for exec")
- }
-
- // create sync pipe to receive the pid
- parentSyncPipe, childSyncPipe, err := newPipe()
- if err != nil {
- return -1, nil, errors.Wrapf(err, "error creating socket pair")
- }
-
- defer errorhandling.CloseQuiet(parentSyncPipe)
-
- // create start pipe to set the cgroup before running
- // attachToExec is responsible for closing parentStartPipe
- childStartPipe, parentStartPipe, err := newPipe()
- if err != nil {
- return -1, nil, errors.Wrapf(err, "error creating socket pair")
- }
-
- // We want to make sure we close the parent{Start,Attach}Pipes if we fail
- // but also don't want to close them after attach to exec is called
- attachToExecCalled := false
-
- defer func() {
- if !attachToExecCalled {
- errorhandling.CloseQuiet(parentStartPipe)
- }
- }()
-
- // create the attach pipe to allow attach socket to be created before
- // $RUNTIME exec starts running. This is to make sure we can capture all output
- // from the process through that socket, rather than half reading the log, half attaching to the socket
- // attachToExec is responsible for closing parentAttachPipe
- parentAttachPipe, childAttachPipe, err := newPipe()
- if err != nil {
- return -1, nil, errors.Wrapf(err, "error creating socket pair")
- }
-
- defer func() {
- if !attachToExecCalled {
- errorhandling.CloseQuiet(parentAttachPipe)
- }
- }()
-
- childrenClosed := false
- defer func() {
- if !childrenClosed {
- errorhandling.CloseQuiet(childSyncPipe)
- errorhandling.CloseQuiet(childAttachPipe)
- errorhandling.CloseQuiet(childStartPipe)
- }
- }()
-
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return -1, nil, err
- }
-
- processFile, err := prepareProcessExec(c, cmd, env, tty, cwd, user, sessionID)
- if err != nil {
- return -1, nil, err
- }
-
- var ociLog string
- if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON {
- ociLog = c.execOCILog(sessionID)
- }
- args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog)
-
- if preserveFDs > 0 {
- args = append(args, formatRuntimeOpts("--preserve-fds", strconv.Itoa(preserveFDs))...)
- }
-
- for _, capability := range capAdd {
- args = append(args, formatRuntimeOpts("--cap", capability)...)
- }
-
- if tty {
- args = append(args, "-t")
- }
-
- // Append container ID and command
- args = append(args, "-e")
- // TODO make this optional when we can detach
- args = append(args, "--exec-attach")
- args = append(args, "--exec-process-spec", processFile.Name())
-
- logrus.WithFields(logrus.Fields{
- "args": args,
- }).Debugf("running conmon: %s", r.conmonPath)
- execCmd := exec.Command(r.conmonPath, args...)
-
- if streams.AttachInput {
- execCmd.Stdin = streams.InputStream
- }
- if streams.AttachOutput {
- execCmd.Stdout = streams.OutputStream
- }
- if streams.AttachError {
- execCmd.Stderr = streams.ErrorStream
- }
-
- conmonEnv, extraFiles, err := r.configureConmonEnv(runtimeDir)
- if err != nil {
- return -1, nil, err
- }
-
- if preserveFDs > 0 {
- for fd := 3; fd < 3+preserveFDs; fd++ {
- execCmd.ExtraFiles = append(execCmd.ExtraFiles, os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)))
- }
- }
-
- // we don't want to step on users fds they asked to preserve
- // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3
- execCmd.Env = append(r.conmonEnv, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4), fmt.Sprintf("_OCI_ATTACHPIPE=%d", preserveFDs+5))
- execCmd.Env = append(execCmd.Env, conmonEnv...)
-
- execCmd.ExtraFiles = append(execCmd.ExtraFiles, childSyncPipe, childStartPipe, childAttachPipe)
- execCmd.ExtraFiles = append(execCmd.ExtraFiles, extraFiles...)
- execCmd.Dir = c.execBundlePath(sessionID)
- execCmd.SysProcAttr = &syscall.SysProcAttr{
- Setpgid: true,
- }
-
- err = startCommandGivenSelinux(execCmd)
-
- // We don't need children pipes on the parent side
- errorhandling.CloseQuiet(childSyncPipe)
- errorhandling.CloseQuiet(childAttachPipe)
- errorhandling.CloseQuiet(childStartPipe)
- childrenClosed = true
-
- if err != nil {
- return -1, nil, errors.Wrapf(err, "cannot start container %s", c.ID())
- }
- if err := r.moveConmonToCgroupAndSignal(c, execCmd, parentStartPipe, sessionID); err != nil {
- return -1, nil, err
- }
-
- if preserveFDs > 0 {
- for fd := 3; fd < 3+preserveFDs; fd++ {
- // These fds were passed down to the runtime. Close them
- // and not interfere
- if err := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)).Close(); err != nil {
- logrus.Debugf("unable to close file fd-%d", fd)
- }
- }
- }
-
- // TODO Only create if !detach
- // Attach to the container before starting it
- attachChan := make(chan error)
- go func() {
- // attachToExec is responsible for closing pipes
- attachChan <- c.attachToExec(streams, detachKeys, resize, sessionID, parentStartPipe, parentAttachPipe)
- close(attachChan)
- }()
- attachToExecCalled = true
-
- pid, err := readConmonPipeData(parentSyncPipe, ociLog)
-
- return pid, attachChan, err
-}
-
-// Wait for a container which has been sent a signal to stop
-func waitContainerStop(ctr *Container, timeout time.Duration) error {
- done := make(chan struct{})
- chControl := make(chan struct{})
- go func() {
- for {
- select {
- case <-chControl:
- return
- default:
- // Check if the process is still around
- err := unix.Kill(ctr.state.PID, 0)
- if err == unix.ESRCH {
- close(done)
- return
- }
- time.Sleep(100 * time.Millisecond)
- }
- }
- }()
- select {
- case <-done:
- return nil
- case <-time.After(timeout):
- close(chControl)
- logrus.Debugf("container %s did not die within timeout %d", ctr.ID(), timeout)
- return errors.Errorf("container %s did not die within timeout", ctr.ID())
- }
-}
-
-// Wait for a set of given PIDs to stop
-func waitPidsStop(pids []int, timeout time.Duration) error {
- done := make(chan struct{})
- chControl := make(chan struct{})
- go func() {
- for {
- select {
- case <-chControl:
- return
- default:
- allClosed := true
- for _, pid := range pids {
- if err := unix.Kill(pid, 0); err != unix.ESRCH {
- allClosed = false
- break
- }
- }
- if allClosed {
- close(done)
- return
- }
- time.Sleep(100 * time.Millisecond)
- }
- }
- }()
- select {
- case <-done:
- return nil
- case <-time.After(timeout):
- close(chControl)
- return errors.Errorf("given PIDs did not die within timeout")
- }
-}
-
-// stopContainer stops a container, first using its given stop signal (or
-// SIGTERM if no signal was specified), then using SIGKILL
-// Timeout is given in seconds. If timeout is 0, the container will be
-// immediately kill with SIGKILL
-// Does not set finished time for container, assumes you will run updateStatus
-// after to pull the exit code
-func (r *OCIRuntime) stopContainer(ctr *Container, timeout uint) error {
- logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID)
-
- // Ping the container to see if it's alive
- // If it's not, it's already stopped, return
- err := unix.Kill(ctr.state.PID, 0)
- if err == unix.ESRCH {
- return nil
- }
-
- stopSignal := ctr.config.StopSignal
- if stopSignal == 0 {
- stopSignal = uint(syscall.SIGTERM)
- }
-
- if timeout > 0 {
- if err := r.killContainer(ctr, stopSignal); err != nil {
- // Is the container gone?
- // If so, it probably died between the first check and
- // our sending the signal
- // The container is stopped, so exit cleanly
- err := unix.Kill(ctr.state.PID, 0)
- if err == unix.ESRCH {
- return nil
- }
-
- return err
- }
-
- if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil {
- logrus.Warnf("Timed out stopping container %s, resorting to SIGKILL", ctr.ID())
- } else {
- // No error, the container is dead
- return nil
- }
- }
-
- var args []string
- if rootless.IsRootless() || ctr.config.NoCgroups {
- // we don't use --all for rootless containers as the OCI runtime might use
- // the cgroups to determine the PIDs, but for rootless containers there is
- // not any.
- // Same logic for NoCgroups - we can't use cgroups as the user
- // explicitly requested none be created.
- args = []string{"kill", ctr.ID(), "KILL"}
- } else {
- args = []string{"kill", "--all", ctr.ID(), "KILL"}
- }
-
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
- if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil {
- // Again, check if the container is gone. If it is, exit cleanly.
- err := unix.Kill(ctr.state.PID, 0)
- if err == unix.ESRCH {
- return nil
- }
-
- return errors.Wrapf(err, "error sending SIGKILL to container %s", ctr.ID())
- }
-
- // Give runtime a few seconds to make it happen
- if err := waitContainerStop(ctr, killContainerTimeout); err != nil {
- return err
- }
-
- return nil
-}
-
-// execStopContainer stops all active exec sessions in a container
-// It will also stop all other processes in the container. It is only intended
-// to be used to assist in cleanup when removing a container.
-// SIGTERM is used by default to stop processes. If SIGTERM fails, SIGKILL will be used.
-func (r *OCIRuntime) execStopContainer(ctr *Container, timeout uint) error {
- // Do we have active exec sessions?
- if len(ctr.state.ExecSessions) == 0 {
- return nil
- }
-
- // Get a list of active exec sessions
- execSessions := []int{}
- for _, session := range ctr.state.ExecSessions {
- pid := session.PID
- // Ping the PID with signal 0 to see if it still exists
- if err := unix.Kill(pid, 0); err == unix.ESRCH {
- continue
- }
-
- execSessions = append(execSessions, pid)
- }
-
- // All the sessions may be dead
- // If they are, just return
- if len(execSessions) == 0 {
- return nil
- }
- runtimeDir, err := util.GetRuntimeDir()
- if err != nil {
- return err
- }
- env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
-
- // If timeout is 0, just use SIGKILL
- if timeout > 0 {
- // Stop using SIGTERM by default
- // Use SIGSTOP after a timeout
- logrus.Debugf("Killing all processes in container %s with SIGTERM", ctr.ID())
- if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", "--all", ctr.ID(), "TERM"); err != nil {
- return errors.Wrapf(err, "error sending SIGTERM to container %s processes", ctr.ID())
- }
-
- // Wait for all processes to stop
- if err := waitPidsStop(execSessions, time.Duration(timeout)*time.Second); err != nil {
- logrus.Warnf("Timed out stopping container %s exec sessions", ctr.ID())
- } else {
- // No error, all exec sessions are dead
- return nil
- }
- }
-
- // Send SIGKILL
- logrus.Debugf("Killing all processes in container %s with SIGKILL", ctr.ID())
- if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", "--all", ctr.ID(), "KILL"); err != nil {
- return errors.Wrapf(err, "error sending SIGKILL to container %s processes", ctr.ID())
- }
-
- // Give the processes a few seconds to go down
- if err := waitPidsStop(execSessions, killContainerTimeout); err != nil {
- return errors.Wrapf(err, "failed to kill container %s exec sessions", ctr.ID())
- }
-
- return nil
-}
diff --git a/libpod/oci_unsupported.go b/libpod/oci_unsupported.go
deleted file mode 100644
index 4a65d4d1d..000000000
--- a/libpod/oci_unsupported.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// +build !linux
-
-package libpod
-
-import (
- "os"
- "os/exec"
-
- "github.com/containers/libpod/libpod/define"
- "k8s.io/client-go/tools/remotecommand"
-)
-
-func (r *OCIRuntime) moveConmonToCgroup(ctr *Container, cgroupParent string, cmd *exec.Cmd) error {
- return define.ErrOSNotSupported
-}
-
-func newPipe() (parent *os.File, child *os.File, err error) {
- return nil, nil, define.ErrNotImplemented
-}
-
-func (r *OCIRuntime) createContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (err error) {
- return define.ErrNotImplemented
-}
-
-func (r *OCIRuntime) pathPackage() string {
- return ""
-}
-
-func (r *OCIRuntime) conmonPackage() string {
- return ""
-}
-
-func (r *OCIRuntime) createOCIContainer(ctr *Container, cgroupParent string, restoreOptions *ContainerCheckpointOptions) (err error) {
- return define.ErrOSNotSupported
-}
-
-func (r *OCIRuntime) execStopContainer(ctr *Container, timeout uint) error {
- return define.ErrOSNotSupported
-}
-
-func (r *OCIRuntime) stopContainer(ctr *Container, timeout uint) error {
- return define.ErrOSNotSupported
-}
-
-func (r *OCIRuntime) execContainer(c *Container, cmd, capAdd, env []string, tty bool, cwd, user, sessionID string, streams *AttachStreams, preserveFDs int, resize chan remotecommand.TerminalSize, detachKeys string) (int, chan error, error) {
- return -1, nil, define.ErrOSNotSupported
-}
diff --git a/libpod/oci_util.go b/libpod/oci_util.go
new file mode 100644
index 000000000..cb85b153d
--- /dev/null
+++ b/libpod/oci_util.go
@@ -0,0 +1,113 @@
+package libpod
+
+import (
+ "fmt"
+ "net"
+ "os"
+ "regexp"
+ "strings"
+ "time"
+
+ "github.com/containers/libpod/libpod/define"
+ "github.com/cri-o/ocicni/pkg/ocicni"
+ "github.com/pkg/errors"
+ "github.com/sirupsen/logrus"
+)
+
+const (
+ // CgroupfsCgroupsManager represents cgroupfs native cgroup manager
+ CgroupfsCgroupsManager = "cgroupfs"
+ // SystemdCgroupsManager represents systemd native cgroup manager
+ SystemdCgroupsManager = "systemd"
+
+ // ContainerCreateTimeout is the timeout before we decide we've failed
+ // to create a container.
+ // TODO: Make this generic - all OCI runtime operations should use the
+ // same timeout, this one.
+ // TODO: Consider dropping from 240 to 60 seconds. I don't think waiting
+ // 4 minutes versus 1 minute makes a real difference.
+ ContainerCreateTimeout = 240 * time.Second
+
+ // Timeout before declaring that runtime has failed to kill a given
+ // container
+ killContainerTimeout = 5 * time.Second
+ // DefaultShmSize is the default shm size
+ DefaultShmSize = 64 * 1024 * 1024
+ // NsRunDir is the default directory in which running network namespaces
+ // are stored
+ NsRunDir = "/var/run/netns"
+)
+
+// ociError is used to parse the OCI runtime JSON log. It is not part of the
+// OCI runtime specifications, it follows what runc does
+type ociError struct {
+ Level string `json:"level,omitempty"`
+ Time string `json:"time,omitempty"`
+ Msg string `json:"msg,omitempty"`
+}
+
+// Create systemd unit name for cgroup scopes
+func createUnitName(prefix string, name string) string {
+ return fmt.Sprintf("%s-%s.scope", prefix, name)
+}
+
+// bindPorts binds the given host ports to reserve them on the host, returning the open socket files
+func bindPorts(ports []ocicni.PortMapping) ([]*os.File, error) {
+ var files []*os.File
+ notifySCTP := false
+ for _, i := range ports {
+ switch i.Protocol {
+ case "udp":
+ addr, err := net.ResolveUDPAddr("udp", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort))
+ if err != nil {
+ return nil, errors.Wrapf(err, "cannot resolve the UDP address")
+ }
+
+ server, err := net.ListenUDP("udp", addr)
+ if err != nil {
+ return nil, errors.Wrapf(err, "cannot listen on the UDP port")
+ }
+ f, err := server.File()
+ if err != nil {
+ return nil, errors.Wrapf(err, "cannot get file for UDP socket")
+ }
+ files = append(files, f)
+
+ case "tcp":
+ addr, err := net.ResolveTCPAddr("tcp4", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort))
+ if err != nil {
+ return nil, errors.Wrapf(err, "cannot resolve the TCP address")
+ }
+
+ server, err := net.ListenTCP("tcp4", addr)
+ if err != nil {
+ return nil, errors.Wrapf(err, "cannot listen on the TCP port")
+ }
+ f, err := server.File()
+ if err != nil {
+ return nil, errors.Wrapf(err, "cannot get file for TCP socket")
+ }
+ files = append(files, f)
+ case "sctp":
+ if !notifySCTP {
+ notifySCTP = true
+ logrus.Warnf("port reservation for SCTP is not supported")
+ }
+ default:
+ return nil, fmt.Errorf("unknown protocol %s", i.Protocol)
+
+ }
+ }
+ return files, nil
+}
+
+func getOCIRuntimeError(runtimeMsg string) error {
+ r := strings.ToLower(runtimeMsg)
+ if match, _ := regexp.MatchString(".*permission denied.*|.*operation not permitted.*", r); match {
+ return errors.Wrapf(define.ErrOCIRuntimePermissionDenied, "%s", strings.Trim(runtimeMsg, "\n"))
+ }
+ if match, _ := regexp.MatchString(".*executable file not found in.*|.*no such file or directory.*", r); match {
+ return errors.Wrapf(define.ErrOCIRuntimeNotFound, "%s", strings.Trim(runtimeMsg, "\n"))
+ }
+ return errors.Wrapf(define.ErrOCIRuntime, "%s", strings.Trim(runtimeMsg, "\n"))
+}
diff --git a/libpod/options.go b/libpod/options.go
index ee44439ac..ddc5993af 100644
--- a/libpod/options.go
+++ b/libpod/options.go
@@ -463,6 +463,28 @@ func WithMigrate() RuntimeOption {
}
}
+// WithMigrateRuntime instructs Libpod to change the default OCI runtime on all
+// containers during a migration. This is not used if `WithMigrate()` is not
+// also passed.
+// Libpod makes no promises that your containers continue to work with the new
+// runtime - migrations between dissimilar runtimes may well break things.
+// Use with caution.
+func WithMigrateRuntime(requestedRuntime string) RuntimeOption {
+ return func(rt *Runtime) error {
+ if rt.valid {
+ return define.ErrRuntimeFinalized
+ }
+
+ if requestedRuntime == "" {
+ return errors.Wrapf(define.ErrInvalidArg, "must provide a non-empty name for new runtime")
+ }
+
+ rt.migrateRuntime = requestedRuntime
+
+ return nil
+ }
+}
+
// WithEventsLogger sets the events backend to use.
// Currently supported values are "file" for file backend and "journald" for
// journald backend.
diff --git a/libpod/pod_api.go b/libpod/pod_api.go
index 7c786b835..3a194f04b 100644
--- a/libpod/pod_api.go
+++ b/libpod/pod_api.go
@@ -123,7 +123,7 @@ func (p *Pod) StopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m
if timeout > -1 {
stopTimeout = uint(timeout)
}
- if err := ctr.stop(stopTimeout); err != nil {
+ if err := ctr.stop(stopTimeout, false); err != nil {
ctr.lock.Unlock()
ctrErrors[ctr.ID()] = err
continue
@@ -370,7 +370,7 @@ func (p *Pod) Kill(signal uint) (map[string]error, error) {
continue
}
- if err := ctr.ociRuntime.killContainer(ctr, signal); err != nil {
+ if err := ctr.ociRuntime.KillContainer(ctr, signal, false); err != nil {
ctr.lock.Unlock()
ctrErrors[ctr.ID()] = err
continue
diff --git a/libpod/runtime.go b/libpod/runtime.go
index cdb5670ba..a0cf0ad7c 100644
--- a/libpod/runtime.go
+++ b/libpod/runtime.go
@@ -99,8 +99,8 @@ type Runtime struct {
store storage.Store
storageService *storageService
imageContext *types.SystemContext
- defaultOCIRuntime *OCIRuntime
- ociRuntimes map[string]*OCIRuntime
+ defaultOCIRuntime OCIRuntime
+ ociRuntimes map[string]OCIRuntime
netPlugin ocicni.CNIPlugin
conmonPath string
imageRuntime *image.Runtime
@@ -114,6 +114,10 @@ type Runtime struct {
doRenumber bool
doMigrate bool
+ // System migrate can move containers to a new runtime.
+ // We make no promises that these migrated containers work on the new
+ // runtime, though.
+ migrateRuntime string
// valid indicates whether the runtime is ready to use.
// valid is set to true when a runtime is returned from GetRuntime(),
@@ -1053,7 +1057,7 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (err error) {
}
// Get us at least one working OCI runtime.
- runtime.ociRuntimes = make(map[string]*OCIRuntime)
+ runtime.ociRuntimes = make(map[string]OCIRuntime)
// Is the old runtime_path defined?
if runtime.config.RuntimePath != nil {
@@ -1072,7 +1076,7 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (err error) {
json := supportsJSON[name]
nocgroups := supportsNoCgroups[name]
- ociRuntime, err := newOCIRuntime(name, runtime.config.RuntimePath, runtime.conmonPath, runtime.config, json, nocgroups)
+ ociRuntime, err := newConmonOCIRuntime(name, runtime.config.RuntimePath, runtime.conmonPath, runtime.config, json, nocgroups)
if err != nil {
return err
}
@@ -1086,7 +1090,7 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (err error) {
json := supportsJSON[name]
nocgroups := supportsNoCgroups[name]
- ociRuntime, err := newOCIRuntime(name, paths, runtime.conmonPath, runtime.config, json, nocgroups)
+ ociRuntime, err := newConmonOCIRuntime(name, paths, runtime.conmonPath, runtime.config, json, nocgroups)
if err != nil {
// Don't fatally error.
// This will allow us to ship configs including optional
@@ -1109,7 +1113,7 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (err error) {
json := supportsJSON[name]
nocgroups := supportsNoCgroups[name]
- ociRuntime, err := newOCIRuntime(name, []string{runtime.config.OCIRuntime}, runtime.conmonPath, runtime.config, json, nocgroups)
+ ociRuntime, err := newConmonOCIRuntime(name, []string{runtime.config.OCIRuntime}, runtime.conmonPath, runtime.config, json, nocgroups)
if err != nil {
return err
}
@@ -1474,6 +1478,11 @@ func (r *Runtime) SystemContext() *types.SystemContext {
return r.imageContext
}
+// GetOCIRuntimePath retrieves the path of the default OCI runtime.
+func (r *Runtime) GetOCIRuntimePath() string {
+ return r.defaultOCIRuntime.Path()
+}
+
// Since runc does not currently support cgroupV2
// Change to default crun on first running of libpod.conf
// TODO Once runc has support for cgroups, this function should be removed.
diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go
index 78176a400..411264d25 100644
--- a/libpod/runtime_ctr.go
+++ b/libpod/runtime_ctr.go
@@ -102,7 +102,7 @@ func (r *Runtime) initContainerVariables(rSpec *spec.Spec, config *ContainerConf
ctr.config.StopTimeout = define.CtrRemoveTimeout
- ctr.config.OCIRuntime = r.defaultOCIRuntime.name
+ ctr.config.OCIRuntime = r.defaultOCIRuntime.Name()
// Set namespace based on current runtime namespace
// Do so before options run so they can override it
@@ -167,8 +167,8 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (c *Contai
// Check NoCgroups support
if ctr.config.NoCgroups {
- if !ctr.ociRuntime.supportsNoCgroups {
- return nil, errors.Wrapf(define.ErrInvalidArg, "requested OCI runtime %s is not compatible with NoCgroups", ctr.ociRuntime.name)
+ if !ctr.ociRuntime.SupportsNoCgroups() {
+ return nil, errors.Wrapf(define.ErrInvalidArg, "requested OCI runtime %s is not compatible with NoCgroups", ctr.ociRuntime.Name())
}
}
@@ -264,6 +264,14 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (c *Contai
g.RemoveMount("/etc/hosts")
g.RemoveMount("/run/.containerenv")
g.RemoveMount("/run/secrets")
+
+ // Regenerate CGroup paths so they don't point to the old
+ // container ID.
+ cgroupPath, err := ctr.getOCICgroupPath()
+ if err != nil {
+ return nil, err
+ }
+ g.SetLinuxCgroupsPath(cgroupPath)
}
// Set up storage for the container
@@ -430,7 +438,7 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force bool,
}
if c.state.State == define.ContainerStatePaused {
- if err := c.ociRuntime.killContainer(c, 9); err != nil {
+ if err := c.ociRuntime.KillContainer(c, 9, false); err != nil {
return err
}
if err := c.unpause(); err != nil {
@@ -444,15 +452,15 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force bool,
// Check that the container's in a good state to be removed
if c.state.State == define.ContainerStateRunning {
- if err := c.stop(c.StopTimeout()); err != nil {
+ if err := c.stop(c.StopTimeout(), true); err != nil {
return errors.Wrapf(err, "cannot remove container %s as it could not be stopped", c.ID())
}
}
// Check that all of our exec sessions have finished
- if len(c.state.ExecSessions) != 0 {
- if err := c.ociRuntime.execStopContainer(c, c.StopTimeout()); err != nil {
- return err
+ for _, session := range c.state.ExecSessions {
+ if err := c.ociRuntime.ExecStopContainer(c, session.ID, c.StopTimeout()); err != nil {
+ return errors.Wrapf(err, "error stopping exec session %s of container %s", session.ID, c.ID())
}
}
diff --git a/libpod/runtime_migrate.go b/libpod/runtime_migrate.go
index c363991e6..d85652232 100644
--- a/libpod/runtime_migrate.go
+++ b/libpod/runtime_migrate.go
@@ -5,14 +5,15 @@ package libpod
import (
"context"
"fmt"
- "github.com/containers/libpod/pkg/util"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"syscall"
+ "github.com/containers/libpod/libpod/define"
"github.com/containers/libpod/pkg/rootless"
+ "github.com/containers/libpod/pkg/util"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
@@ -63,11 +64,34 @@ func (r *Runtime) migrate(ctx context.Context) error {
}
}
+ // Did the user request a new runtime?
+ runtimeChangeRequested := r.migrateRuntime != ""
+ requestedRuntime, runtimeExists := r.ociRuntimes[r.migrateRuntime]
+ if !runtimeExists && runtimeChangeRequested {
+ return errors.Wrapf(define.ErrInvalidArg, "change to runtime %q requested but no such runtime is defined", r.migrateRuntime)
+ }
+
for _, ctr := range allCtrs {
+ needsWrite := false
+
+ // Reset conmon PID file location
oldLocation := filepath.Join(ctr.state.RunDir, "conmon.pid")
if ctr.config.ConmonPidFile == oldLocation {
logrus.Infof("changing conmon PID file for %s", ctr.ID())
ctr.config.ConmonPidFile = filepath.Join(ctr.config.StaticDir, "conmon.pid")
+ needsWrite = true
+ }
+
+ // Reset runtime
+ if runtimeChangeRequested {
+ logrus.Infof("Resetting container %s runtime to runtime %s", ctr.ID(), r.migrateRuntime)
+ ctr.config.OCIRuntime = r.migrateRuntime
+ ctr.ociRuntime = requestedRuntime
+
+ needsWrite = true
+ }
+
+ if needsWrite {
if err := r.state.RewriteContainerConfig(ctr, ctr.config); err != nil {
return errors.Wrapf(err, "error rewriting config for container %s", ctr.ID())
}