aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGiuseppe Scrivano <gscrivan@redhat.com>2020-06-18 13:56:30 +0200
committerGiuseppe Scrivano <gscrivan@redhat.com>2020-06-25 17:16:12 +0200
commit6ee5f740a4ecb70636b888e78b02065ee984636c (patch)
treefb01a86c33a53b78420426a6780ce938b25d068f
parent35cca198b31582b75304e3614f8942b83a38422e (diff)
downloadpodman-6ee5f740a4ecb70636b888e78b02065ee984636c.tar.gz
podman-6ee5f740a4ecb70636b888e78b02065ee984636c.tar.bz2
podman-6ee5f740a4ecb70636b888e78b02065ee984636c.zip
podman: add new cgroup mode split
When running under systemd there is no need to create yet another cgroup for the container. With conmon-delegated the current cgroup will be split in two sub cgroups: - supervisor - container The supervisor cgroup will hold conmon and the podman process, while the container cgroup is used by the OCI runtime (using the cgroupfs backend). Closes: https://github.com/containers/libpod/issues/6400 Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
-rw-r--r--cmd/podman/common/create.go2
-rw-r--r--cmd/podman/containers/create.go4
-rw-r--r--docs/source/markdown/podman-create.1.md5
-rw-r--r--docs/source/markdown/podman-run.1.md10
-rw-r--r--libpod/container.go22
-rw-r--r--libpod/container_internal_linux.go12
-rw-r--r--libpod/container_validate.go4
-rw-r--r--libpod/oci_conmon.go7
-rw-r--r--libpod/oci_conmon_linux.go10
-rw-r--r--libpod/options.go2
-rw-r--r--libpod/runtime_ctr.go4
-rw-r--r--utils/utils_supported.go122
-rw-r--r--utils/utils_windows.go12
13 files changed, 201 insertions, 15 deletions
diff --git a/cmd/podman/common/create.go b/cmd/podman/common/create.go
index fbb7f449e..ec3251ae1 100644
--- a/cmd/podman/common/create.go
+++ b/cmd/podman/common/create.go
@@ -57,7 +57,7 @@ func GetCreateFlags(cf *ContainerCLIOpts) *pflag.FlagSet {
createFlags.StringVar(
&cf.CGroupsMode,
"cgroups", containerConfig.Cgroups(),
- `control container cgroup configuration ("enabled"|"disabled"|"no-conmon")`,
+ `control container cgroup configuration ("enabled"|"disabled"|"no-conmon"|"split")`,
)
createFlags.StringVar(
&cf.CGroupParent,
diff --git a/cmd/podman/containers/create.go b/cmd/podman/containers/create.go
index 45ce00c86..68a825231 100644
--- a/cmd/podman/containers/create.go
+++ b/cmd/podman/containers/create.go
@@ -205,6 +205,10 @@ func createInit(c *cobra.Command) error {
cliVals.Env = env
}
+ if c.Flag("cgroups").Changed && cliVals.CGroupsMode == "split" && registry.IsRemote() {
+ return errors.Errorf("the option --cgroups=%q is not supported in remote mode", cliVals.CGroupsMode)
+ }
+
// Docker-compatibility: the "-h" flag for run/create is reserved for
// the hostname (see https://github.com/containers/libpod/issues/1367).
diff --git a/docs/source/markdown/podman-create.1.md b/docs/source/markdown/podman-create.1.md
index 3ec91a3ad..ded668f34 100644
--- a/docs/source/markdown/podman-create.1.md
+++ b/docs/source/markdown/podman-create.1.md
@@ -78,9 +78,12 @@ If the host uses cgroups v1, the default is set to **host**. On cgroups v2 the
**--cgroups**=*mode*
Determines whether the container will create CGroups.
-Valid values are *enabled*, *disabled*, *no-conmon*, which the default being *enabled*.
+Valid values are *enabled*, *disabled*, *no-conmon*, *split*, which the default being *enabled*.
+
+The *enabled* option will create a new cgroup under the cgroup-parent.
The *disabled* option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**).
The *no-conmon* option disables a new CGroup only for the conmon process.
+The *split* option splits the current cgroup in two sub-cgroups: one for conmon and one for the container payload. It is not possible to set *--cgroup-parent* with *split*.
**--cgroup-parent**=*path*
diff --git a/docs/source/markdown/podman-run.1.md b/docs/source/markdown/podman-run.1.md
index 88666d595..83971107f 100644
--- a/docs/source/markdown/podman-run.1.md
+++ b/docs/source/markdown/podman-run.1.md
@@ -89,14 +89,16 @@ Set the cgroup namespace mode for the container.
If the host uses cgroups v1, the default is set to **host**. On cgroups v2, the default is **private**.
-**--cgroups**=**enabled**|**disabled**|**no-conmon**
+**--cgroups**=**enabled**|**disabled**|**no-conmon**|**split**
Determines whether the container will create CGroups.
-Default is **enabled**. The **disabled** option will force the container
-to not create CGroups, and thus conflicts with CGroup options
-(**--cgroupns** and **--cgroup-parent**).
+Default is **enabled**.
+
+The **enabled** option will create a new cgroup under the cgroup-parent.
+The **disabled** option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**).
The **no-conmon** option disables a new CGroup only for the **conmon** process.
+The **split** option splits the current cgroup in two sub-cgroups: one for conmon and one for the container payload. It is not possible to set **--cgroup-parent** with **split**.
**--cgroup-parent**=*path*
diff --git a/libpod/container.go b/libpod/container.go
index c85249676..20688e3ee 100644
--- a/libpod/container.go
+++ b/libpod/container.go
@@ -17,6 +17,7 @@ import (
"github.com/containers/libpod/libpod/lock"
"github.com/containers/libpod/pkg/namespaces"
"github.com/containers/libpod/pkg/rootless"
+ "github.com/containers/libpod/utils"
"github.com/containers/storage"
"github.com/cri-o/ocicni/pkg/ocicni"
spec "github.com/opencontainers/runtime-spec/specs-go"
@@ -1089,10 +1090,25 @@ func (c *Container) NamespacePath(linuxNS LinuxNS) (string, error) { //nolint:in
// CGroupPath returns a cgroups "path" for a given container.
func (c *Container) CGroupPath() (string, error) {
- switch c.runtime.config.Engine.CgroupManager {
- case config.CgroupfsCgroupsManager:
+ switch {
+ case c.config.CgroupsMode == cgroupSplit:
+ if c.config.CgroupParent != "" {
+ return "", errors.Errorf("cannot specify cgroup-parent with cgroup-mode %q", cgroupSplit)
+ }
+ cg, err := utils.GetCgroupProcess(c.state.ConmonPID)
+ if err != nil {
+ return "", err
+ }
+ // Use the conmon cgroup for two reasons: we validate the container
+ // delegation was correct, and the conmon cgroup doesn't change at runtime
+ // while we are not sure about the container that can create sub cgroups.
+ if !strings.HasSuffix(cg, "supervisor") {
+ return "", errors.Errorf("invalid cgroup for conmon %q", cg)
+ }
+ return strings.TrimSuffix(cg, "/supervisor") + "/container", nil
+ case c.runtime.config.Engine.CgroupManager == config.CgroupfsCgroupsManager:
return filepath.Join(c.config.CgroupParent, fmt.Sprintf("libpod-%s", c.ID())), nil
- case config.SystemdCgroupsManager:
+ case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager:
if rootless.IsRootless() {
uid := rootless.GetRootlessUID()
parts := strings.SplitN(c.config.CgroupParent, "/", 2)
diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go
index 5ee6726e0..2c78f6bd2 100644
--- a/libpod/container_internal_linux.go
+++ b/libpod/container_internal_linux.go
@@ -31,6 +31,7 @@ import (
"github.com/containers/libpod/pkg/resolvconf"
"github.com/containers/libpod/pkg/rootless"
"github.com/containers/libpod/pkg/util"
+ "github.com/containers/libpod/utils"
"github.com/containers/storage/pkg/archive"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/opencontainers/runc/libcontainer/user"
@@ -1505,8 +1506,17 @@ func (c *Container) getOCICgroupPath() (string, error) {
switch {
case (rootless.IsRootless() && !unified) || c.config.NoCgroups:
return "", nil
+ case c.config.CgroupsMode == cgroupSplit:
+ if c.config.CgroupParent != "" {
+ return c.config.CgroupParent, nil
+ }
+ selfCgroup, err := utils.GetOwnCgroup()
+ if err != nil {
+ return "", err
+ }
+ return filepath.Join(selfCgroup, "container"), nil
case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager:
- // When runc is set to use Systemd as a cgroup manager, it
+ // When the OCI runtime is set to use Systemd as a cgroup manager, it
// expects cgroups to be passed as follows:
// slice:prefix:name
systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID())
diff --git a/libpod/container_validate.go b/libpod/container_validate.go
index b7f0aadff..a53a1839d 100644
--- a/libpod/container_validate.go
+++ b/libpod/container_validate.go
@@ -34,6 +34,10 @@ func (c *Container) validate() error {
return errors.Wrapf(define.ErrInvalidArg, "cannot both create a network namespace and join another container's network namespace")
}
+ if c.config.CgroupsMode == cgroupSplit && c.config.CgroupParent != "" {
+ return errors.Wrapf(define.ErrInvalidArg, "cannot specify --cgroup-mode=split with a cgroup-parent")
+ }
+
// Not creating cgroups has a number of requirements, mostly related to
// the PID namespace.
if c.config.NoCgroups || c.config.CgroupsMode == "disabled" {
diff --git a/libpod/oci_conmon.go b/libpod/oci_conmon.go
new file mode 100644
index 000000000..74060b357
--- /dev/null
+++ b/libpod/oci_conmon.go
@@ -0,0 +1,7 @@
+package libpod
+
+const (
+ // cgroupSplit is the cgroup mode for reusing the current cgroup both
+ // for conmon and for the container payload.
+ cgroupSplit = "split"
+)
diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go
index d8a89047e..26e5d70b0 100644
--- a/libpod/oci_conmon_linux.go
+++ b/libpod/oci_conmon_linux.go
@@ -881,6 +881,12 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
return err
}
+ if ctr.config.CgroupsMode == cgroupSplit {
+ if err := utils.MoveUnderCgroupSubtree("supervisor"); err != nil {
+ return err
+ }
+ }
+
args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag)
if ctr.config.Spec.Process.Terminal {
@@ -1173,7 +1179,7 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
"--socket-dir-path", r.socketsDir,
}
- if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups {
+ if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit {
args = append(args, "-s")
}
@@ -1275,7 +1281,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec
// If cgroup creation is disabled - just signal.
switch ctr.config.CgroupsMode {
- case "disabled", "no-conmon":
+ case "disabled", "no-conmon", cgroupSplit:
mustCreateCgroup = false
}
diff --git a/libpod/options.go b/libpod/options.go
index 7a60870a0..28be1bc03 100644
--- a/libpod/options.go
+++ b/libpod/options.go
@@ -1049,7 +1049,7 @@ func WithCgroupsMode(mode string) CtrCreateOption {
case "disabled":
ctr.config.NoCgroups = true
ctr.config.CgroupsMode = mode
- case "enabled", "no-conmon":
+ case "enabled", "no-conmon", cgroupSplit:
ctr.config.CgroupsMode = mode
default:
return errors.Wrapf(define.ErrInvalidArg, "Invalid cgroup mode %q", mode)
diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go
index f1752cbeb..dd6602acb 100644
--- a/libpod/runtime_ctr.go
+++ b/libpod/runtime_ctr.go
@@ -233,9 +233,9 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai
return nil, errors.Wrapf(err, "error retrieving pod %s cgroup", pod.ID())
}
ctr.config.CgroupParent = podCgroup
- case rootless.IsRootless():
+ case rootless.IsRootless() && ctr.config.CgroupsMode != cgroupSplit:
ctr.config.CgroupParent = SystemdDefaultRootlessCgroupParent
- default:
+ case ctr.config.CgroupsMode != cgroupSplit:
ctr.config.CgroupParent = SystemdDefaultCgroupParent
}
} else if len(ctr.config.CgroupParent) < 6 || !strings.HasSuffix(path.Base(ctr.config.CgroupParent), ".slice") {
diff --git a/utils/utils_supported.go b/utils/utils_supported.go
index ce9fd5604..201ddb57b 100644
--- a/utils/utils_supported.go
+++ b/utils/utils_supported.go
@@ -3,10 +3,20 @@
package utils
import (
+ "bufio"
+ "bytes"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strings"
+
"github.com/containers/libpod/pkg/cgroups"
"github.com/containers/libpod/pkg/rootless"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/godbus/dbus/v5"
+ "github.com/pkg/errors"
+ "github.com/sirupsen/logrus"
)
// RunUnderSystemdScope adds the specified pid to a systemd scope
@@ -43,6 +53,118 @@ func RunUnderSystemdScope(pid int, slice string, unitName string) error {
return nil
}
+func getCgroupProcess(procFile string) (string, error) {
+ f, err := os.Open(procFile)
+ if err != nil {
+ return "", errors.Wrapf(err, "open file %q", procFile)
+ }
+ defer f.Close()
+
+ scanner := bufio.NewScanner(f)
+ cgroup := "/"
+ for scanner.Scan() {
+ line := scanner.Text()
+ parts := strings.Split(line, ":")
+ if len(parts) != 3 {
+ return "", errors.Errorf("cannot parse cgroup line %q", line)
+ }
+ if strings.HasPrefix(line, "0::") {
+ cgroup = line[3:]
+ break
+ }
+ // root cgroup, skip it
+ if parts[2] == "/" {
+ continue
+ }
+ // The process must have the same cgroup path for all controllers
+ // The OCI runtime spec file allow us to specify only one path.
+ if cgroup != "/" && cgroup != parts[2] {
+ return "", errors.Errorf("cgroup configuration not supported, the process is in two different cgroups")
+ }
+ cgroup = parts[2]
+ }
+ if cgroup == "/" {
+ return "", errors.Errorf("could not find cgroup mount in %q", procFile)
+ }
+ return cgroup, nil
+}
+
+// GetOwnCgroup returns the cgroup for the current process.
+func GetOwnCgroup() (string, error) {
+ return getCgroupProcess("/proc/self/cgroup")
+}
+
+// GetCgroupProcess returns the cgroup for the specified process process.
+func GetCgroupProcess(pid int) (string, error) {
+ return getCgroupProcess(fmt.Sprintf("/proc/%d/cgroup", pid))
+}
+
+// MoveUnderCgroupSubtree moves the PID under a cgroup subtree.
+func MoveUnderCgroupSubtree(subtree string) error {
+ procFile := "/proc/self/cgroup"
+ f, err := os.Open(procFile)
+ if err != nil {
+ return errors.Wrapf(err, "open file %q", procFile)
+ }
+ defer f.Close()
+
+ unifiedMode, err := cgroups.IsCgroup2UnifiedMode()
+ if err != nil {
+ return err
+ }
+
+ scanner := bufio.NewScanner(f)
+ for scanner.Scan() {
+ line := scanner.Text()
+ parts := strings.Split(line, ":")
+ if len(parts) != 3 {
+ return errors.Errorf("cannot parse cgroup line %q", line)
+ }
+
+ // root cgroup, skip it
+ if parts[2] == "/" {
+ continue
+ }
+
+ cgroupRoot := "/sys/fs/cgroup"
+ // Special case the unified mount on hybrid cgroup and named hierarchies.
+ // This works on Fedora 31, but we should really parse the mounts to see
+ // where the cgroup hierarchy is mounted.
+ if parts[1] == "" && !unifiedMode {
+ // If it is not using unified mode, the cgroup v2 hierarchy is
+ // usually mounted under /sys/fs/cgroup/unified
+ cgroupRoot = filepath.Join(cgroupRoot, "unified")
+ } else if parts[1] != "" {
+ // Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER.
+ controller := strings.TrimPrefix(parts[1], "name=")
+ cgroupRoot = filepath.Join(cgroupRoot, controller)
+ }
+
+ processes, err := ioutil.ReadFile(filepath.Join(cgroupRoot, parts[2], "cgroup.procs"))
+ if err != nil {
+ return err
+ }
+
+ newCgroup := filepath.Join(cgroupRoot, parts[2], subtree)
+ if err := os.Mkdir(newCgroup, 0755); err != nil {
+ return err
+ }
+
+ f, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0755)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ for _, pid := range bytes.Split(processes, []byte("\n")) {
+ if _, err := f.Write(pid); err != nil {
+ logrus.Warnf("Cannot move process %s to cgroup %q", pid, newCgroup)
+ }
+ }
+ }
+ return nil
+}
+
func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
diff --git a/utils/utils_windows.go b/utils/utils_windows.go
index db27877d9..1a2196029 100644
--- a/utils/utils_windows.go
+++ b/utils/utils_windows.go
@@ -7,3 +7,15 @@ import "github.com/pkg/errors"
func RunUnderSystemdScope(pid int, slice string, unitName string) error {
return errors.New("not implemented for windows")
}
+
+func MoveUnderCgroupSubtree(subtree string) error {
+ return errors.New("not implemented for windows")
+}
+
+func GetOwnCgroup() (string, error) {
+ return "", errors.New("not implemented for windows")
+}
+
+func GetCgroupProcess(pid int) (string, error) {
+ return "", errors.New("not implemented for windows")
+}