summaryrefslogtreecommitdiff
path: root/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd
diff options
context:
space:
mode:
authorMatthew Heon <matthew.heon@gmail.com>2017-11-01 11:24:59 -0400
committerMatthew Heon <matthew.heon@gmail.com>2017-11-01 11:24:59 -0400
commita031b83a09a8628435317a03f199cdc18b78262f (patch)
treebc017a96769ce6de33745b8b0b1304ccf38e9df0 /vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd
parent2b74391cd5281f6fdf391ff8ad50fd1490f6bf89 (diff)
downloadpodman-a031b83a09a8628435317a03f199cdc18b78262f.tar.gz
podman-a031b83a09a8628435317a03f199cdc18b78262f.tar.bz2
podman-a031b83a09a8628435317a03f199cdc18b78262f.zip
Initial checkin from CRI-O repo
Signed-off-by: Matthew Heon <matthew.heon@gmail.com>
Diffstat (limited to 'vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd')
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go55
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go553
2 files changed, 608 insertions, 0 deletions
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go
new file mode 100644
index 000000000..7de9ae605
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go
@@ -0,0 +1,55 @@
+// +build !linux
+
+package systemd
+
+import (
+ "fmt"
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+type Manager struct {
+ Cgroups *configs.Cgroup
+ Paths map[string]string
+}
+
+func UseSystemd() bool {
+ return false
+}
+
+func (m *Manager) Apply(pid int) error {
+ return fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) GetPids() ([]int, error) {
+ return nil, fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) GetAllPids() ([]int, error) {
+ return nil, fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) Destroy() error {
+ return fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) GetPaths() map[string]string {
+ return nil
+}
+
+func (m *Manager) GetStats() (*cgroups.Stats, error) {
+ return nil, fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) Set(container *configs.Config) error {
+ return nil, fmt.Errorf("Systemd not supported")
+}
+
+func (m *Manager) Freeze(state configs.FreezerState) error {
+ return fmt.Errorf("Systemd not supported")
+}
+
+func Freeze(c *configs.Cgroup, state configs.FreezerState) error {
+ return fmt.Errorf("Systemd not supported")
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go
new file mode 100644
index 000000000..b010b4b32
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go
@@ -0,0 +1,553 @@
+// +build linux
+
+package systemd
+
+import (
+ "errors"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+ "sync"
+ "time"
+
+ systemdDbus "github.com/coreos/go-systemd/dbus"
+ systemdUtil "github.com/coreos/go-systemd/util"
+ "github.com/godbus/dbus"
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/cgroups/fs"
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+type Manager struct {
+ mu sync.Mutex
+ Cgroups *configs.Cgroup
+ Paths map[string]string
+}
+
+type subsystem interface {
+ // Name returns the name of the subsystem.
+ Name() string
+ // Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
+ GetStats(path string, stats *cgroups.Stats) error
+ // Set the cgroup represented by cgroup.
+ Set(path string, cgroup *configs.Cgroup) error
+}
+
+var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
+
+type subsystemSet []subsystem
+
+func (s subsystemSet) Get(name string) (subsystem, error) {
+ for _, ss := range s {
+ if ss.Name() == name {
+ return ss, nil
+ }
+ }
+ return nil, errSubsystemDoesNotExist
+}
+
+var subsystems = subsystemSet{
+ &fs.CpusetGroup{},
+ &fs.DevicesGroup{},
+ &fs.MemoryGroup{},
+ &fs.CpuGroup{},
+ &fs.CpuacctGroup{},
+ &fs.PidsGroup{},
+ &fs.BlkioGroup{},
+ &fs.HugetlbGroup{},
+ &fs.PerfEventGroup{},
+ &fs.FreezerGroup{},
+ &fs.NetPrioGroup{},
+ &fs.NetClsGroup{},
+ &fs.NameGroup{GroupName: "name=systemd"},
+}
+
+const (
+ testScopeWait = 4
+ testSliceWait = 4
+)
+
+var (
+ connLock sync.Mutex
+ theConn *systemdDbus.Conn
+ hasStartTransientUnit bool
+ hasStartTransientSliceUnit bool
+ hasTransientDefaultDependencies bool
+ hasDelegate bool
+)
+
+func newProp(name string, units interface{}) systemdDbus.Property {
+ return systemdDbus.Property{
+ Name: name,
+ Value: dbus.MakeVariant(units),
+ }
+}
+
+func UseSystemd() bool {
+ if !systemdUtil.IsRunningSystemd() {
+ return false
+ }
+
+ connLock.Lock()
+ defer connLock.Unlock()
+
+ if theConn == nil {
+ var err error
+ theConn, err = systemdDbus.New()
+ if err != nil {
+ return false
+ }
+
+ // Assume we have StartTransientUnit
+ hasStartTransientUnit = true
+
+ // But if we get UnknownMethod error we don't
+ if _, err := theConn.StartTransientUnit("test.scope", "invalid", nil, nil); err != nil {
+ if dbusError, ok := err.(dbus.Error); ok {
+ if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" {
+ hasStartTransientUnit = false
+ return hasStartTransientUnit
+ }
+ }
+ }
+
+ // Ensure the scope name we use doesn't exist. Use the Pid to
+ // avoid collisions between multiple libcontainer users on a
+ // single host.
+ scope := fmt.Sprintf("libcontainer-%d-systemd-test-default-dependencies.scope", os.Getpid())
+ testScopeExists := true
+ for i := 0; i <= testScopeWait; i++ {
+ if _, err := theConn.StopUnit(scope, "replace", nil); err != nil {
+ if dbusError, ok := err.(dbus.Error); ok {
+ if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
+ testScopeExists = false
+ break
+ }
+ }
+ }
+ time.Sleep(time.Millisecond)
+ }
+
+ // Bail out if we can't kill this scope without testing for DefaultDependencies
+ if testScopeExists {
+ return hasStartTransientUnit
+ }
+
+ // Assume StartTransientUnit on a scope allows DefaultDependencies
+ hasTransientDefaultDependencies = true
+ ddf := newProp("DefaultDependencies", false)
+ if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{ddf}, nil); err != nil {
+ if dbusError, ok := err.(dbus.Error); ok {
+ if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
+ hasTransientDefaultDependencies = false
+ }
+ }
+ }
+
+ // Not critical because of the stop unit logic above.
+ theConn.StopUnit(scope, "replace", nil)
+
+ // Assume StartTransientUnit on a scope allows Delegate
+ hasDelegate = true
+ dl := newProp("Delegate", true)
+ if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil {
+ if dbusError, ok := err.(dbus.Error); ok {
+ if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
+ hasDelegate = false
+ }
+ }
+ }
+
+ // Assume we have the ability to start a transient unit as a slice
+ // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
+ // For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299
+ hasStartTransientSliceUnit = true
+
+ // To ensure simple clean-up, we create a slice off the root with no hierarchy
+ slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid())
+ if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil {
+ if _, ok := err.(dbus.Error); ok {
+ hasStartTransientSliceUnit = false
+ }
+ }
+
+ for i := 0; i <= testSliceWait; i++ {
+ if _, err := theConn.StopUnit(slice, "replace", nil); err != nil {
+ if dbusError, ok := err.(dbus.Error); ok {
+ if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
+ hasStartTransientSliceUnit = false
+ break
+ }
+ }
+ } else {
+ break
+ }
+ time.Sleep(time.Millisecond)
+ }
+
+ // Not critical because of the stop unit logic above.
+ theConn.StopUnit(scope, "replace", nil)
+ theConn.StopUnit(slice, "replace", nil)
+ }
+ return hasStartTransientUnit
+}
+
+func (m *Manager) Apply(pid int) error {
+ var (
+ c = m.Cgroups
+ unitName = getUnitName(c)
+ slice = "system.slice"
+ properties []systemdDbus.Property
+ )
+
+ if c.Paths != nil {
+ paths := make(map[string]string)
+ for name, path := range c.Paths {
+ _, err := getSubsystemPath(m.Cgroups, name)
+ if err != nil {
+ // Don't fail if a cgroup hierarchy was not found, just skip this subsystem
+ if cgroups.IsNotFound(err) {
+ continue
+ }
+ return err
+ }
+ paths[name] = path
+ }
+ m.Paths = paths
+ return cgroups.EnterPid(m.Paths, pid)
+ }
+
+ if c.Parent != "" {
+ slice = c.Parent
+ }
+
+ properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
+
+ // if we create a slice, the parent is defined via a Wants=
+ if strings.HasSuffix(unitName, ".slice") {
+ // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
+ if !hasStartTransientSliceUnit {
+ return fmt.Errorf("systemd version does not support ability to start a slice as transient unit")
+ }
+ properties = append(properties, systemdDbus.PropWants(slice))
+ } else {
+ // otherwise, we use Slice=
+ properties = append(properties, systemdDbus.PropSlice(slice))
+ }
+
+ // only add pid if its valid, -1 is used w/ general slice creation.
+ if pid != -1 {
+ properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
+ }
+
+ if hasDelegate {
+ // This is only supported on systemd versions 218 and above.
+ properties = append(properties, newProp("Delegate", true))
+ }
+
+ // Always enable accounting, this gets us the same behaviour as the fs implementation,
+ // plus the kernel has some problems with joining the memory cgroup at a later time.
+ properties = append(properties,
+ newProp("MemoryAccounting", true),
+ newProp("CPUAccounting", true),
+ newProp("BlockIOAccounting", true))
+
+ if hasTransientDefaultDependencies {
+ properties = append(properties,
+ newProp("DefaultDependencies", false))
+ }
+
+ if c.Resources.Memory != 0 {
+ properties = append(properties,
+ newProp("MemoryLimit", c.Resources.Memory))
+ }
+
+ if c.Resources.CpuShares != 0 {
+ properties = append(properties,
+ newProp("CPUShares", c.Resources.CpuShares))
+ }
+
+ // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
+ if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
+ cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
+ properties = append(properties,
+ newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
+ }
+
+ if c.Resources.BlkioWeight != 0 {
+ properties = append(properties,
+ newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
+ }
+
+ // We have to set kernel memory here, as we can't change it once
+ // processes have been attached to the cgroup.
+ if c.Resources.KernelMemory != 0 {
+ if err := setKernelMemory(c); err != nil {
+ return err
+ }
+ }
+
+ if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil && !isUnitExists(err) {
+ return err
+ }
+
+ if err := joinCgroups(c, pid); err != nil {
+ return err
+ }
+
+ paths := make(map[string]string)
+ for _, s := range subsystems {
+ subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name())
+ if err != nil {
+ // Don't fail if a cgroup hierarchy was not found, just skip this subsystem
+ if cgroups.IsNotFound(err) {
+ continue
+ }
+ return err
+ }
+ paths[s.Name()] = subsystemPath
+ }
+ m.Paths = paths
+ return nil
+}
+
+func (m *Manager) Destroy() error {
+ if m.Cgroups.Paths != nil {
+ return nil
+ }
+ m.mu.Lock()
+ defer m.mu.Unlock()
+ theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
+ if err := cgroups.RemovePaths(m.Paths); err != nil {
+ return err
+ }
+ m.Paths = make(map[string]string)
+ return nil
+}
+
+func (m *Manager) GetPaths() map[string]string {
+ m.mu.Lock()
+ paths := m.Paths
+ m.mu.Unlock()
+ return paths
+}
+
+func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
+ path, err := getSubsystemPath(c, subsystem)
+ if err != nil {
+ return "", err
+ }
+ if err := os.MkdirAll(path, 0755); err != nil {
+ return "", err
+ }
+ if err := cgroups.WriteCgroupProc(path, pid); err != nil {
+ return "", err
+ }
+ return path, nil
+}
+
+func joinCgroups(c *configs.Cgroup, pid int) error {
+ for _, sys := range subsystems {
+ name := sys.Name()
+ switch name {
+ case "name=systemd":
+ // let systemd handle this
+ case "cpuset":
+ path, err := getSubsystemPath(c, name)
+ if err != nil && !cgroups.IsNotFound(err) {
+ return err
+ }
+ s := &fs.CpusetGroup{}
+ if err := s.ApplyDir(path, c, pid); err != nil {
+ return err
+ }
+ default:
+ _, err := join(c, name, pid)
+ if err != nil {
+ // Even if it's `not found` error, we'll return err
+ // because devices cgroup is hard requirement for
+ // container security.
+ if name == "devices" {
+ return err
+ }
+ // For other subsystems, omit the `not found` error
+ // because they are optional.
+ if !cgroups.IsNotFound(err) {
+ return err
+ }
+ }
+ }
+ }
+
+ return nil
+}
+
+// systemd represents slice hierarchy using `-`, so we need to follow suit when
+// generating the path of slice. Essentially, test-a-b.slice becomes
+// test.slice/test-a.slice/test-a-b.slice.
+func ExpandSlice(slice string) (string, error) {
+ suffix := ".slice"
+ // Name has to end with ".slice", but can't be just ".slice".
+ if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
+ return "", fmt.Errorf("invalid slice name: %s", slice)
+ }
+
+ // Path-separators are not allowed.
+ if strings.Contains(slice, "/") {
+ return "", fmt.Errorf("invalid slice name: %s", slice)
+ }
+
+ var path, prefix string
+ sliceName := strings.TrimSuffix(slice, suffix)
+ // if input was -.slice, we should just return root now
+ if sliceName == "-" {
+ return "/", nil
+ }
+ for _, component := range strings.Split(sliceName, "-") {
+ // test--a.slice isn't permitted, nor is -test.slice.
+ if component == "" {
+ return "", fmt.Errorf("invalid slice name: %s", slice)
+ }
+
+ // Append the component to the path and to the prefix.
+ path += prefix + component + suffix + "/"
+ prefix += component + "-"
+ }
+
+ return path, nil
+}
+
+func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
+ mountpoint, err := cgroups.FindCgroupMountpoint(subsystem)
+ if err != nil {
+ return "", err
+ }
+
+ initPath, err := cgroups.GetInitCgroup(subsystem)
+ if err != nil {
+ return "", err
+ }
+ // if pid 1 is systemd 226 or later, it will be in init.scope, not the root
+ initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
+
+ slice := "system.slice"
+ if c.Parent != "" {
+ slice = c.Parent
+ }
+
+ slice, err = ExpandSlice(slice)
+ if err != nil {
+ return "", err
+ }
+
+ return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
+}
+
+func (m *Manager) Freeze(state configs.FreezerState) error {
+ path, err := getSubsystemPath(m.Cgroups, "freezer")
+ if err != nil {
+ return err
+ }
+ prevState := m.Cgroups.Resources.Freezer
+ m.Cgroups.Resources.Freezer = state
+ freezer, err := subsystems.Get("freezer")
+ if err != nil {
+ return err
+ }
+ err = freezer.Set(path, m.Cgroups)
+ if err != nil {
+ m.Cgroups.Resources.Freezer = prevState
+ return err
+ }
+ return nil
+}
+
+func (m *Manager) GetPids() ([]int, error) {
+ path, err := getSubsystemPath(m.Cgroups, "devices")
+ if err != nil {
+ return nil, err
+ }
+ return cgroups.GetPids(path)
+}
+
+func (m *Manager) GetAllPids() ([]int, error) {
+ path, err := getSubsystemPath(m.Cgroups, "devices")
+ if err != nil {
+ return nil, err
+ }
+ return cgroups.GetAllPids(path)
+}
+
+func (m *Manager) GetStats() (*cgroups.Stats, error) {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+ stats := cgroups.NewStats()
+ for name, path := range m.Paths {
+ sys, err := subsystems.Get(name)
+ if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
+ continue
+ }
+ if err := sys.GetStats(path, stats); err != nil {
+ return nil, err
+ }
+ }
+
+ return stats, nil
+}
+
+func (m *Manager) Set(container *configs.Config) error {
+ // If Paths are set, then we are just joining cgroups paths
+ // and there is no need to set any values.
+ if m.Cgroups.Paths != nil {
+ return nil
+ }
+ for _, sys := range subsystems {
+ // Get the subsystem path, but don't error out for not found cgroups.
+ path, err := getSubsystemPath(container.Cgroups, sys.Name())
+ if err != nil && !cgroups.IsNotFound(err) {
+ return err
+ }
+
+ if err := sys.Set(path, container.Cgroups); err != nil {
+ return err
+ }
+ }
+
+ if m.Paths["cpu"] != "" {
+ if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func getUnitName(c *configs.Cgroup) string {
+ // by default, we create a scope unless the user explicitly asks for a slice.
+ if !strings.HasSuffix(c.Name, ".slice") {
+ return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
+ }
+ return c.Name
+}
+
+func setKernelMemory(c *configs.Cgroup) error {
+ path, err := getSubsystemPath(c, "memory")
+ if err != nil && !cgroups.IsNotFound(err) {
+ return err
+ }
+
+ if err := os.MkdirAll(path, 0755); err != nil {
+ return err
+ }
+ return fs.EnableKernelMemoryAccounting(path)
+}
+
+// isUnitExists returns true if the error is that a systemd unit already exists.
+func isUnitExists(err error) bool {
+ if err != nil {
+ if dbusError, ok := err.(dbus.Error); ok {
+ return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
+ }
+ }
+ return false
+}