summaryrefslogtreecommitdiff
path: root/libpod
diff options
context:
space:
mode:
authorcdoern <cdoern@redhat.com>2022-06-13 15:35:16 -0400
committerCharlie Doern <cdoern@redhat.com>2022-06-24 15:39:15 -0400
commit2792e598c7ce1198ec8464a3119504123ae8397c (patch)
tree0d8a1ca5428822278a43cb990308a9f960e08e1e /libpod
parent95707a08bf49141ceb782b28adc947dda213f300 (diff)
downloadpodman-2792e598c7ce1198ec8464a3119504123ae8397c.tar.gz
podman-2792e598c7ce1198ec8464a3119504123ae8397c.tar.bz2
podman-2792e598c7ce1198ec8464a3119504123ae8397c.zip
podman cgroup enhancement
currently, setting any sort of resource limit in a pod does nothing. With the newly refactored creation process in c/common, podman ca now set resources at a pod level meaning that resource related flags can now be exposed to podman pod create. cgroupfs and systemd are both supported with varying completion. cgroupfs is a much simpler process and one that is virtually complete for all resource types, the flags now just need to be added. systemd on the other hand has to be handeled via the dbus api meaning that the limits need to be passed as recognized properties to systemd. The properties added so far are the ones that podman pod create supports as well as `cpuset-mems` as this will be the next flag I work on. Signed-off-by: Charlie Doern <cdoern@redhat.com>
Diffstat (limited to 'libpod')
-rw-r--r--libpod/container_config.go1
-rw-r--r--libpod/container_internal_linux.go1
-rw-r--r--libpod/oci_conmon_linux.go201
-rw-r--r--libpod/pod_internal.go2
-rw-r--r--libpod/runtime_pod_linux.go30
-rw-r--r--libpod/stats.go24
-rw-r--r--libpod/util_linux.go27
7 files changed, 255 insertions, 31 deletions
diff --git a/libpod/container_config.go b/libpod/container_config.go
index 45ff03d58..544c45a8c 100644
--- a/libpod/container_config.go
+++ b/libpod/container_config.go
@@ -424,7 +424,6 @@ type InfraInherit struct {
CapDrop []string `json:"cap_drop,omitempty"`
HostDeviceList []spec.LinuxDevice `json:"host_device_list,omitempty"`
ImageVolumes []*specgen.ImageVolume `json:"image_volumes,omitempty"`
- InfraResources *spec.LinuxResources `json:"resource_limits,omitempty"`
Mounts []spec.Mount `json:"mounts,omitempty"`
NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
OverlayVolumes []*specgen.OverlayVolume `json:"overlay_volumes,omitempty"`
diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go
index 77b598b16..10f4eeec1 100644
--- a/libpod/container_internal_linux.go
+++ b/libpod/container_internal_linux.go
@@ -870,6 +870,7 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) {
if err != nil {
return nil, err
}
+
g.SetLinuxCgroupsPath(cgroupPath)
// Warning: CDI may alter g.Config in place.
diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go
index fde8624b0..d34ff7cea 100644
--- a/libpod/oci_conmon_linux.go
+++ b/libpod/oci_conmon_linux.go
@@ -23,6 +23,9 @@ import (
"text/template"
"time"
+ runcconfig "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/devices"
+
"github.com/containers/common/pkg/cgroups"
"github.com/containers/common/pkg/config"
conmonConfig "github.com/containers/conmon/runner/config"
@@ -1451,9 +1454,14 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec
// TODO: This should be a switch - we are not guaranteed that
// there are only 2 valid cgroup managers
cgroupParent := ctr.CgroupParent()
+ cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon")
+ Resource := ctr.Spec().Linux.Resources
+ cgroupResources, err := GetLimits(Resource)
+ if err != nil {
+ logrus.StandardLogger().Log(logLevel, "Could not get ctr resources")
+ }
if ctr.CgroupManager() == config.SystemdCgroupsManager {
unitName := createUnitName("libpod-conmon", ctr.ID())
-
realCgroupParent := cgroupParent
splitParent := strings.Split(cgroupParent, "/")
if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 {
@@ -1465,8 +1473,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec
logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err)
}
} else {
- cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon")
- control, err := cgroups.New(cgroupPath, &spec.LinuxResources{})
+ control, err := cgroups.New(cgroupPath, &cgroupResources)
if err != nil {
logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
} else if err := control.AddPid(cmd.Process.Pid); err != nil {
@@ -1748,3 +1755,191 @@ func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter,
}
}
}
+
+// GetLimits converts spec resource limits to cgroup consumable limits
+func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) {
+ if resource == nil {
+ resource = &spec.LinuxResources{}
+ }
+ final := &runcconfig.Resources{}
+ devs := []*devices.Rule{}
+
+ // Devices
+ for _, entry := range resource.Devices {
+ if entry.Major == nil || entry.Minor == nil {
+ continue
+ }
+ runeType := 'a'
+ switch entry.Type {
+ case "b":
+ runeType = 'b'
+ case "c":
+ runeType = 'c'
+ }
+
+ devs = append(devs, &devices.Rule{
+ Type: devices.Type(runeType),
+ Major: *entry.Major,
+ Minor: *entry.Minor,
+ Permissions: devices.Permissions(entry.Access),
+ Allow: entry.Allow,
+ })
+ }
+ final.Devices = devs
+
+ // HugepageLimits
+ pageLimits := []*runcconfig.HugepageLimit{}
+ for _, entry := range resource.HugepageLimits {
+ pageLimits = append(pageLimits, &runcconfig.HugepageLimit{
+ Pagesize: entry.Pagesize,
+ Limit: entry.Limit,
+ })
+ }
+ final.HugetlbLimit = pageLimits
+
+ // Networking
+ netPriorities := []*runcconfig.IfPrioMap{}
+ if resource.Network != nil {
+ for _, entry := range resource.Network.Priorities {
+ netPriorities = append(netPriorities, &runcconfig.IfPrioMap{
+ Interface: entry.Name,
+ Priority: int64(entry.Priority),
+ })
+ }
+ }
+ final.NetPrioIfpriomap = netPriorities
+ rdma := make(map[string]runcconfig.LinuxRdma)
+ for name, entry := range resource.Rdma {
+ rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects}
+ }
+ final.Rdma = rdma
+
+ // Memory
+ if resource.Memory != nil {
+ if resource.Memory.Limit != nil {
+ final.Memory = *resource.Memory.Limit
+ }
+ if resource.Memory.Reservation != nil {
+ final.MemoryReservation = *resource.Memory.Reservation
+ }
+ if resource.Memory.Swap != nil {
+ final.MemorySwap = *resource.Memory.Swap
+ }
+ if resource.Memory.Swappiness != nil {
+ final.MemorySwappiness = resource.Memory.Swappiness
+ }
+ }
+
+ // CPU
+ if resource.CPU != nil {
+ if resource.CPU.Period != nil {
+ final.CpuPeriod = *resource.CPU.Period
+ }
+ if resource.CPU.Quota != nil {
+ final.CpuQuota = *resource.CPU.Quota
+ }
+ if resource.CPU.RealtimePeriod != nil {
+ final.CpuRtPeriod = *resource.CPU.RealtimePeriod
+ }
+ if resource.CPU.RealtimeRuntime != nil {
+ final.CpuRtRuntime = *resource.CPU.RealtimeRuntime
+ }
+ if resource.CPU.Shares != nil {
+ final.CpuShares = *resource.CPU.Shares
+ }
+ final.CpusetCpus = resource.CPU.Cpus
+ final.CpusetMems = resource.CPU.Mems
+ }
+
+ // BlkIO
+ if resource.BlockIO != nil {
+ if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 {
+ for _, entry := range resource.BlockIO.ThrottleReadBpsDevice {
+ throttle := &runcconfig.ThrottleDevice{}
+ dev := &runcconfig.BlockIODevice{
+ Major: entry.Major,
+ Minor: entry.Minor,
+ }
+ throttle.BlockIODevice = *dev
+ throttle.Rate = entry.Rate
+ final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle)
+ }
+ }
+ if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 {
+ for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice {
+ throttle := &runcconfig.ThrottleDevice{}
+ dev := &runcconfig.BlockIODevice{
+ Major: entry.Major,
+ Minor: entry.Minor,
+ }
+ throttle.BlockIODevice = *dev
+ throttle.Rate = entry.Rate
+ final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle)
+ }
+ }
+ if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 {
+ for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice {
+ throttle := &runcconfig.ThrottleDevice{}
+ dev := &runcconfig.BlockIODevice{
+ Major: entry.Major,
+ Minor: entry.Minor,
+ }
+ throttle.BlockIODevice = *dev
+ throttle.Rate = entry.Rate
+ final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle)
+ }
+ }
+ if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 {
+ for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice {
+ throttle := &runcconfig.ThrottleDevice{}
+ dev := &runcconfig.BlockIODevice{
+ Major: entry.Major,
+ Minor: entry.Minor,
+ }
+ throttle.BlockIODevice = *dev
+ throttle.Rate = entry.Rate
+ final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle)
+ }
+ }
+ if resource.BlockIO.LeafWeight != nil {
+ final.BlkioLeafWeight = *resource.BlockIO.LeafWeight
+ }
+ if resource.BlockIO.Weight != nil {
+ final.BlkioWeight = *resource.BlockIO.Weight
+ }
+ if len(resource.BlockIO.WeightDevice) > 0 {
+ for _, entry := range resource.BlockIO.WeightDevice {
+ weight := &runcconfig.WeightDevice{}
+ dev := &runcconfig.BlockIODevice{
+ Major: entry.Major,
+ Minor: entry.Minor,
+ }
+ if entry.Weight != nil {
+ weight.Weight = *entry.Weight
+ }
+ if entry.LeafWeight != nil {
+ weight.LeafWeight = *entry.LeafWeight
+ }
+ weight.BlockIODevice = *dev
+ final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight)
+ }
+ }
+ }
+
+ // Pids
+ if resource.Pids != nil {
+ final.PidsLimit = resource.Pids.Limit
+ }
+
+ // Networking
+ if resource.Network != nil {
+ if resource.Network.ClassID != nil {
+ final.NetClsClassid = *resource.Network.ClassID
+ }
+ }
+
+ // Unified state
+ final.Unified = resource.Unified
+
+ return *final, nil
+}
diff --git a/libpod/pod_internal.go b/libpod/pod_internal.go
index 41f745e6c..1502bcb06 100644
--- a/libpod/pod_internal.go
+++ b/libpod/pod_internal.go
@@ -69,7 +69,7 @@ func (p *Pod) refresh() error {
if p.config.UsePodCgroup {
switch p.runtime.config.Engine.CgroupManager {
case config.SystemdCgroupsManager:
- cgroupPath, err := systemdSliceFromPath(p.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", p.ID()))
+ cgroupPath, err := systemdSliceFromPath(p.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", p.ID()), p.ResourceLim())
if err != nil {
logrus.Errorf("Creating Cgroup for pod %s: %v", p.ID(), err)
}
diff --git a/libpod/runtime_pod_linux.go b/libpod/runtime_pod_linux.go
index dcc3a044f..d75ac2971 100644
--- a/libpod/runtime_pod_linux.go
+++ b/libpod/runtime_pod_linux.go
@@ -17,7 +17,7 @@ import (
"github.com/containers/podman/v4/libpod/events"
"github.com/containers/podman/v4/pkg/rootless"
"github.com/containers/podman/v4/pkg/specgen"
- spec "github.com/opencontainers/runtime-spec/specs-go"
+ runcconfig "github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
@@ -66,6 +66,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option
case config.CgroupfsCgroupsManager:
canUseCgroup := !rootless.IsRootless() || isRootlessCgroupSet(pod.config.CgroupParent)
if canUseCgroup {
+ // need to actually create parent here
if pod.config.CgroupParent == "" {
pod.config.CgroupParent = CgroupfsDefaultCgroupParent
} else if strings.HasSuffix(path.Base(pod.config.CgroupParent), ".slice") {
@@ -73,12 +74,26 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option
}
// If we are set to use pod cgroups, set the cgroup parent that
// all containers in the pod will share
- // No need to create it with cgroupfs - the first container to
- // launch should do it for us
if pod.config.UsePodCgroup {
pod.state.CgroupPath = filepath.Join(pod.config.CgroupParent, pod.ID())
if p.InfraContainerSpec != nil {
p.InfraContainerSpec.CgroupParent = pod.state.CgroupPath
+ res, err := GetLimits(p.InfraContainerSpec.ResourceLimits)
+ if err != nil {
+ return nil, err
+ }
+ // Need to both create and update the cgroup
+ // rather than create a new path in c/common for pod cgroup creation
+ // just create as if it is a ctr and then update figures out that we need to
+ // populate the resource limits on the pod level
+ cgc, err := cgroups.New(pod.state.CgroupPath, &res)
+ if err != nil {
+ return nil, err
+ }
+ err = cgc.Update(&res)
+ if err != nil {
+ return nil, err
+ }
}
}
}
@@ -95,7 +110,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option
// If we are set to use pod cgroups, set the cgroup parent that
// all containers in the pod will share
if pod.config.UsePodCgroup {
- cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()))
+ cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()), p.InfraContainerSpec.ResourceLimits)
if err != nil {
return nil, errors.Wrapf(err, "unable to create pod cgroup for pod %s", pod.ID())
}
@@ -239,9 +254,8 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool,
}
// New resource limits
- resLimits := new(spec.LinuxResources)
- resLimits.Pids = new(spec.LinuxPids)
- resLimits.Pids.Limit = 1 // Inhibit forks with very low pids limit
+ resLimits := new(runcconfig.Resources)
+ resLimits.PidsLimit = 1 // Inhibit forks with very low pids limit
// Don't try if we failed to retrieve the cgroup
if err == nil {
@@ -321,7 +335,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool,
switch p.runtime.config.Engine.CgroupManager {
case config.SystemdCgroupsManager:
- if err := deleteSystemdCgroup(p.state.CgroupPath); err != nil {
+ if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil {
if removalErr == nil {
removalErr = errors.Wrapf(err, "error removing pod %s cgroup", p.ID())
} else {
diff --git a/libpod/stats.go b/libpod/stats.go
index d2ffc3b32..eaac9d7d0 100644
--- a/libpod/stats.go
+++ b/libpod/stats.go
@@ -9,6 +9,8 @@ import (
"syscall"
"time"
+ runccgroup "github.com/opencontainers/runc/libcontainer/cgroups"
+
"github.com/containers/common/pkg/cgroups"
"github.com/containers/podman/v4/libpod/define"
"github.com/pkg/errors"
@@ -69,29 +71,29 @@ func (c *Container) GetContainerStats(previousStats *define.ContainerStats) (*de
// If the current total usage in the cgroup is less than what was previously
// recorded then it means the container was restarted and runs in a new cgroup
- if previousStats.Duration > cgroupStats.CPU.Usage.Total {
+ if previousStats.Duration > cgroupStats.CpuStats.CpuUsage.TotalUsage {
previousStats = &define.ContainerStats{}
}
previousCPU := previousStats.CPUNano
now := uint64(time.Now().UnixNano())
- stats.Duration = cgroupStats.CPU.Usage.Total
+ stats.Duration = cgroupStats.CpuStats.CpuUsage.TotalUsage
stats.UpTime = time.Duration(stats.Duration)
stats.CPU = calculateCPUPercent(cgroupStats, previousCPU, now, previousStats.SystemNano)
// calc the average cpu usage for the time the container is running
stats.AvgCPU = calculateCPUPercent(cgroupStats, 0, now, uint64(c.state.StartedTime.UnixNano()))
- stats.MemUsage = cgroupStats.Memory.Usage.Usage
+ stats.MemUsage = cgroupStats.MemoryStats.Usage.Usage
stats.MemLimit = c.getMemLimit()
stats.MemPerc = (float64(stats.MemUsage) / float64(stats.MemLimit)) * 100
stats.PIDs = 0
if conState == define.ContainerStateRunning || conState == define.ContainerStatePaused {
- stats.PIDs = cgroupStats.Pids.Current
+ stats.PIDs = cgroupStats.PidsStats.Current
}
stats.BlockInput, stats.BlockOutput = calculateBlockIO(cgroupStats)
- stats.CPUNano = cgroupStats.CPU.Usage.Total
- stats.CPUSystemNano = cgroupStats.CPU.Usage.Kernel
+ stats.CPUNano = cgroupStats.CpuStats.CpuUsage.TotalUsage
+ stats.CPUSystemNano = cgroupStats.CpuStats.CpuUsage.UsageInKernelmode
stats.SystemNano = now
- stats.PerCPU = cgroupStats.CPU.Usage.PerCPU
+ stats.PerCPU = cgroupStats.CpuStats.CpuUsage.PercpuUsage
// Handle case where the container is not in a network namespace
if netStats != nil {
stats.NetInput = netStats.TxBytes
@@ -133,10 +135,10 @@ func (c *Container) getMemLimit() uint64 {
// previousCPU is the last value of stats.CPU.Usage.Total measured at the time previousSystem.
// (now - previousSystem) is the time delta in nanoseconds, between the measurement in previousCPU
// and the updated value in stats.
-func calculateCPUPercent(stats *cgroups.Metrics, previousCPU, now, previousSystem uint64) float64 {
+func calculateCPUPercent(stats *runccgroup.Stats, previousCPU, now, previousSystem uint64) float64 {
var (
cpuPercent = 0.0
- cpuDelta = float64(stats.CPU.Usage.Total - previousCPU)
+ cpuDelta = float64(stats.CpuStats.CpuUsage.TotalUsage - previousCPU)
systemDelta = float64(now - previousSystem)
)
if systemDelta > 0.0 && cpuDelta > 0.0 {
@@ -146,8 +148,8 @@ func calculateCPUPercent(stats *cgroups.Metrics, previousCPU, now, previousSyste
return cpuPercent
}
-func calculateBlockIO(stats *cgroups.Metrics) (read uint64, write uint64) {
- for _, blkIOEntry := range stats.Blkio.IoServiceBytesRecursive {
+func calculateBlockIO(stats *runccgroup.Stats) (read uint64, write uint64) {
+ for _, blkIOEntry := range stats.BlkioStats.IoServiceBytesRecursive {
switch strings.ToLower(blkIOEntry.Op) {
case "read":
read += blkIOEntry.Value
diff --git a/libpod/util_linux.go b/libpod/util_linux.go
index fe98056dc..414d1bff9 100644
--- a/libpod/util_linux.go
+++ b/libpod/util_linux.go
@@ -11,6 +11,7 @@ import (
"github.com/containers/common/pkg/cgroups"
"github.com/containers/podman/v4/libpod/define"
"github.com/containers/podman/v4/pkg/rootless"
+ spec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
@@ -20,7 +21,7 @@ import (
// systemdSliceFromPath makes a new systemd slice under the given parent with
// the given name.
// The parent must be a slice. The name must NOT include ".slice"
-func systemdSliceFromPath(parent, name string) (string, error) {
+func systemdSliceFromPath(parent, name string, resources *spec.LinuxResources) (string, error) {
cgroupPath, err := assembleSystemdCgroupName(parent, name)
if err != nil {
return "", err
@@ -28,7 +29,7 @@ func systemdSliceFromPath(parent, name string) (string, error) {
logrus.Debugf("Created cgroup path %s for parent %s and name %s", cgroupPath, parent, name)
- if err := makeSystemdCgroup(cgroupPath); err != nil {
+ if err := makeSystemdCgroup(cgroupPath, resources); err != nil {
return "", errors.Wrapf(err, "error creating cgroup %s", cgroupPath)
}
@@ -45,8 +46,12 @@ func getDefaultSystemdCgroup() string {
}
// makeSystemdCgroup creates a systemd Cgroup at the given location.
-func makeSystemdCgroup(path string) error {
- controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup())
+func makeSystemdCgroup(path string, resources *spec.LinuxResources) error {
+ res, err := GetLimits(resources)
+ if err != nil {
+ return err
+ }
+ controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup(), &res)
if err != nil {
return err
}
@@ -54,12 +59,20 @@ func makeSystemdCgroup(path string) error {
if rootless.IsRootless() {
return controller.CreateSystemdUserUnit(path, rootless.GetRootlessUID())
}
- return controller.CreateSystemdUnit(path)
+ err = controller.CreateSystemdUnit(path)
+ if err != nil {
+ return err
+ }
+ return nil
}
// deleteSystemdCgroup deletes the systemd cgroup at the given location
-func deleteSystemdCgroup(path string) error {
- controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup())
+func deleteSystemdCgroup(path string, resources *spec.LinuxResources) error {
+ res, err := GetLimits(resources)
+ if err != nil {
+ return err
+ }
+ controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup(), &res)
if err != nil {
return err
}