diff options
author | openshift-ci[bot] <75433959+openshift-ci[bot]@users.noreply.github.com> | 2022-06-27 15:14:20 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-06-27 15:14:20 +0000 |
commit | 088665d2cfd488004f34dbd0804c3cd00bf94ca4 (patch) | |
tree | 0aa7d04edc4009672fcc760aca511b8d10796682 /vendor/github.com/opencontainers/runc/libcontainer | |
parent | 8806606ca2e2060750b3f86c6d31bb50125309de (diff) | |
parent | 2792e598c7ce1198ec8464a3119504123ae8397c (diff) | |
download | podman-088665d2cfd488004f34dbd0804c3cd00bf94ca4.tar.gz podman-088665d2cfd488004f34dbd0804c3cd00bf94ca4.tar.bz2 podman-088665d2cfd488004f34dbd0804c3cd00bf94ca4.zip |
Merge pull request #14654 from cdoern/cgroup
podman cgroup enhancement
Diffstat (limited to 'vendor/github.com/opencontainers/runc/libcontainer')
34 files changed, 3713 insertions, 24 deletions
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go index ba2b2266c..b9ba889b7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go @@ -1,9 +1,24 @@ package cgroups import ( + "errors" + "github.com/opencontainers/runc/libcontainer/configs" ) +var ( + // ErrDevicesUnsupported is an error returned when a cgroup manager + // is not configured to set device rules. + ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules") + + // DevicesSetV1 and DevicesSetV2 are functions to set devices for + // cgroup v1 and v2, respectively. Unless libcontainer/cgroups/devices + // package is imported, it is set to nil, so cgroup managers can't + // manage devices. + DevicesSetV1 func(path string, r *configs.Resources) error + DevicesSetV2 func(path string, r *configs.Resources) error +) + type Manager interface { // Apply creates a cgroup, if not yet created, and adds a process // with the specified pid into that cgroup. A special value of -1 diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go new file mode 100644 index 000000000..c81b6562a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go @@ -0,0 +1,311 @@ +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type BlkioGroup struct { + weightFilename string + weightDeviceFilename string +} + +func (s *BlkioGroup) Name() string { + return "blkio" +} + +func (s *BlkioGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *BlkioGroup) Set(path string, r *configs.Resources) error { + s.detectWeightFilenames(path) + if r.BlkioWeight != 0 { + if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { + return err + } + } + + if r.BlkioLeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil { + return err + } + } + for _, wd := range r.BlkioWeightDevice { + if wd.Weight != 0 { + if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil { + return err + } + } + if wd.LeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + return err + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { + return err + } + } + + return nil +} + +/* +examples: + + blkio.sectors + 8:0 6792 + + blkio.io_service_bytes + 8:0 Read 1282048 + 8:0 Write 2195456 + 8:0 Sync 2195456 + 8:0 Async 1282048 + 8:0 Total 3477504 + Total 3477504 + + blkio.io_serviced + 8:0 Read 124 + 8:0 Write 104 + 8:0 Sync 104 + 8:0 Async 124 + 8:0 Total 228 + Total 228 + + blkio.io_queued + 8:0 Read 0 + 8:0 Write 0 + 8:0 Sync 0 + 8:0 Async 0 + 8:0 Total 0 + Total 0 +*/ + +func splitBlkioStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) { + var blkioStats []cgroups.BlkioStatEntry + f, err := cgroups.OpenFile(dir, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return blkioStats, nil + } + return nil, err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return nil, malformedLine(dir, file, sc.Text()) + } + } + + v, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + major := v + + v, err = strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + minor := v + + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err = strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) + } + if err := sc.Err(); err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + + return blkioStats, nil +} + +func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { + type blkioStatInfo struct { + filename string + blkioStatEntriesPtr *[]cgroups.BlkioStatEntry + } + bfqDebugStats := []blkioStatInfo{ + { + filename: "blkio.bfq.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.bfq.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.bfq.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.bfq.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.bfq.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.bfq.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + bfqStats := []blkioStatInfo{ + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + cfqStats := []blkioStatInfo{ + { + filename: "blkio.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + throttleRecursiveStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + baseStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + orderedStats := [][]blkioStatInfo{ + bfqDebugStats, + bfqStats, + cfqStats, + throttleRecursiveStats, + baseStats, + } + + var blkioStats []cgroups.BlkioStatEntry + var err error + + for _, statGroup := range orderedStats { + for i, statInfo := range statGroup { + if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil { + // if error occurs on first file, move to next group + if i == 0 { + break + } + return err + } + *statInfo.blkioStatEntriesPtr = blkioStats + // finish if all stats are gathered + if i == len(statGroup)-1 { + return nil + } + } + } + return nil +} + +func (s *BlkioGroup) detectWeightFilenames(path string) { + if s.weightFilename != "" { + // Already detected. + return + } + if cgroups.PathExists(filepath.Join(path, "blkio.weight")) { + s.weightFilename = "blkio.weight" + s.weightDeviceFilename = "blkio.weight_device" + } else { + s.weightFilename = "blkio.bfq.weight" + s.weightDeviceFilename = "blkio.bfq.weight_device" + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go new file mode 100644 index 000000000..6c79f899b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -0,0 +1,129 @@ +package fs + +import ( + "bufio" + "errors" + "fmt" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +type CpuGroup struct{} + +func (s *CpuGroup) Name() string { + return "cpu" +} + +func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error { + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + // We should set the real-Time group scheduling settings before moving + // in the process because if the process is already in SCHED_RR mode + // and no RT bandwidth is set, adding it will fail. + if err := s.SetRtSched(path, r); err != nil { + return err + } + // Since we are not using apply(), we need to place the pid + // into the procs file. + return cgroups.WriteCgroupProc(path, pid) +} + +func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error { + if r.CpuRtPeriod != 0 { + if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil { + return err + } + } + if r.CpuRtRuntime != 0 { + if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil { + return err + } + } + return nil +} + +func (s *CpuGroup) Set(path string, r *configs.Resources) error { + if r.CpuShares != 0 { + shares := r.CpuShares + if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil { + return err + } + // read it back + sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares") + if err != nil { + return err + } + // ... and check + if shares > sharesRead { + return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead) + } else if shares < sharesRead { + return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead) + } + } + + var period string + if r.CpuPeriod != 0 { + period = strconv.FormatUint(r.CpuPeriod, 10) + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + // Sometimes when the period to be set is smaller + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_period exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. + if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } else { + period = "" + } + } + if r.CpuQuota != 0 { + if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil { + return err + } + if period != "" { + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + return err + } + } + } + return s.SetRtSched(path, r) +} + +func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { + const file = "cpu.stat" + f, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: path, File: file, Err: err} + } + switch t { + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_time": + stats.CpuStats.ThrottlingData.ThrottledTime = v + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go new file mode 100644 index 000000000..d3bd7e111 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go @@ -0,0 +1,166 @@ +package fs + +import ( + "bufio" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + cgroupCpuacctStat = "cpuacct.stat" + cgroupCpuacctUsageAll = "cpuacct.usage_all" + + nanosecondsInSecond = 1000000000 + + userModeColumn = 1 + kernelModeColumn = 2 + cuacctUsageAllColumnsNumber = 3 + + // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and + // on Linux it's a constant which is safe to be hard coded, + // so we can avoid using cgo here. For details, see: + // https://github.com/containerd/cgroups/pull/12 + clockTicks uint64 = 100 +) + +type CpuacctGroup struct{} + +func (s *CpuacctGroup) Name() string { + return "cpuacct" +} + +func (s *CpuacctGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error { + return nil +} + +func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) + if err != nil { + return err + } + + totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage") + if err != nil { + return err + } + + percpuUsage, err := getPercpuUsage(path) + if err != nil { + return err + } + + percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path) + if err != nil { + return err + } + + stats.CpuStats.CpuUsage.TotalUsage = totalUsage + stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage + stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode + stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode + stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage + stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage + return nil +} + +// Returns user and kernel usage breakdown in nanoseconds. +func getCpuUsageBreakdown(path string) (uint64, uint64, error) { + var userModeUsage, kernelModeUsage uint64 + const ( + userField = "user" + systemField = "system" + file = cgroupCpuacctStat + ) + + // Expected format: + // user <usage in ticks> + // system <usage in ticks> + data, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, 0, err + } + // TODO: use strings.SplitN instead. + fields := strings.Fields(data) + if len(fields) < 4 || fields[0] != userField || fields[2] != systemField { + return 0, 0, malformedLine(path, file, data) + } + if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { + return 0, 0, &parseError{Path: path, File: file, Err: err} + } + if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { + return 0, 0, &parseError{Path: path, File: file, Err: err} + } + + return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil +} + +func getPercpuUsage(path string) ([]uint64, error) { + const file = "cpuacct.usage_percpu" + percpuUsage := []uint64{} + data, err := cgroups.ReadFile(path, file) + if err != nil { + return percpuUsage, err + } + // TODO: use strings.SplitN instead. + for _, value := range strings.Fields(data) { + value, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return percpuUsage, &parseError{Path: path, File: file, Err: err} + } + percpuUsage = append(percpuUsage, value) + } + return percpuUsage, nil +} + +func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) { + usageKernelMode := []uint64{} + usageUserMode := []uint64{} + const file = cgroupCpuacctUsageAll + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return usageKernelMode, usageUserMode, nil + } else if err != nil { + return nil, nil, err + } + defer fd.Close() + + scanner := bufio.NewScanner(fd) + scanner.Scan() // skipping header line + + for scanner.Scan() { + lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1) + if len(lineFields) != cuacctUsageAllColumnsNumber { + continue + } + + usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64) + if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageKernelMode = append(usageKernelMode, usageInKernelMode) + + usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64) + if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageUserMode = append(usageUserMode, usageInUserMode) + } + if err := scanner.Err(); err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + + return usageKernelMode, usageUserMode, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go new file mode 100644 index 000000000..550baa427 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -0,0 +1,245 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type CpusetGroup struct{} + +func (s *CpusetGroup) Name() string { + return "cpuset" +} + +func (s *CpusetGroup) Apply(path string, r *configs.Resources, pid int) error { + return s.ApplyDir(path, r, pid) +} + +func (s *CpusetGroup) Set(path string, r *configs.Resources) error { + if r.CpusetCpus != "" { + if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil { + return err + } + } + if r.CpusetMems != "" { + if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil { + return err + } + } + return nil +} + +func getCpusetStat(path string, file string) ([]uint16, error) { + var extracted []uint16 + fileContent, err := fscommon.GetCgroupParamString(path, file) + if err != nil { + return extracted, err + } + if len(fileContent) == 0 { + return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")} + } + + for _, s := range strings.Split(fileContent, ",") { + sp := strings.SplitN(s, "-", 3) + switch len(sp) { + case 3: + return extracted, &parseError{Path: path, File: file, Err: errors.New("extra dash")} + case 2: + min, err := strconv.ParseUint(sp[0], 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + max, err := strconv.ParseUint(sp[1], 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + if min > max { + return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, min > max")} + } + for i := min; i <= max; i++ { + extracted = append(extracted, uint16(i)) + } + case 1: + value, err := strconv.ParseUint(s, 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + extracted = append(extracted, uint16(value)) + } + } + + return extracted, nil +} + +func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { + var err error + + stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + return nil +} + +func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error { + // This might happen if we have no cpuset cgroup mounted. + // Just do nothing and don't fail. + if dir == "" { + return nil + } + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil { + return err + } + if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) { + return err + } + // We didn't inherit cpuset configs from parent, but we have + // to ensure cpuset configs are set before moving task into the + // cgroup. + // The logic is, if user specified cpuset configs, use these + // specified configs, otherwise, inherit from parent. This makes + // cpuset configs work correctly with 'cpuset.cpu_exclusive', and + // keep backward compatibility. + if err := s.ensureCpusAndMems(dir, r); err != nil { + return err + } + // Since we are not using apply(), we need to place the pid + // into the procs file. + return cgroups.WriteCgroupProc(dir, pid) +} + +func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { + if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil { + return + } + if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil { + return + } + return cpus, mems, nil +} + +// cpusetEnsureParent makes sure that the parent directories of current +// are created and populated with the proper cpus and mems files copied +// from their respective parent. It does that recursively, starting from +// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). +func cpusetEnsureParent(current string) error { + var st unix.Statfs_t + + parent := filepath.Dir(current) + err := unix.Statfs(parent, &st) + if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { + return nil + } + // Treat non-existing directory as cgroupfs as it will be created, + // and the root cpuset directory obviously exists. + if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare + return &os.PathError{Op: "statfs", Path: parent, Err: err} + } + + if err := cpusetEnsureParent(parent); err != nil { + return err + } + if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) { + return err + } + return cpusetCopyIfNeeded(current, parent) +} + +// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func cpusetCopyIfNeeded(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + if err != nil { + return err + } + parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) + if err != nil { + return err + } + + if isEmptyCpuset(currentCpus) { + if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { + return err + } + } + if isEmptyCpuset(currentMems) { + if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil { + return err + } + } + return nil +} + +func isEmptyCpuset(str string) bool { + return str == "" || str == "\n" +} + +func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error { + if err := s.Set(path, r); err != nil { + return err + } + return cpusetCopyIfNeeded(path, filepath.Dir(path)) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go new file mode 100644 index 000000000..0bf3d9deb --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go @@ -0,0 +1,39 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type DevicesGroup struct{} + +func (s *DevicesGroup) Name() string { + return "devices" +} + +func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error { + if r.SkipDevices { + return nil + } + if path == "" { + // Return error here, since devices cgroup + // is a hard requirement for container's security. + return errSubsystemDoesNotExist + } + + return apply(path, pid) +} + +func (s *DevicesGroup) Set(path string, r *configs.Resources) error { + if cgroups.DevicesSetV1 == nil { + if len(r.Devices) == 0 { + return nil + } + return cgroups.ErrDevicesUnsupported + } + return cgroups.DevicesSetV1(path, r) +} + +func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go new file mode 100644 index 000000000..f2ab6f130 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go @@ -0,0 +1,15 @@ +package fs + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +type parseError = fscommon.ParseError + +// malformedLine is used by all cgroupfs file parsers that expect a line +// in a particular format but get some garbage instead. +func malformedLine(path, file, line string) error { + return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)} +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go new file mode 100644 index 000000000..987f1bf5e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go @@ -0,0 +1,158 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +type FreezerGroup struct{} + +func (s *FreezerGroup) Name() string { + return "freezer" +} + +func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) { + switch r.Freezer { + case configs.Frozen: + defer func() { + if Err != nil { + // Freezing failed, and it is bad and dangerous + // to leave the cgroup in FROZEN or FREEZING + // state, so (try to) thaw it back. + _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + } + }() + + // As per older kernel docs (freezer-subsystem.txt before + // kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + // userspace should either retry or thaw. While current + // kernel cgroup v1 docs no longer mention a need to retry, + // even a recent kernel (v5.4, Ubuntu 20.04) can't reliably + // freeze a cgroup v1 while new processes keep appearing in it + // (either via fork/clone or by writing new PIDs to + // cgroup.procs). + // + // The numbers below are empirically chosen to have a decent + // chance to succeed in various scenarios ("runc pause/unpause + // with parallel runc exec" and "bare freeze/unfreeze on a very + // slow system"), tested on RHEL7 and Ubuntu 20.04 kernels. + // + // Adding any amount of sleep in between retries did not + // increase the chances of successful freeze in "pause/unpause + // with parallel exec" reproducer. OTOH, adding an occasional + // sleep helped for the case where the system is extremely slow + // (CentOS 7 VM on GHA CI). + // + // Alas, this is still a game of chances, since the real fix + // belong to the kernel (cgroup v2 do not have this bug). + + for i := 0; i < 1000; i++ { + if i%50 == 49 { + // Occasional thaw and sleep improves + // the chances to succeed in freezing + // in case new processes keep appearing + // in the cgroup. + _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + time.Sleep(10 * time.Millisecond) + } + + if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil { + return err + } + + if i%25 == 24 { + // Occasional short sleep before reading + // the state back also improves the chances to + // succeed in freezing in case of a very slow + // system. + time.Sleep(10 * time.Microsecond) + } + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + return err + } + state = strings.TrimSpace(state) + switch state { + case "FREEZING": + continue + case string(configs.Frozen): + if i > 1 { + logrus.Debugf("frozen after %d retries", i) + } + return nil + default: + // should never happen + return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state)) + } + } + // Despite our best efforts, it got stuck in FREEZING. + return errors.New("unable to freeze") + case configs.Thawed: + return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + case configs.Undefined: + return nil + default: + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer)) + } +} + +func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) { + for { + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + // If the kernel is too old, then we just treat the freezer as + // being in an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Undefined, err + } + switch strings.TrimSpace(state) { + case "THAWED": + return configs.Thawed, nil + case "FROZEN": + // Find out whether the cgroup is frozen directly, + // or indirectly via an ancestor. + self, err := cgroups.ReadFile(path, "freezer.self_freezing") + if err != nil { + // If the kernel is too old, then we just treat + // it as being frozen. + if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Frozen, err + } + switch self { + case "0\n": + return configs.Thawed, nil + case "1\n": + return configs.Frozen, nil + default: + return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self) + } + case "FREEZING": + // Make sure we get a stable freezer state, so retry if the cgroup + // is still undergoing freezing. This should be a temporary delay. + time.Sleep(1 * time.Millisecond) + continue + default: + return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state) + } + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go new file mode 100644 index 000000000..be4dcc341 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go @@ -0,0 +1,264 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +var subsystems = []subsystem{ + &CpusetGroup{}, + &DevicesGroup{}, + &MemoryGroup{}, + &CpuGroup{}, + &CpuacctGroup{}, + &PidsGroup{}, + &BlkioGroup{}, + &HugetlbGroup{}, + &NetClsGroup{}, + &NetPrioGroup{}, + &PerfEventGroup{}, + &FreezerGroup{}, + &RdmaGroup{}, + &NameGroup{GroupName: "name=systemd", Join: true}, +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +func init() { + // If using cgroups-hybrid mode then add a "" controller indicating + // it should join the cgroups v2. + if cgroups.IsCgroup2HybridMode() { + subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true}) + } +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // GetStats fills in the stats for the subsystem. + GetStats(path string, stats *cgroups.Stats) error + // Apply creates and joins a cgroup, adding pid into it. Some + // subsystems use resources to pre-configure the cgroup parents + // before creating or joining it. + Apply(path string, r *configs.Resources, pid int) error + // Set sets the cgroup resources. + Set(path string, r *configs.Resources) error +} + +type manager struct { + mu sync.Mutex + cgroups *configs.Cgroup + paths map[string]string +} + +func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) { + // Some v1 controllers (cpu, cpuset, and devices) expect + // cgroups.Resources to not be nil in Apply. + if cg.Resources == nil { + return nil, errors.New("cgroup v1 manager needs configs.Resources to be set during manager creation") + } + if cg.Resources.Unified != nil { + return nil, cgroups.ErrV1NoUnified + } + + if paths == nil { + var err error + paths, err = initPaths(cg) + if err != nil { + return nil, err + } + } + + return &manager{ + cgroups: cg, + paths: paths, + }, nil +} + +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. + if !rootless { + return false + } + // Is it an ordinary EPERM? + if errors.Is(err, os.ErrPermission) { + return true + } + // Handle some specific syscall errors. + var errno unix.Errno + if errors.As(err, &errno) { + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES + } + return false +} + +func (m *manager) Apply(pid int) (err error) { + m.mu.Lock() + defer m.mu.Unlock() + + c := m.cgroups + + for _, sys := range subsystems { + name := sys.Name() + p, ok := m.paths[name] + if !ok { + continue + } + + if err := sys.Apply(p, c.Resources, pid); err != nil { + // In the case of rootless (including euid=0 in userns), where an + // explicit cgroup path hasn't been set, we don't bail on error in + // case of permission problems here, but do delete the path from + // the m.paths map, since it is either non-existent and could not + // be created, or the pid could not be added to it. + // + // Cases where limits for the subsystem have been set are handled + // later by Set, which fails with a friendly error (see + // if path == "" in Set). + if isIgnorableError(c.Rootless, err) && c.Path == "" { + delete(m.paths, name) + continue + } + return err + } + + } + return nil +} + +func (m *manager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + return cgroups.RemovePaths(m.paths) +} + +func (m *manager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *manager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + return stats, nil +} + +func (m *manager) Set(r *configs.Resources) error { + if r == nil { + return nil + } + + if r.Unified != nil { + return cgroups.ErrV1NoUnified + } + + m.mu.Lock() + defer m.mu.Unlock() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if err := sys.Set(path, r); err != nil { + // When rootless is true, errors from the device subsystem + // are ignored, as it is really not expected to work. + if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) { + continue + } + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if path == "" { + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). + return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) + } + return err + } + } + + return nil +} + +// Freeze toggles the container's freezer cgroup depending on the state +// provided +func (m *manager) Freeze(state configs.FreezerState) error { + path := m.Path("freezer") + if path == "" { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + + prevState := m.cgroups.Resources.Freezer + m.cgroups.Resources.Freezer = state + freezer := &FreezerGroup{} + if err := freezer.Set(path, m.cgroups.Resources); err != nil { + m.cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.Path("devices")) +} + +func (m *manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.Path("devices")) +} + +func (m *manager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *manager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *manager) GetFreezerState() (configs.FreezerState, error) { + dir := m.Path("freezer") + // If the container doesn't have the freezer cgroup, say it's undefined. + if dir == "" { + return configs.Undefined, nil + } + freezer := &FreezerGroup{} + return freezer.GetState(dir) +} + +func (m *manager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill") +} + +func (m *manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.Path("memory")) + // Ignore ENOENT when rootless as it couldn't create cgroup. + if err != nil && m.cgroups.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go new file mode 100644 index 000000000..8ddd6fdd8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go @@ -0,0 +1,62 @@ +package fs + +import ( + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type HugetlbGroup struct{} + +func (s *HugetlbGroup) Name() string { + return "hugetlb" +} + +func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *HugetlbGroup) Set(path string, r *configs.Resources) error { + for _, hugetlb := range r.HugetlbLimit { + if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + return err + } + } + + return nil +} + +func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + hugetlbStats := cgroups.HugetlbStats{} + for _, pageSize := range cgroups.HugePageSizes() { + usage := "hugetlb." + pageSize + ".usage_in_bytes" + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + return err + } + hugetlbStats.Usage = value + + maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes" + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + return err + } + hugetlbStats.MaxUsage = value + + failcnt := "hugetlb." + pageSize + ".failcnt" + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + return err + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pageSize] = hugetlbStats + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go new file mode 100644 index 000000000..b7c75f941 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -0,0 +1,348 @@ +package fs + +import ( + "bufio" + "errors" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemoryUsage = "memory.usage_in_bytes" + cgroupMemoryMaxUsage = "memory.max_usage_in_bytes" +) + +type MemoryGroup struct{} + +func (s *MemoryGroup) Name() string { + return "memory" +} + +func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func setMemory(path string, val int64) error { + if val == 0 { + return nil + } + + err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) + if !errors.Is(err, unix.EBUSY) { + return err + } + + // EBUSY means the kernel can't set new limit as it's too low + // (lower than the current usage). Return more specific error. + usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage) + if err != nil { + return err + } + max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage) + if err != nil { + return err + } + + return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max) +} + +func setSwap(path string, val int64) error { + if val == 0 { + return nil + } + + return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10)) +} + +func setMemoryAndSwap(path string, r *configs.Resources) error { + // If the memory update is set to -1 and the swap is not explicitly + // set, we should also set swap to -1, it means unlimited memory. + if r.Memory == -1 && r.MemorySwap == 0 { + // Only set swap if it's enabled in kernel + if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { + r.MemorySwap = -1 + } + } + + // When memory and swap memory are both set, we need to handle the cases + // for updating container. + if r.Memory != 0 && r.MemorySwap != 0 { + curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit) + if err != nil { + return err + } + + // When update memory limit, we should adapt the write sequence + // for memory and swap memory, so it won't fail because the new + // value and the old value don't fit kernel's validation. + if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) { + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + if err := setMemory(path, r.Memory); err != nil { + return err + } + return nil + } + } + + if err := setMemory(path, r.Memory); err != nil { + return err + } + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + + return nil +} + +func (s *MemoryGroup) Set(path string, r *configs.Resources) error { + if err := setMemoryAndSwap(path, r); err != nil { + return err + } + + // ignore KernelMemory and KernelMemoryTCP + + if r.MemoryReservation != 0 { + if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil { + return err + } + } + + if r.OomKillDisable { + if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil { + return err + } + } + if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 { + return nil + } else if *r.MemorySwappiness <= 100 { + if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil { + return err + } + } else { + return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness) + } + + return nil +} + +func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { + const file = "memory.stat" + statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: path, File: file, Err: err} + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryData(path, "memsw") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + kernelUsage, err := getMemoryData(path, "kmem") + if err != nil { + return err + } + stats.MemoryStats.KernelUsage = kernelUsage + kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") + if err != nil { + return err + } + stats.MemoryStats.KernelTCPUsage = kernelTCPUsage + + value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy") + if err != nil { + return err + } + if value == 1 { + stats.MemoryStats.UseHierarchy = true + } + + pagesByNUMA, err := getPageUsageByNUMA(path) + if err != nil { + return err + } + stats.MemoryStats.PageUsageByNUMA = pagesByNUMA + + return nil +} + +func getMemoryData(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = "memory." + name + } + var ( + usage = moduleName + ".usage_in_bytes" + maxUsage = moduleName + ".max_usage_in_bytes" + failcnt = moduleName + ".failcnt" + limit = moduleName + ".limit_in_bytes" + ) + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if name != "" && os.IsNotExist(err) { + // Ignore ENOENT as swap and kmem controllers + // are optional in the kernel. + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, err + } + memoryData.Usage = value + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.MaxUsage = value + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.Failcnt = value + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.Limit = value + + return memoryData, nil +} + +func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) { + const ( + maxColumns = math.MaxUint8 + 1 + file = "memory.numa_stat" + ) + stats := cgroups.PageUsageByNUMA{} + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return stats, nil + } else if err != nil { + return stats, err + } + defer fd.Close() + + // File format is documented in linux/Documentation/cgroup-v1/memory.txt + // and it looks like this: + // + // total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... + // file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... + // anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... + // unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... + // hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ... + + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + var field *cgroups.PageStats + + line := scanner.Text() + columns := strings.SplitN(line, " ", maxColumns) + for i, column := range columns { + byNode := strings.SplitN(column, "=", 2) + // Some custom kernels have non-standard fields, like + // numa_locality 0 0 0 0 0 0 0 0 0 0 + // numa_exectime 0 + if len(byNode) < 2 { + if i == 0 { + // Ignore/skip those. + break + } else { + // The first column was already validated, + // so be strict to the rest. + return stats, malformedLine(path, file, line) + } + } + key, val := byNode[0], byNode[1] + if i == 0 { // First column: key is name, val is total. + field = getNUMAField(&stats, key) + if field == nil { // unknown field (new kernel?) + break + } + field.Total, err = strconv.ParseUint(val, 0, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + field.Nodes = map[uint8]uint64{} + } else { // Subsequent columns: key is N<id>, val is usage. + if len(key) < 2 || key[0] != 'N' { + // This is definitely an error. + return stats, malformedLine(path, file, line) + } + + n, err := strconv.ParseUint(key[1:], 10, 8) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + usage, err := strconv.ParseUint(val, 10, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + field.Nodes[uint8(n)] = usage + } + + } + } + if err := scanner.Err(); err != nil { + return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err} + } + + return stats, nil +} + +func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats { + switch name { + case "total": + return &stats.Total + case "file": + return &stats.File + case "anon": + return &stats.Anon + case "unevictable": + return &stats.Unevictable + case "hierarchical_total": + return &stats.Hierarchical.Total + case "hierarchical_file": + return &stats.Hierarchical.File + case "hierarchical_anon": + return &stats.Hierarchical.Anon + case "hierarchical_unevictable": + return &stats.Hierarchical.Unevictable + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go new file mode 100644 index 000000000..b8d5d849c --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go @@ -0,0 +1,31 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NameGroup struct { + GroupName string + Join bool +} + +func (s *NameGroup) Name() string { + return s.GroupName +} + +func (s *NameGroup) Apply(path string, _ *configs.Resources, pid int) error { + if s.Join { + // Ignore errors if the named cgroup does not exist. + _ = apply(path, pid) + } + return nil +} + +func (s *NameGroup) Set(_ string, _ *configs.Resources) error { + return nil +} + +func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go new file mode 100644 index 000000000..abfd09ce8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go @@ -0,0 +1,32 @@ +package fs + +import ( + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetClsGroup struct{} + +func (s *NetClsGroup) Name() string { + return "net_cls" +} + +func (s *NetClsGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *NetClsGroup) Set(path string, r *configs.Resources) error { + if r.NetClsClassid != 0 { + if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil { + return err + } + } + + return nil +} + +func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go new file mode 100644 index 000000000..da74d3779 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go @@ -0,0 +1,30 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetPrioGroup struct{} + +func (s *NetPrioGroup) Name() string { + return "net_prio" +} + +func (s *NetPrioGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *NetPrioGroup) Set(path string, r *configs.Resources) error { + for _, prioMap := range r.NetPrioIfpriomap { + if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { + return err + } + } + + return nil +} + +func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go new file mode 100644 index 000000000..1092331b2 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go @@ -0,0 +1,186 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" +) + +// The absolute path to the root of the cgroup hierarchies. +var ( + cgroupRootLock sync.Mutex + cgroupRoot string +) + +const defaultCgroupRoot = "/sys/fs/cgroup" + +func initPaths(cg *configs.Cgroup) (map[string]string, error) { + root, err := rootPath() + if err != nil { + return nil, err + } + + inner, err := innerPath(cg) + if err != nil { + return nil, err + } + + paths := make(map[string]string) + for _, sys := range subsystems { + name := sys.Name() + path, err := subsysPath(root, inner, name) + if err != nil { + // The non-presence of the devices subsystem + // is considered fatal for security reasons. + if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") { + continue + } + + return nil, err + } + paths[name] = path + } + + return paths, nil +} + +func tryDefaultCgroupRoot() string { + var st, pst unix.Stat_t + + // (1) it should be a directory... + err := unix.Lstat(defaultCgroupRoot, &st) + if err != nil || st.Mode&unix.S_IFDIR == 0 { + return "" + } + + // (2) ... and a mount point ... + err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst) + if err != nil { + return "" + } + + if st.Dev == pst.Dev { + // parent dir has the same dev -- not a mount point + return "" + } + + // (3) ... of 'tmpfs' fs type. + var fst unix.Statfs_t + err = unix.Statfs(defaultCgroupRoot, &fst) + if err != nil || fst.Type != unix.TMPFS_MAGIC { + return "" + } + + // (4) it should have at least 1 entry ... + dir, err := os.Open(defaultCgroupRoot) + if err != nil { + return "" + } + names, err := dir.Readdirnames(1) + if err != nil { + return "" + } + if len(names) < 1 { + return "" + } + // ... which is a cgroup mount point. + err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return defaultCgroupRoot +} + +// rootPath finds and returns path to the root of the cgroup hierarchies. +func rootPath() (string, error) { + cgroupRootLock.Lock() + defer cgroupRootLock.Unlock() + + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // fast path + cgroupRoot = tryDefaultCgroupRoot() + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // slow path: parse mountinfo + mi, err := cgroups.GetCgroupMounts(false) + if err != nil { + return "", err + } + if len(mi) < 1 { + return "", errors.New("no cgroup mount found in mountinfo") + } + + // Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"), + // use its parent directory. + root := filepath.Dir(mi[0].Mountpoint) + + if _, err := os.Stat(root); err != nil { + return "", err + } + + cgroupRoot = root + return cgroupRoot, nil +} + +func innerPath(c *configs.Cgroup) (string, error) { + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return "", errors.New("cgroup: either Path or Name and Parent should be used") + } + + // XXX: Do not remove CleanPath. Path safety is important! -- cyphar + innerPath := utils.CleanPath(c.Path) + if innerPath == "" { + cgParent := utils.CleanPath(c.Parent) + cgName := utils.CleanPath(c.Name) + innerPath = filepath.Join(cgParent, cgName) + } + + return innerPath, nil +} + +func subsysPath(root, inner, subsystem string) (string, error) { + // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. + if filepath.IsAbs(inner) { + mnt, err := cgroups.FindCgroupMountpoint(root, subsystem) + // If we didn't mount the subsystem, there is no point we make the path. + if err != nil { + return "", err + } + + // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. + return filepath.Join(root, filepath.Base(mnt), inner), nil + } + + // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating + // process could in container and shared pid namespace with host, and + // /proc/1/cgroup could point to whole other world of cgroups. + parentPath, err := cgroups.GetOwnCgroupPath(subsystem) + if err != nil { + return "", err + } + + return filepath.Join(parentPath, inner), nil +} + +func apply(path string, pid int) error { + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + return cgroups.WriteCgroupProc(path, pid) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go new file mode 100644 index 000000000..b86955c8f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go @@ -0,0 +1,24 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PerfEventGroup struct{} + +func (s *PerfEventGroup) Name() string { + return "perf_event" +} + +func (s *PerfEventGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error { + return nil +} + +func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go new file mode 100644 index 000000000..1f13532a5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go @@ -0,0 +1,62 @@ +package fs + +import ( + "math" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PidsGroup struct{} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *PidsGroup) Set(path string, r *configs.Resources) error { + if r.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if r.PidsLimit > 0 { + limit = strconv.FormatInt(r.PidsLimit, 10) + } + + if err := cgroups.WriteFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + current, err := fscommon.GetCgroupParamUint(path, "pids.current") + if err != nil { + return err + } + + max, err := fscommon.GetCgroupParamUint(path, "pids.max") + if err != nil { + return err + } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. + if max == math.MaxUint64 { + max = 0 + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go new file mode 100644 index 000000000..5bbe0f35f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go @@ -0,0 +1,25 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type RdmaGroup struct{} + +func (s *RdmaGroup) Name() string { + return "rdma" +} + +func (s *RdmaGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *RdmaGroup) Set(path string, r *configs.Resources) error { + return fscommon.RdmaSet(path, r) +} + +func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error { + return fscommon.RdmaGetStats(path, stats) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go new file mode 100644 index 000000000..bbbae4d58 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go @@ -0,0 +1,87 @@ +package fs2 + +import ( + "bufio" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isCpuSet(r *configs.Resources) bool { + return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 +} + +func setCpu(dirPath string, r *configs.Resources) error { + if !isCpuSet(r) { + return nil + } + + // NOTE: .CpuShares is not used here. Conversion is the caller's responsibility. + if r.CpuWeight != 0 { + if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil { + return err + } + } + + if r.CpuQuota != 0 || r.CpuPeriod != 0 { + str := "max" + if r.CpuQuota > 0 { + str = strconv.FormatInt(r.CpuQuota, 10) + } + period := r.CpuPeriod + if period == 0 { + // This default value is documented in + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + period = 100000 + } + str += " " + strconv.FormatUint(period, 10) + if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil { + return err + } + } + + return nil +} + +func statCpu(dirPath string, stats *cgroups.Stats) error { + const file = "cpu.stat" + f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) + if err != nil { + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + switch t { + case "usage_usec": + stats.CpuStats.CpuUsage.TotalUsage = v * 1000 + + case "user_usec": + stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000 + + case "system_usec": + stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 + + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_usec": + stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000 + } + } + if err := sc.Err(); err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go new file mode 100644 index 000000000..16c45bad8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go @@ -0,0 +1,28 @@ +package fs2 + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isCpusetSet(r *configs.Resources) bool { + return r.CpusetCpus != "" || r.CpusetMems != "" +} + +func setCpuset(dirPath string, r *configs.Resources) error { + if !isCpusetSet(r) { + return nil + } + + if r.CpusetCpus != "" { + if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil { + return err + } + } + if r.CpusetMems != "" { + if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go new file mode 100644 index 000000000..641123a4d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go @@ -0,0 +1,152 @@ +package fs2 + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func supportedControllers() (string, error) { + return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers") +} + +// needAnyControllers returns whether we enable some supported controllers or not, +// based on (1) controllers available and (2) resources that are being set. +// We don't check "pseudo" controllers such as +// "freezer" and "devices". +func needAnyControllers(r *configs.Resources) (bool, error) { + if r == nil { + return false, nil + } + + // list of all available controllers + content, err := supportedControllers() + if err != nil { + return false, err + } + avail := make(map[string]struct{}) + for _, ctr := range strings.Fields(content) { + avail[ctr] = struct{}{} + } + + // check whether the controller if available or not + have := func(controller string) bool { + _, ok := avail[controller] + return ok + } + + if isPidsSet(r) && have("pids") { + return true, nil + } + if isMemorySet(r) && have("memory") { + return true, nil + } + if isIoSet(r) && have("io") { + return true, nil + } + if isCpuSet(r) && have("cpu") { + return true, nil + } + if isCpusetSet(r) && have("cpuset") { + return true, nil + } + if isHugeTlbSet(r) && have("hugetlb") { + return true, nil + } + + return false, nil +} + +// containsDomainController returns whether the current config contains domain controller or not. +// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html +// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids. +func containsDomainController(r *configs.Resources) bool { + return isMemorySet(r) || isIoSet(r) || isCpuSet(r) || isHugeTlbSet(r) +} + +// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers. +func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) { + if !strings.HasPrefix(path, UnifiedMountpoint) { + return fmt.Errorf("invalid cgroup path %s", path) + } + + content, err := supportedControllers() + if err != nil { + return err + } + + const ( + cgTypeFile = "cgroup.type" + cgStCtlFile = "cgroup.subtree_control" + ) + ctrs := strings.Fields(content) + res := "+" + strings.Join(ctrs, " +") + + elements := strings.Split(path, "/") + elements = elements[3:] + current := "/sys/fs" + for i, e := range elements { + current = filepath.Join(current, e) + if i > 0 { + if err := os.Mkdir(current, 0o755); err != nil { + if !os.IsExist(err) { + return err + } + } else { + // If the directory was created, be sure it is not left around on errors. + current := current + defer func() { + if Err != nil { + os.Remove(current) + } + }() + } + cgType, _ := cgroups.ReadFile(current, cgTypeFile) + cgType = strings.TrimSpace(cgType) + switch cgType { + // If the cgroup is in an invalid mode (usually this means there's an internal + // process in the cgroup tree, because we created a cgroup under an + // already-populated-by-other-processes cgroup), then we have to error out if + // the user requested controllers which are not thread-aware. However, if all + // the controllers requested are thread-aware we can simply put the cgroup into + // threaded mode. + case "domain invalid": + if containsDomainController(c.Resources) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current) + } else { + // Not entirely correct (in theory we'd always want to be a domain -- + // since that means we're a properly delegated cgroup subtree) but in + // this case there's not much we can do and it's better than giving an + // error. + _ = cgroups.WriteFile(current, cgTypeFile, "threaded") + } + // If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers + // (and you cannot usually take a cgroup out of threaded mode). + case "domain threaded": + fallthrough + case "threaded": + if containsDomainController(c.Resources) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType) + } + } + } + // enable all supported controllers + if i < len(elements)-1 { + if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil { + // try write one by one + allCtrs := strings.Split(res, " ") + for _, ctr := range allCtrs { + _ = cgroups.WriteFile(current, cgStCtlFile, ctr) + } + } + // Some controllers might not be enabled when rootless or containerized, + // but we don't catch the error here. (Caught in setXXX() functions.) + } + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go new file mode 100644 index 000000000..9c949c91f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go @@ -0,0 +1,99 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package fs2 + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" +) + +const UnifiedMountpoint = "/sys/fs/cgroup" + +func defaultDirPath(c *configs.Cgroup) (string, error) { + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return "", fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c) + } + + return _defaultDirPath(UnifiedMountpoint, c.Path, c.Parent, c.Name) +} + +func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) { + if (cgName != "" || cgParent != "") && cgPath != "" { + return "", errors.New("cgroup: either Path or Name and Parent should be used") + } + + // XXX: Do not remove CleanPath. Path safety is important! -- cyphar + innerPath := utils.CleanPath(cgPath) + if innerPath == "" { + cgParent := utils.CleanPath(cgParent) + cgName := utils.CleanPath(cgName) + innerPath = filepath.Join(cgParent, cgName) + } + if filepath.IsAbs(innerPath) { + return filepath.Join(root, innerPath), nil + } + + ownCgroup, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + // The current user scope most probably has tasks in it already, + // making it impossible to enable controllers for its sub-cgroup. + // A parent cgroup (with no tasks in it) is what we need. + ownCgroup = filepath.Dir(ownCgroup) + + return filepath.Join(root, ownCgroup, innerPath), nil +} + +// parseCgroupFile parses /proc/PID/cgroup file and return string +func parseCgroupFile(path string) (string, error) { + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + return parseCgroupFromReader(f) +} + +func parseCgroupFromReader(r io.Reader) (string, error) { + s := bufio.NewScanner(r) + for s.Scan() { + var ( + text = s.Text() + parts = strings.SplitN(text, ":", 3) + ) + if len(parts) < 3 { + return "", fmt.Errorf("invalid cgroup entry: %q", text) + } + // text is like "0::/user.slice/user-1001.slice/session-1.scope" + if parts[0] == "0" && parts[1] == "" { + return parts[2], nil + } + } + if err := s.Err(); err != nil { + return "", err + } + return "", errors.New("cgroup path not found") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go new file mode 100644 index 000000000..8917a6411 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go @@ -0,0 +1,127 @@ +package fs2 + +import ( + "bufio" + "errors" + "fmt" + "os" + "strings" + "time" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func setFreezer(dirPath string, state configs.FreezerState) error { + var stateStr string + switch state { + case configs.Undefined: + return nil + case configs.Frozen: + stateStr = "1" + case configs.Thawed: + stateStr = "0" + default: + return fmt.Errorf("invalid freezer state %q requested", state) + } + + fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR) + if err != nil { + // We can ignore this request as long as the user didn't ask us to + // freeze the container (since without the freezer cgroup, that's a + // no-op). + if state != configs.Frozen { + return nil + } + return fmt.Errorf("freezer not supported: %w", err) + } + defer fd.Close() + + if _, err := fd.WriteString(stateStr); err != nil { + return err + } + // Confirm that the cgroup did actually change states. + if actualState, err := readFreezer(dirPath, fd); err != nil { + return err + } else if actualState != state { + return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState) + } + return nil +} + +func getFreezer(dirPath string) (configs.FreezerState, error) { + fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY) + if err != nil { + // If the kernel is too old, then we just treat the freezer as being in + // an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Undefined, err + } + defer fd.Close() + + return readFreezer(dirPath, fd) +} + +func readFreezer(dirPath string, fd *os.File) (configs.FreezerState, error) { + if _, err := fd.Seek(0, 0); err != nil { + return configs.Undefined, err + } + state := make([]byte, 2) + if _, err := fd.Read(state); err != nil { + return configs.Undefined, err + } + switch string(state) { + case "0\n": + return configs.Thawed, nil + case "1\n": + return waitFrozen(dirPath) + default: + return configs.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state) + } +} + +// waitFrozen polls cgroup.events until it sees "frozen 1" in it. +func waitFrozen(dirPath string) (configs.FreezerState, error) { + fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY) + if err != nil { + return configs.Undefined, err + } + defer fd.Close() + + // XXX: Simple wait/read/retry is used here. An implementation + // based on poll(2) or inotify(7) is possible, but it makes the code + // much more complicated. Maybe address this later. + const ( + // Perform maxIter with waitTime in between iterations. + waitTime = 10 * time.Millisecond + maxIter = 1000 + ) + scanner := bufio.NewScanner(fd) + for i := 0; scanner.Scan(); { + if i == maxIter { + return configs.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter) + } + line := scanner.Text() + val := strings.TrimPrefix(line, "frozen ") + if val != line { // got prefix + if val[0] == '1' { + return configs.Frozen, nil + } + + i++ + // wait, then re-read + time.Sleep(waitTime) + _, err := fd.Seek(0, 0) + if err != nil { + return configs.Undefined, err + } + } + } + // Should only reach here either on read error, + // or if the file does not contain "frozen " line. + return configs.Undefined, scanner.Err() +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go new file mode 100644 index 000000000..d5208d778 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go @@ -0,0 +1,271 @@ +package fs2 + +import ( + "errors" + "fmt" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type parseError = fscommon.ParseError + +type manager struct { + config *configs.Cgroup + // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + dirPath string + // controllers is content of "cgroup.controllers" file. + // excludes pseudo-controllers ("devices" and "freezer"). + controllers map[string]struct{} +} + +// NewManager creates a manager for cgroup v2 unified hierarchy. +// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope". +// If dirPath is empty, it is automatically set using config. +func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) { + if dirPath == "" { + var err error + dirPath, err = defaultDirPath(config) + if err != nil { + return nil, err + } + } + + m := &manager{ + config: config, + dirPath: dirPath, + } + return m, nil +} + +func (m *manager) getControllers() error { + if m.controllers != nil { + return nil + } + + data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers") + if err != nil { + if m.config.Rootless && m.config.Path == "" { + return nil + } + return err + } + fields := strings.Fields(data) + m.controllers = make(map[string]struct{}, len(fields)) + for _, c := range fields { + m.controllers[c] = struct{}{} + } + + return nil +} + +func (m *manager) Apply(pid int) error { + if err := CreateCgroupPath(m.dirPath, m.config); err != nil { + // Related tests: + // - "runc create (no limits + no cgrouppath + no permission) succeeds" + // - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" + // - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if m.config.Rootless { + if m.config.Path == "" { + if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed { + return nil + } + return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err) + } + } + return err + } + if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil { + return err + } + return nil +} + +func (m *manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.dirPath) +} + +func (m *manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.dirPath) +} + +func (m *manager) GetStats() (*cgroups.Stats, error) { + var errs []error + + st := cgroups.NewStats() + + // pids (since kernel 4.5) + if err := statPids(m.dirPath, st); err != nil { + errs = append(errs, err) + } + // memory (since kernel 4.5) + if err := statMemory(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // io (since kernel 4.5) + if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // cpu (since kernel 4.15) + // Note cpu.stat is available even if the controller is not enabled. + if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // hugetlb (since kernel 5.6) + if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + if len(errs) > 0 && !m.config.Rootless { + return st, fmt.Errorf("error while statting cgroup v2: %+v", errs) + } + return st, nil +} + +func (m *manager) Freeze(state configs.FreezerState) error { + if m.config.Resources == nil { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + if err := setFreezer(m.dirPath, state); err != nil { + return err + } + m.config.Resources.Freezer = state + return nil +} + +func (m *manager) Destroy() error { + return cgroups.RemovePath(m.dirPath) +} + +func (m *manager) Path(_ string) string { + return m.dirPath +} + +func (m *manager) Set(r *configs.Resources) error { + if r == nil { + return nil + } + if err := m.getControllers(); err != nil { + return err + } + // pids (since kernel 4.5) + if err := setPids(m.dirPath, r); err != nil { + return err + } + // memory (since kernel 4.5) + if err := setMemory(m.dirPath, r); err != nil { + return err + } + // io (since kernel 4.5) + if err := setIo(m.dirPath, r); err != nil { + return err + } + // cpu (since kernel 4.15) + if err := setCpu(m.dirPath, r); err != nil { + return err + } + // devices (since kernel 4.15, pseudo-controller) + // + // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if err := setDevices(m.dirPath, r); err != nil { + if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) { + return err + } + } + // cpuset (since kernel 5.0) + if err := setCpuset(m.dirPath, r); err != nil { + return err + } + // hugetlb (since kernel 5.6) + if err := setHugeTlb(m.dirPath, r); err != nil { + return err + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaSet(m.dirPath, r); err != nil { + return err + } + // freezer (since kernel 5.2, pseudo-controller) + if err := setFreezer(m.dirPath, r.Freezer); err != nil { + return err + } + if err := m.setUnified(r.Unified); err != nil { + return err + } + m.config.Resources = r + return nil +} + +func setDevices(dirPath string, r *configs.Resources) error { + if cgroups.DevicesSetV2 == nil { + if len(r.Devices) > 0 { + return cgroups.ErrDevicesUnsupported + } + return nil + } + return cgroups.DevicesSetV2(dirPath, r) +} + +func (m *manager) setUnified(res map[string]string) error { + for k, v := range res { + if strings.Contains(k, "/") { + return fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + if err := cgroups.WriteFile(m.dirPath, k, v); err != nil { + // Check for both EPERM and ENOENT since O_CREAT is used by WriteFile. + if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) { + // Check if a controller is available, + // to give more specific error if not. + sk := strings.SplitN(k, ".", 2) + if len(sk) != 2 { + return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + c := sk[0] + if _, ok := m.controllers[c]; !ok && c != "cgroup" { + return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c) + } + } + return fmt.Errorf("unable to set unified resource %q: %w", k, err) + } + } + + return nil +} + +func (m *manager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.dirPath + return paths +} + +func (m *manager) GetCgroups() (*configs.Cgroup, error) { + return m.config, nil +} + +func (m *manager) GetFreezerState() (configs.FreezerState, error) { + return getFreezer(m.dirPath) +} + +func (m *manager) Exists() bool { + return cgroups.PathExists(m.dirPath) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.events", "oom_kill") +} + +func (m *manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.dirPath) + if err != nil && m.config.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go new file mode 100644 index 000000000..c92a7e64a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go @@ -0,0 +1,48 @@ +package fs2 + +import ( + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isHugeTlbSet(r *configs.Resources) bool { + return len(r.HugetlbLimit) > 0 +} + +func setHugeTlb(dirPath string, r *configs.Resources) error { + if !isHugeTlbSet(r) { + return nil + } + for _, hugetlb := range r.HugetlbLimit { + if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + return err + } + } + + return nil +} + +func statHugeTlb(dirPath string, stats *cgroups.Stats) error { + hugetlbStats := cgroups.HugetlbStats{} + for _, pagesize := range cgroups.HugePageSizes() { + value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current") + if err != nil { + return err + } + hugetlbStats.Usage = value + + fileName := "hugetlb." + pagesize + ".events" + value, err = fscommon.GetValueByKey(dirPath, fileName, "max") + if err != nil { + return err + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pagesize] = hugetlbStats + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go new file mode 100644 index 000000000..b2ff7d340 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go @@ -0,0 +1,193 @@ +package fs2 + +import ( + "bufio" + "bytes" + "fmt" + "os" + "strconv" + "strings" + + "github.com/sirupsen/logrus" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isIoSet(r *configs.Resources) bool { + return r.BlkioWeight != 0 || + len(r.BlkioWeightDevice) > 0 || + len(r.BlkioThrottleReadBpsDevice) > 0 || + len(r.BlkioThrottleWriteBpsDevice) > 0 || + len(r.BlkioThrottleReadIOPSDevice) > 0 || + len(r.BlkioThrottleWriteIOPSDevice) > 0 +} + +// bfqDeviceWeightSupported checks for per-device BFQ weight support (added +// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight". +func bfqDeviceWeightSupported(bfq *os.File) bool { + if bfq == nil { + return false + } + _, _ = bfq.Seek(0, 0) + buf := make([]byte, 32) + _, _ = bfq.Read(buf) + // If only a single number (default weight) if read back, we have older kernel. + _, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64) + return err != nil +} + +func setIo(dirPath string, r *configs.Resources) error { + if !isIoSet(r) { + return nil + } + + // If BFQ IO scheduler is available, use it. + var bfq *os.File + if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 { + var err error + bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR) + if err == nil { + defer bfq.Close() + } else if !os.IsNotExist(err) { + return err + } + } + + if r.BlkioWeight != 0 { + if bfq != nil { // Use BFQ. + if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { + return err + } + } else { + // Fallback to io.weight with a conversion scheme. + v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight) + if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil { + return err + } + } + } + if bfqDeviceWeightSupported(bfq) { + for _, wd := range r.BlkioWeightDevice { + if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil { + return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err) + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil { + return err + } + } + + return nil +} + +func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) { + ret := map[string][]string{} + f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY) + if err != nil { + return nil, err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + ret[parts[0]] = parts[1:] + } + if err := scanner.Err(); err != nil { + return nil, &parseError{Path: dirPath, File: name, Err: err} + } + return ret, nil +} + +func statIo(dirPath string, stats *cgroups.Stats) error { + const file = "io.stat" + values, err := readCgroup2MapFile(dirPath, file) + if err != nil { + return err + } + // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var parsedStats cgroups.BlkioStats + for k, v := range values { + d := strings.Split(k, ":") + if len(d) != 2 { + continue + } + major, err := strconv.ParseUint(d[0], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + minor, err := strconv.ParseUint(d[1], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + + for _, item := range v { + d := strings.Split(item, "=") + if len(d) != 2 { + continue + } + op := d[0] + + // Map to the cgroupv1 naming and layout (in separate tables). + var targetTable *[]cgroups.BlkioStatEntry + switch op { + // Equivalent to cgroupv1's blkio.io_service_bytes. + case "rbytes": + op = "Read" + targetTable = &parsedStats.IoServiceBytesRecursive + case "wbytes": + op = "Write" + targetTable = &parsedStats.IoServiceBytesRecursive + // Equivalent to cgroupv1's blkio.io_serviced. + case "rios": + op = "Read" + targetTable = &parsedStats.IoServicedRecursive + case "wios": + op = "Write" + targetTable = &parsedStats.IoServicedRecursive + default: + // Skip over entries we cannot map to cgroupv1 stats for now. + // In the future we should expand the stats struct to include + // them. + logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item) + continue + } + + value, err := strconv.ParseUint(d[1], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + *targetTable = append(*targetTable, entry) + } + } + stats.BlkioStats = parsedStats + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go new file mode 100644 index 000000000..adbc4b230 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go @@ -0,0 +1,216 @@ +package fs2 + +import ( + "bufio" + "errors" + "math" + "os" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +// numToStr converts an int64 value to a string for writing to a +// cgroupv2 files with .min, .max, .low, or .high suffix. +// The value of -1 is converted to "max" for cgroupv1 compatibility +// (which used to write -1 to remove the limit). +func numToStr(value int64) (ret string) { + switch { + case value == 0: + ret = "" + case value == -1: + ret = "max" + default: + ret = strconv.FormatInt(value, 10) + } + + return ret +} + +func isMemorySet(r *configs.Resources) bool { + return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0 +} + +func setMemory(dirPath string, r *configs.Resources) error { + if !isMemorySet(r) { + return nil + } + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return err + } + swapStr := numToStr(swap) + if swapStr == "" && swap == 0 && r.MemorySwap > 0 { + // memory and memorySwap set to the same value -- disable swap + swapStr = "0" + } + // never write empty string to `memory.swap.max`, it means set to 0. + if swapStr != "" { + if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil { + return err + } + } + + if val := numToStr(r.Memory); val != "" { + if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil { + return err + } + } + + // cgroup.Resources.KernelMemory is ignored + + if val := numToStr(r.MemoryReservation); val != "" { + if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil { + return err + } + } + + return nil +} + +func statMemory(dirPath string, stats *cgroups.Stats) error { + const file = "memory.stat" + statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) + if err != nil { + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + stats.MemoryStats.Stats[t] = v + } + if err := sc.Err(); err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"] + // Unlike cgroup v1 which has memory.use_hierarchy binary knob, + // cgroup v2 is always hierarchical. + stats.MemoryStats.UseHierarchy = true + + memoryUsage, err := getMemoryDataV2(dirPath, "") + if err != nil { + if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { + // The root cgroup does not have memory.{current,max} + // so emulate those using data from /proc/meminfo. + return statsFromMeminfo(stats) + } + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryDataV2(dirPath, "swap") + if err != nil { + return err + } + // As cgroup v1 reports SwapUsage values as mem+swap combined, + // while in cgroup v2 swap values do not include memory, + // report combined mem+swap for v1 compatibility. + swapUsage.Usage += memoryUsage.Usage + if swapUsage.Limit != math.MaxUint64 { + swapUsage.Limit += memoryUsage.Limit + } + stats.MemoryStats.SwapUsage = swapUsage + + return nil +} + +func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = "memory." + name + } + usage := moduleName + ".current" + limit := moduleName + ".max" + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if name != "" && os.IsNotExist(err) { + // Ignore EEXIST as there's no swap accounting + // if kernel CONFIG_MEMCG_SWAP is not set or + // swapaccount=0 kernel boot parameter is given. + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, err + } + memoryData.Usage = value + + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.Limit = value + + return memoryData, nil +} + +func statsFromMeminfo(stats *cgroups.Stats) error { + const file = "/proc/meminfo" + f, err := os.Open(file) + if err != nil { + return err + } + defer f.Close() + + // Fields we are interested in. + var ( + swap_free uint64 + swap_total uint64 + main_total uint64 + main_free uint64 + ) + mem := map[string]*uint64{ + "SwapFree": &swap_free, + "SwapTotal": &swap_total, + "MemTotal": &main_total, + "MemFree": &main_free, + } + + found := 0 + sc := bufio.NewScanner(f) + for sc.Scan() { + parts := strings.SplitN(sc.Text(), ":", 3) + if len(parts) != 2 { + // Should not happen. + continue + } + k := parts[0] + p, ok := mem[k] + if !ok { + // Unknown field -- not interested. + continue + } + vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB")) + *p, err = strconv.ParseUint(vStr, 10, 64) + if err != nil { + return &parseError{File: file, Err: errors.New("bad value for " + k)} + } + + found++ + if found == len(mem) { + // Got everything we need -- skip the rest. + break + } + } + if err := sc.Err(); err != nil { + return &parseError{Path: "", File: file, Err: err} + } + + stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024 + stats.MemoryStats.SwapUsage.Limit = math.MaxUint64 + + stats.MemoryStats.Usage.Usage = (main_total - main_free) * 1024 + stats.MemoryStats.Usage.Limit = math.MaxUint64 + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go new file mode 100644 index 000000000..c8c4a3658 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go @@ -0,0 +1,72 @@ +package fs2 + +import ( + "errors" + "math" + "os" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isPidsSet(r *configs.Resources) bool { + return r.PidsLimit != 0 +} + +func setPids(dirPath string, r *configs.Resources) error { + if !isPidsSet(r) { + return nil + } + if val := numToStr(r.PidsLimit); val != "" { + if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { + return err + } + } + + return nil +} + +func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error { + // if the controller is not enabled, let's read PIDS from cgroups.procs + // (or threads if cgroup.threads is enabled) + contents, err := cgroups.ReadFile(dirPath, "cgroup.procs") + if errors.Is(err, unix.ENOTSUP) { + contents, err = cgroups.ReadFile(dirPath, "cgroup.threads") + } + if err != nil { + return err + } + pids := strings.Count(contents, "\n") + stats.PidsStats.Current = uint64(pids) + stats.PidsStats.Limit = 0 + return nil +} + +func statPids(dirPath string, stats *cgroups.Stats) error { + current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current") + if err != nil { + if os.IsNotExist(err) { + return statPidsFromCgroupProcs(dirPath, stats) + } + return err + } + + max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max") + if err != nil { + return err + } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. + if max == math.MaxUint64 { + max = 0 + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go new file mode 100644 index 000000000..d463d15ee --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go @@ -0,0 +1,121 @@ +package fscommon + +import ( + "bufio" + "errors" + "math" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +// parseRdmaKV parses raw string to RdmaEntry. +func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error { + var value uint32 + + parts := strings.SplitN(raw, "=", 3) + + if len(parts) != 2 { + return errors.New("Unable to parse RDMA entry") + } + + k, v := parts[0], parts[1] + + if v == "max" { + value = math.MaxUint32 + } else { + val64, err := strconv.ParseUint(v, 10, 32) + if err != nil { + return err + } + value = uint32(val64) + } + if k == "hca_handle" { + entry.HcaHandles = value + } else if k == "hca_object" { + entry.HcaObjects = value + } + + return nil +} + +// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file. +// example entry: mlx4_0 hca_handle=2 hca_object=2000 +func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) { + rdmaEntries := make([]cgroups.RdmaEntry, 0) + fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return nil, err + } + defer fd.Close() //nolint:errorlint + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + parts := strings.SplitN(scanner.Text(), " ", 4) + if len(parts) == 3 { + entry := new(cgroups.RdmaEntry) + entry.Device = parts[0] + err = parseRdmaKV(parts[1], entry) + if err != nil { + continue + } + err = parseRdmaKV(parts[2], entry) + if err != nil { + continue + } + + rdmaEntries = append(rdmaEntries, *entry) + } + } + return rdmaEntries, scanner.Err() +} + +// RdmaGetStats returns rdma stats such as totalLimit and current entries. +func RdmaGetStats(path string, stats *cgroups.Stats) error { + currentEntries, err := readRdmaEntries(path, "rdma.current") + if err != nil { + if errors.Is(err, os.ErrNotExist) { + err = nil + } + return err + } + maxEntries, err := readRdmaEntries(path, "rdma.max") + if err != nil { + return err + } + // If device got removed between reading two files, ignore returning stats. + if len(currentEntries) != len(maxEntries) { + return nil + } + + stats.RdmaStats = cgroups.RdmaStats{ + RdmaLimit: maxEntries, + RdmaCurrent: currentEntries, + } + + return nil +} + +func createCmdString(device string, limits configs.LinuxRdma) string { + cmdString := device + if limits.HcaHandles != nil { + cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) + } + if limits.HcaObjects != nil { + cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) + } + return cmdString +} + +// RdmaSet sets RDMA resources. +func RdmaSet(path string, r *configs.Resources) error { + for device, limits := range r.Rdma { + if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go new file mode 100644 index 000000000..f4a51c9e5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go @@ -0,0 +1,145 @@ +package fscommon + +import ( + "errors" + "fmt" + "math" + "path" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +var ( + // Deprecated: use cgroups.OpenFile instead. + OpenFile = cgroups.OpenFile + // Deprecated: use cgroups.ReadFile instead. + ReadFile = cgroups.ReadFile + // Deprecated: use cgroups.WriteFile instead. + WriteFile = cgroups.WriteFile +) + +// ParseError records a parse error details, including the file path. +type ParseError struct { + Path string + File string + Err error +} + +func (e *ParseError) Error() string { + return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error() +} + +func (e *ParseError) Unwrap() error { return e.Err } + +// ParseUint converts a string to an uint64 integer. +// Negative values are returned at zero as, due to kernel bugs, +// some of the memory cgroup stats can be negative. +func ParseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// ParseKeyValue parses a space-separated "name value" kind of cgroup +// parameter and returns its key as a string, and its value as uint64 +// (ParseUint is used to convert the value). For example, +// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. +func ParseKeyValue(t string) (string, uint64, error) { + parts := strings.SplitN(t, " ", 3) + if len(parts) != 2 { + return "", 0, fmt.Errorf("line %q is not in key value format", t) + } + + value, err := ParseUint(parts[1], 10, 64) + if err != nil { + return "", 0, err + } + + return parts[0], value, nil +} + +// GetValueByKey reads a key-value pairs from the specified cgroup file, +// and returns a value of the specified key. ParseUint is used for value +// conversion. +func GetValueByKey(path, file, key string) (uint64, error) { + content, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, err + } + + lines := strings.Split(content, "\n") + for _, line := range lines { + arr := strings.Split(line, " ") + if len(arr) == 2 && arr[0] == key { + val, err := ParseUint(arr[1], 10, 64) + if err != nil { + err = &ParseError{Path: path, File: file, Err: err} + } + return val, err + } + } + + return 0, nil +} + +// GetCgroupParamUint reads a single uint64 value from the specified cgroup file. +// If the value read is "max", the math.MaxUint64 is returned. +func GetCgroupParamUint(path, file string) (uint64, error) { + contents, err := GetCgroupParamString(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxUint64, nil + } + + res, err := ParseUint(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamInt reads a single int64 value from specified cgroup file. +// If the value read is "max", the math.MaxInt64 is returned. +func GetCgroupParamInt(path, file string) (int64, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxInt64, nil + } + + res, err := strconv.ParseInt(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamString reads a string from the specified cgroup file. +func GetCgroupParamString(path, file string) (string, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return "", err + } + + return strings.TrimSpace(contents), nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go index fa195bf90..865344f99 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go @@ -2,8 +2,8 @@ package configs import "fmt" -// blockIODevice holds major:minor format supported in blkio cgroup -type blockIODevice struct { +// BlockIODevice holds major:minor format supported in blkio cgroup. +type BlockIODevice struct { // Major is the device's major number Major int64 `json:"major"` // Minor is the device's minor number @@ -12,7 +12,7 @@ type blockIODevice struct { // WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair type WeightDevice struct { - blockIODevice + BlockIODevice // Weight is the bandwidth rate for the device, range is from 10 to 1000 Weight uint16 `json:"weight"` // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only @@ -41,7 +41,7 @@ func (wd *WeightDevice) LeafWeightString() string { // ThrottleDevice struct holds a `major:minor rate_per_second` pair type ThrottleDevice struct { - blockIODevice + BlockIODevice // Rate is the IO rate limit per cgroup per device Rate uint64 `json:"rate"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index c1b4a0041..7cf2fb657 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -83,9 +83,6 @@ type Syscall struct { Args []*Arg `json:"args"` } -// TODO Windows. Many of these fields should be factored out into those parts -// which are common across platforms, and those which are platform specific. - // Config defines configuration options for executing a process inside a contained environment. type Config struct { // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go index 784c61820..b4c616d55 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go @@ -35,12 +35,6 @@ type Mount struct { // Extensions are additional flags that are specific to runc. Extensions int `json:"extensions"` - - // Optional Command to be run before Source is mounted. - PremountCmds []Command `json:"premount_cmds"` - - // Optional Command to be run after Source is mounted. - PostmountCmds []Command `json:"postmount_cmds"` } func (m *Mount) IsBind() bool { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index 6b9fc3435..dbd435341 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -132,19 +132,16 @@ func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { return fn(procfd) } -// SearchLabels searches a list of key-value pairs for the provided key and -// returns the corresponding value. The pairs must be separated with '='. -func SearchLabels(labels []string, query string) string { - for _, l := range labels { - parts := strings.SplitN(l, "=", 2) - if len(parts) < 2 { - continue - } - if parts[0] == query { - return parts[1] +// SearchLabels searches through a list of key=value pairs for a given key, +// returning its value, and the binary flag telling whether the key exist. +func SearchLabels(labels []string, key string) (string, bool) { + key += "=" + for _, s := range labels { + if strings.HasPrefix(s, key) { + return s[len(key):], true } } - return "" + return "", false } // Annotations returns the bundle path and user defined annotations from the |