diff options
Diffstat (limited to 'vendor/github.com/opencontainers/runc/libcontainer')
9 files changed, 351 insertions, 6 deletions
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go index 7fff0627f..debfc1e48 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go @@ -6,6 +6,8 @@ import ( "fmt" "io/ioutil" "os" + + "github.com/opencontainers/runc/libcontainer/utils" ) // IsEnabled returns true if apparmor is enabled for the host. @@ -19,7 +21,7 @@ func IsEnabled() bool { return false } -func setprocattr(attr, value string) error { +func setProcAttr(attr, value string) error { // Under AppArmor you can only change your own attr, so use /proc/self/ // instead of /proc/<tid>/ like libapparmor does path := fmt.Sprintf("/proc/self/attr/%s", attr) @@ -30,6 +32,10 @@ func setprocattr(attr, value string) error { } defer f.Close() + if err := utils.EnsureProcHandle(f); err != nil { + return err + } + _, err = fmt.Fprintf(f, "%s", value) return err } @@ -37,7 +43,7 @@ func setprocattr(attr, value string) error { // changeOnExec reimplements aa_change_onexec from libapparmor in Go func changeOnExec(name string) error { value := "exec " + name - if err := setprocattr("exec", value); err != nil { + if err := setProcAttr("exec", value); err != nil { return fmt.Errorf("apparmor failed to apply profile: %s", err) } return nil diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index ec79ae767..60790f83b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -11,6 +11,8 @@ import ( "path/filepath" "strconv" "strings" + "sync" + "syscall" "time" units "github.com/docker/go-units" @@ -22,6 +24,11 @@ const ( CgroupProcesses = "cgroup.procs" ) +var ( + isUnifiedOnce sync.Once + isUnified bool +) + // HugePageSizeUnitList is a list of the units used by the linux kernel when // naming the HugePage control files. // https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt @@ -29,6 +36,18 @@ const ( // depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393 var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"} +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + isUnifiedOnce.Do(func() { + var st syscall.Statfs_t + if err := syscall.Statfs("/sys/fs/cgroup", &st); err != nil { + panic("cannot statfs cgroup root") + } + isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isUnified +} + // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) @@ -49,6 +68,10 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, } defer f.Close() + if IsCgroup2UnifiedMode() { + subsystem = "" + } + return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem) } @@ -57,12 +80,12 @@ func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsyst for scanner.Scan() { txt := scanner.Text() fields := strings.Fields(txt) - if len(fields) < 5 { + if len(fields) < 9 { continue } if strings.HasPrefix(fields[4], cgroupPath) { for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if opt == subsystem { + if (subsystem == "" && fields[9] == "cgroup2") || opt == subsystem { return fields[4], fields[3], nil } } @@ -76,6 +99,19 @@ func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsyst } func isSubsystemAvailable(subsystem string) bool { + if IsCgroup2UnifiedMode() { + controllers, err := GetAllSubsystems() + if err != nil { + return false + } + for _, c := range controllers { + if c == subsystem { + return true + } + } + return false + } + cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return false @@ -120,7 +156,7 @@ func FindCgroupMountpointDir() (string, error) { return "", fmt.Errorf("Found no fields post '-' in %q", text) } - if postSeparatorFields[0] == "cgroup" { + if postSeparatorFields[0] == "cgroup" || postSeparatorFields[0] == "cgroup2" { // Check that the mount is properly formatted. if numPostFields < 3 { return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) @@ -193,6 +229,19 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, // GetCgroupMounts returns the mounts for the cgroup subsystems. // all indicates whether to return just the first instance or all the mounts. func GetCgroupMounts(all bool) ([]Mount, error) { + if IsCgroup2UnifiedMode() { + availableControllers, err := GetAllSubsystems() + if err != nil { + return nil, err + } + m := Mount{ + Mountpoint: "/sys/fs/cgroup", + Root: "/sys/fs/cgroup", + Subsystems: availableControllers, + } + return []Mount{m}, nil + } + f, err := os.Open("/proc/self/mountinfo") if err != nil { return nil, err @@ -356,6 +405,9 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) { } func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { + if IsCgroup2UnifiedMode() { + return "/", nil + } if p, ok := cgroups[subsystem]; ok { return p, nil diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go index e0f3ca165..fa195bf90 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go @@ -59,3 +59,8 @@ func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { func (td *ThrottleDevice) String() string { return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) } + +// StringName formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) StringName(name string) string { + return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go index e15a662f5..58ed19c9e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go @@ -119,4 +119,12 @@ type Resources struct { // Set class identifier for container's network packets NetClsClassid uint32 `json:"net_cls_classid_u"` + + // Used on cgroups v2: + + // CpuWeight sets a proportional bandwidth limit. + CpuWeight uint64 `json:"cpu_weight"` + + // CpuMax sets she maximum bandwidth limit (format: max period). + CpuMax string `json:"cpu_max"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 7728522fe..24989e9f5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -44,6 +44,7 @@ const ( Trap Allow Trace + Log ) // Operator is a comparison operator to be used when matching syscall arguments in Seccomp diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go index 11c3faafb..e05e30adc 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go @@ -1,5 +1,5 @@ // +build linux -// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le s390x +// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le riscv64 s390x package system diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go new file mode 100644 index 000000000..c8a9364d5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go @@ -0,0 +1,93 @@ +// +build linux + +package utils + +/* + * Copyright 2016, 2017 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import ( + "fmt" + "os" + + "golang.org/x/sys/unix" +) + +// MaxSendfdLen is the maximum length of the name of a file descriptor being +// sent using SendFd. The name of the file handle returned by RecvFd will never +// be larger than this value. +const MaxNameLen = 4096 + +// oobSpace is the size of the oob slice required to store a single FD. Note +// that unix.UnixRights appears to make the assumption that fd is always int32, +// so sizeof(fd) = 4. +var oobSpace = unix.CmsgSpace(4) + +// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// socket. The file name of the remote file descriptor will be recreated +// locally (it is sent as non-auxiliary data in the same payload). +func RecvFd(socket *os.File) (*os.File, error) { + // For some reason, unix.Recvmsg uses the length rather than the capacity + // when passing the msg_controllen and other attributes to recvmsg. So we + // have to actually set the length. + name := make([]byte, MaxNameLen) + oob := make([]byte, oobSpace) + + sockfd := socket.Fd() + n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0) + if err != nil { + return nil, err + } + + if n >= MaxNameLen || oobn != oobSpace { + return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) + } + + // Truncate. + name = name[:n] + oob = oob[:oobn] + + scms, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return nil, err + } + if len(scms) != 1 { + return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) + } + scm := scms[0] + + fds, err := unix.ParseUnixRights(&scm) + if err != nil { + return nil, err + } + if len(fds) != 1 { + return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) + } + fd := uintptr(fds[0]) + + return os.NewFile(fd, string(name)), nil +} + +// SendFd sends a file descriptor over the given AF_UNIX socket. In +// addition, the file.Name() of the given file will also be sent as +// non-auxiliary data in the same payload (allowing to send contextual +// information for a file descriptor). +func SendFd(socket *os.File, name string, fd uintptr) error { + if len(name) >= MaxNameLen { + return fmt.Errorf("sendfd: filename too long: %s", name) + } + oob := unix.UnixRights(int(fd)) + return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go new file mode 100644 index 000000000..40ccfaa1a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -0,0 +1,112 @@ +package utils + +import ( + "encoding/json" + "io" + "os" + "path/filepath" + "strings" + "unsafe" + + "golang.org/x/sys/unix" +) + +const ( + exitSignalOffset = 128 +) + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +// ExitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly +func ExitStatus(status unix.WaitStatus) int { + if status.Signaled() { + return exitSignalOffset + int(status.Signal()) + } + return status.ExitStatus() +} + +// WriteJSON writes the provided struct v to w using standard json marshaling +func WriteJSON(w io.Writer, v interface{}) error { + data, err := json.Marshal(v) + if err != nil { + return err + } + _, err = w.Write(data) + return err +} + +// CleanPath makes a path safe for use with filepath.Join. This is done by not +// only cleaning the path, but also (if the path is relative) adding a leading +// '/' and cleaning it (then removing the leading '/'). This ensures that a +// path resulting from prepending another path will always resolve to lexically +// be a subdirectory of the prefixed path. This is all done lexically, so paths +// that include symlinks won't be safe as a result of using CleanPath. +func CleanPath(path string) string { + // Deal with empty strings nicely. + if path == "" { + return "" + } + + // Ensure that all paths are cleaned (especially problematic ones like + // "/../../../../../" which can cause lots of issues). + path = filepath.Clean(path) + + // If the path isn't absolute, we need to do more processing to fix paths + // such as "../../../../<etc>/some/path". We also shouldn't convert absolute + // paths to relative ones. + if !filepath.IsAbs(path) { + path = filepath.Clean(string(os.PathSeparator) + path) + // This can't fail, as (by definition) all paths are relative to root. + path, _ = filepath.Rel(string(os.PathSeparator), path) + } + + // Clean the path again for good measure. + return filepath.Clean(path) +} + +// SearchLabels searches a list of key-value pairs for the provided key and +// returns the corresponding value. The pairs must be separated with '='. +func SearchLabels(labels []string, query string) string { + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == query { + return parts[1] + } + } + return "" +} + +// Annotations returns the bundle path and user defined annotations from the +// libcontainer state. We need to remove the bundle because that is a label +// added by libcontainer. +func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { + userAnnotations = make(map[string]string) + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == "bundle" { + bundle = parts[1] + } else { + userAnnotations[parts[0]] = parts[1] + } + } + return +} + +func GetIntSize() int { + return int(unsafe.Sizeof(1)) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go new file mode 100644 index 000000000..1576f2d4a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -0,0 +1,68 @@ +// +build !windows + +package utils + +import ( + "fmt" + "os" + "strconv" + + "golang.org/x/sys/unix" +) + +// EnsureProcHandle returns whether or not the given file handle is on procfs. +func EnsureProcHandle(fh *os.File) error { + var buf unix.Statfs_t + if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { + return fmt.Errorf("ensure %s is on procfs: %v", fh.Name(), err) + } + if buf.Type != unix.PROC_SUPER_MAGIC { + return fmt.Errorf("%s is not on procfs", fh.Name()) + } + return nil +} + +// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for +// the process (except for those below the given fd value). +func CloseExecFrom(minFd int) error { + fdDir, err := os.Open("/proc/self/fd") + if err != nil { + return err + } + defer fdDir.Close() + + if err := EnsureProcHandle(fdDir); err != nil { + return err + } + + fdList, err := fdDir.Readdirnames(-1) + if err != nil { + return err + } + for _, fdStr := range fdList { + fd, err := strconv.Atoi(fdStr) + // Ignore non-numeric file names. + if err != nil { + continue + } + // Ignore descriptors lower than our specified minimum. + if fd < minFd { + continue + } + // Intentionally ignore errors from unix.CloseOnExec -- the cases where + // this might fail are basically file descriptors that have already + // been closed (including and especially the one that was created when + // ioutil.ReadDir did the "opendir" syscall). + unix.CloseOnExec(fd) + } + return nil +} + +// NewSockPair returns a new unix socket pair +func NewSockPair(name string) (parent *os.File, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil +} |