diff options
Diffstat (limited to 'vendor/github.com/opencontainers/runc/libcontainer')
9 files changed, 142 insertions, 97 deletions
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/README.md b/vendor/github.com/opencontainers/runc/libcontainer/README.md index 42f3efe56..1d7fa04c0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/README.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/README.md @@ -148,6 +148,7 @@ config := &configs.Config{ {Type: configs.NEWPID}, {Type: configs.NEWUSER}, {Type: configs.NEWNET}, + {Type: configs.NEWCGROUP}, }), Cgroups: &configs.Cgroup{ Name: "test-container", @@ -323,6 +324,7 @@ generated when building libcontainer with docker. ## Copyright and license -Code and documentation copyright 2014 Docker, inc. Code released under the Apache 2.0 license. -Docs released under Creative commons. - +Code and documentation copyright 2014 Docker, inc. +The code and documentation are released under the [Apache 2.0 license](../LICENSE). +The documentation is also released under Creative Commons Attribution 4.0 International License. +You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index 7c995efee..ea571ad93 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -13,40 +13,50 @@ import ( "strings" "time" - "github.com/docker/go-units" + units "github.com/docker/go-units" ) const ( - cgroupNamePrefix = "name=" + CgroupNamePrefix = "name=" CgroupProcesses = "cgroup.procs" ) // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt -func FindCgroupMountpoint(subsystem string) (string, error) { - mnt, _, err := FindCgroupMountpointAndRoot(subsystem) +func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { + mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) return mnt, err } -func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { +func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { // We are not using mount.GetMounts() because it's super-inefficient, // parsing it directly sped up x10 times because of not using Sscanf. // It was one of two major performance drawbacks in container start. if !isSubsystemAvailable(subsystem) { return "", "", NewNotFoundError(subsystem) } + f, err := os.Open("/proc/self/mountinfo") if err != nil { return "", "", err } defer f.Close() - scanner := bufio.NewScanner(f) + return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem) +} + +func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) { + scanner := bufio.NewScanner(reader) for scanner.Scan() { txt := scanner.Text() - fields := strings.Split(txt, " ") - for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if opt == subsystem { - return fields[4], fields[3], nil + fields := strings.Fields(txt) + if len(fields) < 5 { + continue + } + if strings.HasPrefix(fields[4], cgroupPath) { + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if opt == subsystem { + return fields[4], fields[3], nil + } } } } @@ -103,7 +113,7 @@ func FindCgroupMountpointDir() (string, error) { } if postSeparatorFields[0] == "cgroup" { - // Check that the mount is properly formated. + // Check that the mount is properly formatted. if numPostFields < 3 { return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) } @@ -151,19 +161,20 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, Root: fields[3], } for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if !ss[opt] { + seen, known := ss[opt] + if !known || (!all && seen) { continue } - if strings.HasPrefix(opt, cgroupNamePrefix) { - m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) - } else { - m.Subsystems = append(m.Subsystems, opt) - } - if !all { - numFound++ + ss[opt] = true + if strings.HasPrefix(opt, CgroupNamePrefix) { + opt = opt[len(CgroupNamePrefix):] } + m.Subsystems = append(m.Subsystems, opt) + numFound++ + } + if len(m.Subsystems) > 0 || all { + res = append(res, m) } - res = append(res, m) } if err := scanner.Err(); err != nil { return nil, err @@ -187,7 +198,7 @@ func GetCgroupMounts(all bool) ([]Mount, error) { allMap := make(map[string]bool) for s := range allSubsystems { - allMap[s] = true + allMap[s] = false } return getCgroupMountsHelper(allMap, f, all) } @@ -256,13 +267,13 @@ func GetInitCgroupPath(subsystem string) (string, error) { } func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := FindCgroupMountpointAndRoot(subsystem) + mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) if err != nil { return "", err } // This is needed for nested containers, because in /proc/self/cgroup we - // see pathes from host, which don't exist in container. + // see paths from host, which don't exist in container. relCgroup, err := filepath.Rel(root, cgroup) if err != nil { return "", err @@ -342,7 +353,7 @@ func getControllerPath(subsystem string, cgroups map[string]string) (string, err return p, nil } - if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok { + if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { return p, nil } @@ -452,7 +463,7 @@ func WriteCgroupProc(dir string, pid int) error { return fmt.Errorf("no such directory for %s", CgroupProcesses) } - // Dont attach any pid to the cgroup if -1 is specified as a pid + // Don't attach any pid to the cgroup if -1 is specified as a pid if pid != -1 { if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil { return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index b1c4762fe..7728522fe 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -186,12 +186,19 @@ type Config struct { // callers keyring in this case. NoNewKeyring bool `json:"no_new_keyring"` - // Rootless specifies whether the container is a rootless container. - Rootless bool `json:"rootless"` - - // IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into - // to limit the resources (e.g., L3 cache) the container has available + // IntelRdt specifies settings for Intel RDT group that the container is placed into + // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` + + // RootlessEUID is set when the runc was launched with non-zero EUID. + // Note that RootlessEUID is set to false when launched with EUID=0 in userns. + // When RootlessEUID is set, runc creates a new userns for the container. + // (config.json needs to contain userns settings) + RootlessEUID bool `json:"rootless_euid,omitempty"` + + // RootlessCgroups is set when unlikely to have the full access to cgroups. + // When RootlessCgroups is set, cgroups errors are ignored. + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` } type Hooks struct { @@ -265,26 +272,23 @@ func (hooks Hooks) MarshalJSON() ([]byte, error) { }) } -// HookState is the payload provided to a hook on execution. -type HookState specs.State - type Hook interface { // Run executes the hook with the provided state. - Run(HookState) error + Run(*specs.State) error } // NewFunctionHook will call the provided function when the hook is run. -func NewFunctionHook(f func(HookState) error) FuncHook { +func NewFunctionHook(f func(*specs.State) error) FuncHook { return FuncHook{ run: f, } } type FuncHook struct { - run func(HookState) error + run func(*specs.State) error } -func (f FuncHook) Run(s HookState) error { +func (f FuncHook) Run(s *specs.State) error { return f.run(s) } @@ -307,7 +311,7 @@ type CommandHook struct { Command } -func (c Command) Run(s HookState) error { +func (c Command) Run(s *specs.State) error { b, err := json.Marshal(s) if err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go index 36bd5f96a..57e9f037d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go @@ -4,4 +4,10 @@ type IntelRdt struct { // The schema for L3 cache id and capacity bitmask (CBM) // Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The schema of memory bandwidth per L3 cache id + // Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..." + // The unit of memory bandwidth is specified in "percentages" by + // default, and in "MBps" if MBA Software Controller is enabled. + MemBwSchema string `json:"memBwSchema,omitempty"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go index 5fc171a57..1bbaef9bd 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go @@ -7,12 +7,13 @@ import ( ) const ( - NEWNET NamespaceType = "NEWNET" - NEWPID NamespaceType = "NEWPID" - NEWNS NamespaceType = "NEWNS" - NEWUTS NamespaceType = "NEWUTS" - NEWIPC NamespaceType = "NEWIPC" - NEWUSER NamespaceType = "NEWUSER" + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" + NEWCGROUP NamespaceType = "NEWCGROUP" ) var ( @@ -35,6 +36,8 @@ func NsName(ns NamespaceType) string { return "user" case NEWUTS: return "uts" + case NEWCGROUP: + return "cgroup" } return "" } @@ -68,6 +71,7 @@ func NamespaceTypes() []NamespaceType { NEWNET, NEWPID, NEWNS, + NEWCGROUP, } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go index 4ce6813d2..2dc7adfc9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go @@ -9,12 +9,13 @@ func (n *Namespace) Syscall() int { } var namespaceInfo = map[NamespaceType]int{ - NEWNET: unix.CLONE_NEWNET, - NEWNS: unix.CLONE_NEWNS, - NEWUSER: unix.CLONE_NEWUSER, - NEWIPC: unix.CLONE_NEWIPC, - NEWUTS: unix.CLONE_NEWUTS, - NEWPID: unix.CLONE_NEWPID, + NEWNET: unix.CLONE_NEWNET, + NEWNS: unix.CLONE_NEWNS, + NEWUSER: unix.CLONE_NEWUSER, + NEWIPC: unix.CLONE_NEWIPC, + NEWUTS: unix.CLONE_NEWUTS, + NEWPID: unix.CLONE_NEWPID, + NEWCGROUP: unix.CLONE_NEWCGROUP, } // CloneFlags parses the container's Namespaces options to set the correct diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md index 575701371..9ec6c3931 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md @@ -10,8 +10,8 @@ The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd package. In cgo, if the import of "C" is immediately preceded by a comment, that comment, called the preamble, is used as a header when compiling the C parts of the package. So every time we import package `nsenter`, the C code function `nsexec()` would be -called. And package `nsenter` is now only imported in `main_unix.go`, so every time -before we call `cmd.Start` on linux, that C code would run. +called. And package `nsenter` is only imported in `init.go`, so every time the runc +`init` command is invoked, that C code is run. Because `nsexec()` must be run before the Go runtime in order to use the Linux kernel namespace, you must `import` this library into a package if @@ -37,7 +37,7 @@ the parent `nsexec()` will exit and the child `nsexec()` process will return to allow the Go runtime take over. NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any -CLONE_NEW* clone flags because we must fork a new process in order to +`CLONE_NEW*` clone flags because we must fork a new process in order to enter the PID namespace. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c index a4cd1399d..28269dfc0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -42,6 +42,12 @@ enum sync_t { SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ }; +/* + * Synchronisation value for cgroup namespace setup. + * The same constant is defined in process_linux.go as "createCgroupns". + */ +#define CREATECGROUPNS 0x80 + /* longjmp() arguments. */ #define JUMP_PARENT 0x00 #define JUMP_CHILD 0xA0 @@ -82,7 +88,7 @@ struct nlconfig_t { uint8_t is_setgroup; /* Rootless container settings. */ - uint8_t is_rootless; + uint8_t is_rootless_euid; /* boolean */ char *uidmappath; size_t uidmappath_len; char *gidmappath; @@ -100,7 +106,7 @@ struct nlconfig_t { #define GIDMAP_ATTR 27284 #define SETGROUP_ATTR 27285 #define OOM_SCORE_ADJ_ATTR 27286 -#define ROOTLESS_ATTR 27287 +#define ROOTLESS_EUID_ATTR 27287 #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 @@ -211,7 +217,7 @@ static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) /* * If @app is NULL, execve will segfault. Just check it here and bail (if - * we're in this path, the caller is already getting desparate and there + * we're in this path, the caller is already getting desperate and there * isn't a backup to this failing). This usually would be a configuration * or programming issue. */ @@ -419,8 +425,8 @@ static void nl_parse(int fd, struct nlconfig_t *config) case CLONE_FLAGS_ATTR: config->cloneflags = readint32(current); break; - case ROOTLESS_ATTR: - config->is_rootless = readint8(current); + case ROOTLESS_EUID_ATTR: + config->is_rootless_euid = readint8(current); /* boolean */ break; case OOM_SCORE_ADJ_ATTR: config->oom_score_adj = current; @@ -640,7 +646,6 @@ void nsexec(void) case JUMP_PARENT:{ int len; pid_t child, first_child = -1; - char buf[JSON_MAX]; bool ready = false; /* For debugging. */ @@ -687,7 +692,7 @@ void nsexec(void) * newuidmap/newgidmap shall be used. */ - if (config.is_rootless && !config.is_setgroup) + if (config.is_rootless_euid && !config.is_setgroup) update_setgroups(child, SETGROUPS_DENY); /* Set up mappings. */ @@ -716,6 +721,18 @@ void nsexec(void) kill(child, SIGKILL); bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); } + + /* Send the init_func pid back to our parent. + * + * Send the init_func pid and the pid of the first child back to our parent. + * We need to send both back because we can't reap the first child we created (CLONE_PARENT). + * It becomes the responsibility of our parent to reap the first child. + */ + len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); + if (len < 0) { + kill(child, SIGKILL); + bail("unable to generate JSON for child pid"); + } } break; case SYNC_CHILD_READY: @@ -759,23 +776,6 @@ void nsexec(void) bail("unexpected sync value: %u", s); } } - - /* - * Send the init_func pid and the pid of the first child back to our parent. - * - * We need to send both back because we can't reap the first child we created (CLONE_PARENT). - * It becomes the responsibility of our parent to reap the first child. - */ - len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); - if (len < 0) { - kill(child, SIGKILL); - bail("unable to generate JSON for child pid"); - } - if (write(pipenum, buf, len) != len) { - kill(child, SIGKILL); - bail("unable to send child pid to bootstrapper"); - } - exit(0); } @@ -862,14 +862,17 @@ void nsexec(void) if (setresuid(0, 0, 0) < 0) bail("failed to become root in user namespace"); } - /* - * Unshare all of the namespaces. Note that we don't merge this - * with clone() because there were some old kernel versions where - * clone(CLONE_PARENT | CLONE_NEWPID) was broken, so we'll just do - * it the long way. + * Unshare all of the namespaces. Now, it should be noted that this + * ordering might break in the future (especially with rootless + * containers). But for now, it's not possible to split this into + * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. + * + * Note that we don't merge this with clone() because there were + * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) + * was broken, so we'll just do it the long way anyway. */ - if (unshare(config.cloneflags) < 0) + if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) bail("failed to unshare namespaces"); /* @@ -953,11 +956,23 @@ void nsexec(void) if (setgid(0) < 0) bail("setgid failed"); - if (!config.is_rootless && config.is_setgroup) { + if (!config.is_rootless_euid && config.is_setgroup) { if (setgroups(0, NULL) < 0) bail("setgroups failed"); } + /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ + if (config.cloneflags & CLONE_NEWCGROUP) { + uint8_t value; + if (read(pipenum, &value, sizeof(value)) != sizeof(value)) + bail("read synchronisation value failed"); + if (value == CREATECGROUPNS) { + if (unshare(CLONE_NEWCGROUP) < 0) + bail("failed to unshare cgroup namespace"); + } else + bail("received unknown synchronisation value"); + } + s = SYNC_CHILD_READY; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with patent: write(SYNC_CHILD_READY)"); diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go index c1e634c94..92b5ae8de 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go @@ -5,6 +5,7 @@ package user import ( "io" "os" + "strconv" "golang.org/x/sys/unix" ) @@ -115,22 +116,23 @@ func CurrentGroup() (Group, error) { return LookupGid(unix.Getgid()) } -func CurrentUserSubUIDs() ([]SubID, error) { +func currentUserSubIDs(fileName string) ([]SubID, error) { u, err := CurrentUser() if err != nil { return nil, err } - return ParseSubIDFileFilter("/etc/subuid", - func(entry SubID) bool { return entry.Name == u.Name }) + filter := func(entry SubID) bool { + return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid) + } + return ParseSubIDFileFilter(fileName, filter) } -func CurrentGroupSubGIDs() ([]SubID, error) { - g, err := CurrentGroup() - if err != nil { - return nil, err - } - return ParseSubIDFileFilter("/etc/subgid", - func(entry SubID) bool { return entry.Name == g.Name }) +func CurrentUserSubUIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subuid") +} + +func CurrentUserSubGIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subgid") } func CurrentProcessUIDMap() ([]IDMap, error) { |