diff options
Diffstat (limited to 'vendor/github.com/opencontainers/runc/libcontainer')
8 files changed, 2 insertions, 1443 deletions
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/README.md b/vendor/github.com/opencontainers/runc/libcontainer/README.md deleted file mode 100644 index 1d7fa04c0..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/README.md +++ /dev/null @@ -1,330 +0,0 @@ -# libcontainer - -[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer) - -Libcontainer provides a native Go implementation for creating containers -with namespaces, cgroups, capabilities, and filesystem access controls. -It allows you to manage the lifecycle of the container performing additional operations -after the container is created. - - -#### Container -A container is a self contained execution environment that shares the kernel of the -host system and which is (optionally) isolated from other containers in the system. - -#### Using libcontainer - -Because containers are spawned in a two step process you will need a binary that -will be executed as the init process for the container. In libcontainer, we use -the current binary (/proc/self/exe) to be executed as the init process, and use -arg "init", we call the first step process "bootstrap", so you always need a "init" -function as the entry of "bootstrap". - -In addition to the go init function the early stage bootstrap is handled by importing -[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md). - -```go -import ( - _ "github.com/opencontainers/runc/libcontainer/nsenter" -) - -func init() { - if len(os.Args) > 1 && os.Args[1] == "init" { - runtime.GOMAXPROCS(1) - runtime.LockOSThread() - factory, _ := libcontainer.New("") - if err := factory.StartInitialization(); err != nil { - logrus.Fatal(err) - } - panic("--this line should have never been executed, congratulations--") - } -} -``` - -Then to create a container you first have to initialize an instance of a factory -that will handle the creation and initialization for a container. - -```go -factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init")) -if err != nil { - logrus.Fatal(err) - return -} -``` - -Once you have an instance of the factory created we can create a configuration -struct describing how the container is to be created. A sample would look similar to this: - -```go -defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV -config := &configs.Config{ - Rootfs: "/your/path/to/rootfs", - Capabilities: &configs.Capabilities{ - Bounding: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Effective: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Inheritable: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Permitted: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Ambient: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - }, - Namespaces: configs.Namespaces([]configs.Namespace{ - {Type: configs.NEWNS}, - {Type: configs.NEWUTS}, - {Type: configs.NEWIPC}, - {Type: configs.NEWPID}, - {Type: configs.NEWUSER}, - {Type: configs.NEWNET}, - {Type: configs.NEWCGROUP}, - }), - Cgroups: &configs.Cgroup{ - Name: "test-container", - Parent: "system", - Resources: &configs.Resources{ - MemorySwappiness: nil, - AllowAllDevices: nil, - AllowedDevices: configs.DefaultAllowedDevices, - }, - }, - MaskPaths: []string{ - "/proc/kcore", - "/sys/firmware", - }, - ReadonlyPaths: []string{ - "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", - }, - Devices: configs.DefaultAutoCreatedDevices, - Hostname: "testing", - Mounts: []*configs.Mount{ - { - Source: "proc", - Destination: "/proc", - Device: "proc", - Flags: defaultMountFlags, - }, - { - Source: "tmpfs", - Destination: "/dev", - Device: "tmpfs", - Flags: unix.MS_NOSUID | unix.MS_STRICTATIME, - Data: "mode=755", - }, - { - Source: "devpts", - Destination: "/dev/pts", - Device: "devpts", - Flags: unix.MS_NOSUID | unix.MS_NOEXEC, - Data: "newinstance,ptmxmode=0666,mode=0620,gid=5", - }, - { - Device: "tmpfs", - Source: "shm", - Destination: "/dev/shm", - Data: "mode=1777,size=65536k", - Flags: defaultMountFlags, - }, - { - Source: "mqueue", - Destination: "/dev/mqueue", - Device: "mqueue", - Flags: defaultMountFlags, - }, - { - Source: "sysfs", - Destination: "/sys", - Device: "sysfs", - Flags: defaultMountFlags | unix.MS_RDONLY, - }, - }, - UidMappings: []configs.IDMap{ - { - ContainerID: 0, - HostID: 1000, - Size: 65536, - }, - }, - GidMappings: []configs.IDMap{ - { - ContainerID: 0, - HostID: 1000, - Size: 65536, - }, - }, - Networks: []*configs.Network{ - { - Type: "loopback", - Address: "127.0.0.1/0", - Gateway: "localhost", - }, - }, - Rlimits: []configs.Rlimit{ - { - Type: unix.RLIMIT_NOFILE, - Hard: uint64(1025), - Soft: uint64(1025), - }, - }, -} -``` - -Once you have the configuration populated you can create a container: - -```go -container, err := factory.Create("container-id", config) -if err != nil { - logrus.Fatal(err) - return -} -``` - -To spawn bash as the initial process inside the container and have the -processes pid returned in order to wait, signal, or kill the process: - -```go -process := &libcontainer.Process{ - Args: []string{"/bin/bash"}, - Env: []string{"PATH=/bin"}, - User: "daemon", - Stdin: os.Stdin, - Stdout: os.Stdout, - Stderr: os.Stderr, -} - -err := container.Run(process) -if err != nil { - container.Destroy() - logrus.Fatal(err) - return -} - -// wait for the process to finish. -_, err := process.Wait() -if err != nil { - logrus.Fatal(err) -} - -// destroy the container. -container.Destroy() -``` - -Additional ways to interact with a running container are: - -```go -// return all the pids for all processes running inside the container. -processes, err := container.Processes() - -// get detailed cpu, memory, io, and network statistics for the container and -// it's processes. -stats, err := container.Stats() - -// pause all processes inside the container. -container.Pause() - -// resume all paused processes. -container.Resume() - -// send signal to container's init process. -container.Signal(signal) - -// update container resource constraints. -container.Set(config) - -// get current status of the container. -status, err := container.Status() - -// get current container's state information. -state, err := container.State() -``` - - -#### Checkpoint & Restore - -libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers. -This let's you save the state of a process running inside a container to disk, and then restore -that state into a new process, on the same machine or on another machine. - -`criu` version 1.5.2 or higher is required to use checkpoint and restore. -If you don't already have `criu` installed, you can build it from source, following the -[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image -generated when building libcontainer with docker. - - -## Copyright and license - -Code and documentation copyright 2014 Docker, inc. -The code and documentation are released under the [Apache 2.0 license](../LICENSE). -The documentation is also released under Creative Commons Attribution 4.0 International License. -You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md deleted file mode 100644 index 9ec6c3931..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md +++ /dev/null @@ -1,44 +0,0 @@ -## nsenter - -The `nsenter` package registers a special init constructor that is called before -the Go runtime has a chance to boot. This provides us the ability to `setns` on -existing namespaces and avoid the issues that the Go runtime has with multiple -threads. This constructor will be called if this package is registered, -imported, in your go application. - -The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd/cgo/) -package. In cgo, if the import of "C" is immediately preceded by a comment, that comment, -called the preamble, is used as a header when compiling the C parts of the package. -So every time we import package `nsenter`, the C code function `nsexec()` would be -called. And package `nsenter` is only imported in `init.go`, so every time the runc -`init` command is invoked, that C code is run. - -Because `nsexec()` must be run before the Go runtime in order to use the -Linux kernel namespace, you must `import` this library into a package if -you plan to use `libcontainer` directly. Otherwise Go will not execute -the `nsexec()` constructor, which means that the re-exec will not cause -the namespaces to be joined. You can import it like this: - -```go -import _ "github.com/opencontainers/runc/libcontainer/nsenter" -``` - -`nsexec()` will first get the file descriptor number for the init pipe -from the environment variable `_LIBCONTAINER_INITPIPE` (which was opened -by the parent and kept open across the fork-exec of the `nsexec()` init -process). The init pipe is used to read bootstrap data (namespace paths, -clone flags, uid and gid mappings, and the console path) from the parent -process. `nsexec()` will then call `setns(2)` to join the namespaces -provided in the bootstrap data (if available), `clone(2)` a child process -with the provided clone flags, update the user and group ID mappings, do -some further miscellaneous setup steps, and then send the PID of the -child process to the parent of the `nsexec()` "caller". Finally, -the parent `nsexec()` will exit and the child `nsexec()` process will -return to allow the Go runtime take over. - -NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any -`CLONE_NEW*` clone flags because we must fork a new process in order to -enter the PID namespace. - - - diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h deleted file mode 100644 index 9e9bdca05..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef NSENTER_NAMESPACE_H -#define NSENTER_NAMESPACE_H - -#ifndef _GNU_SOURCE -# define _GNU_SOURCE -#endif -#include <sched.h> - -/* All of these are taken from include/uapi/linux/sched.h */ -#ifndef CLONE_NEWNS -# define CLONE_NEWNS 0x00020000 /* New mount namespace group */ -#endif -#ifndef CLONE_NEWCGROUP -# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ -#endif -#ifndef CLONE_NEWUTS -# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ -#endif -#ifndef CLONE_NEWIPC -# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ -#endif -#ifndef CLONE_NEWUSER -# define CLONE_NEWUSER 0x10000000 /* New user namespace */ -#endif -#ifndef CLONE_NEWPID -# define CLONE_NEWPID 0x20000000 /* New pid namespace */ -#endif -#ifndef CLONE_NEWNET -# define CLONE_NEWNET 0x40000000 /* New network namespace */ -#endif - -#endif /* NSENTER_NAMESPACE_H */ diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go deleted file mode 100644 index 07f4d63e4..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go +++ /dev/null @@ -1,12 +0,0 @@ -// +build linux,!gccgo - -package nsenter - -/* -#cgo CFLAGS: -Wall -extern void nsexec(); -void __attribute__((constructor)) init(void) { - nsexec(); -} -*/ -import "C" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go deleted file mode 100644 index 63c7a3ec2..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go +++ /dev/null @@ -1,25 +0,0 @@ -// +build linux,gccgo - -package nsenter - -/* -#cgo CFLAGS: -Wall -extern void nsexec(); -void __attribute__((constructor)) init(void) { - nsexec(); -} -*/ -import "C" - -// AlwaysFalse is here to stay false -// (and be exported so the compiler doesn't optimize out its reference) -var AlwaysFalse bool - -func init() { - if AlwaysFalse { - // by referencing this C init() in a noop test, it will ensure the compiler - // links in the C function. - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134 - C.init() - } -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go deleted file mode 100644 index ac701ca39..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go +++ /dev/null @@ -1,5 +0,0 @@ -// +build !linux !cgo - -package nsenter - -import "C" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c deleted file mode 100644 index 28269dfc0..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c +++ /dev/null @@ -1,995 +0,0 @@ - -#define _GNU_SOURCE -#include <endian.h> -#include <errno.h> -#include <fcntl.h> -#include <grp.h> -#include <sched.h> -#include <setjmp.h> -#include <signal.h> -#include <stdarg.h> -#include <stdbool.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <stdbool.h> -#include <string.h> -#include <unistd.h> - -#include <sys/ioctl.h> -#include <sys/prctl.h> -#include <sys/socket.h> -#include <sys/types.h> -#include <sys/wait.h> - -#include <linux/limits.h> -#include <linux/netlink.h> -#include <linux/types.h> - -/* Get all of the CLONE_NEW* flags. */ -#include "namespace.h" - -/* Synchronisation values. */ -enum sync_t { - SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ - SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ - SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ - SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ - SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ - SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ - - /* XXX: This doesn't help with segfaults and other such issues. */ - SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ -}; - -/* - * Synchronisation value for cgroup namespace setup. - * The same constant is defined in process_linux.go as "createCgroupns". - */ -#define CREATECGROUPNS 0x80 - -/* longjmp() arguments. */ -#define JUMP_PARENT 0x00 -#define JUMP_CHILD 0xA0 -#define JUMP_INIT 0xA1 - -/* JSON buffer. */ -#define JSON_MAX 4096 - -/* Assume the stack grows down, so arguments should be above it. */ -struct clone_t { - /* - * Reserve some space for clone() to locate arguments - * and retcode in this place - */ - char stack[4096] __attribute__ ((aligned(16))); - char stack_ptr[0]; - - /* There's two children. This is used to execute the different code. */ - jmp_buf *env; - int jmpval; -}; - -struct nlconfig_t { - char *data; - - /* Process settings. */ - uint32_t cloneflags; - char *oom_score_adj; - size_t oom_score_adj_len; - - /* User namespace settings. */ - char *uidmap; - size_t uidmap_len; - char *gidmap; - size_t gidmap_len; - char *namespaces; - size_t namespaces_len; - uint8_t is_setgroup; - - /* Rootless container settings. */ - uint8_t is_rootless_euid; /* boolean */ - char *uidmappath; - size_t uidmappath_len; - char *gidmappath; - size_t gidmappath_len; -}; - -/* - * List of netlink message types sent to us as part of bootstrapping the init. - * These constants are defined in libcontainer/message_linux.go. - */ -#define INIT_MSG 62000 -#define CLONE_FLAGS_ATTR 27281 -#define NS_PATHS_ATTR 27282 -#define UIDMAP_ATTR 27283 -#define GIDMAP_ATTR 27284 -#define SETGROUP_ATTR 27285 -#define OOM_SCORE_ADJ_ATTR 27286 -#define ROOTLESS_EUID_ATTR 27287 -#define UIDMAPPATH_ATTR 27288 -#define GIDMAPPATH_ATTR 27289 - -/* - * Use the raw syscall for versions of glibc which don't include a function for - * it, namely (glibc 2.12). - */ -#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 -# define _GNU_SOURCE -# include "syscall.h" -# if !defined(SYS_setns) && defined(__NR_setns) -# define SYS_setns __NR_setns -# endif - -#ifndef SYS_setns -# error "setns(2) syscall not supported by glibc version" -#endif - -int setns(int fd, int nstype) -{ - return syscall(SYS_setns, fd, nstype); -} -#endif - -/* XXX: This is ugly. */ -static int syncfd = -1; - -/* TODO(cyphar): Fix this so it correctly deals with syncT. */ -#define bail(fmt, ...) \ - do { \ - int ret = __COUNTER__ + 1; \ - fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ - if (syncfd >= 0) { \ - enum sync_t s = SYNC_ERR; \ - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \ - fprintf(stderr, "nsenter: failed: write(s)"); \ - if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \ - fprintf(stderr, "nsenter: failed: write(ret)"); \ - } \ - exit(ret); \ - } while(0) - -static int write_file(char *data, size_t data_len, char *pathfmt, ...) -{ - int fd, len, ret = 0; - char path[PATH_MAX]; - - va_list ap; - va_start(ap, pathfmt); - len = vsnprintf(path, PATH_MAX, pathfmt, ap); - va_end(ap); - if (len < 0) - return -1; - - fd = open(path, O_RDWR); - if (fd < 0) { - return -1; - } - - len = write(fd, data, data_len); - if (len != data_len) { - ret = -1; - goto out; - } - - out: - close(fd); - return ret; -} - -enum policy_t { - SETGROUPS_DEFAULT = 0, - SETGROUPS_ALLOW, - SETGROUPS_DENY, -}; - -/* This *must* be called before we touch gid_map. */ -static void update_setgroups(int pid, enum policy_t setgroup) -{ - char *policy; - - switch (setgroup) { - case SETGROUPS_ALLOW: - policy = "allow"; - break; - case SETGROUPS_DENY: - policy = "deny"; - break; - case SETGROUPS_DEFAULT: - default: - /* Nothing to do. */ - return; - } - - if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { - /* - * If the kernel is too old to support /proc/pid/setgroups, - * open(2) or write(2) will return ENOENT. This is fine. - */ - if (errno != ENOENT) - bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); - } -} - -static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) -{ - int child; - - /* - * If @app is NULL, execve will segfault. Just check it here and bail (if - * we're in this path, the caller is already getting desperate and there - * isn't a backup to this failing). This usually would be a configuration - * or programming issue. - */ - if (!app) - bail("mapping tool not present"); - - child = fork(); - if (child < 0) - bail("failed to fork"); - - if (!child) { -#define MAX_ARGV 20 - char *argv[MAX_ARGV]; - char *envp[] = { NULL }; - char pid_fmt[16]; - int argc = 0; - char *next; - - snprintf(pid_fmt, 16, "%d", pid); - - argv[argc++] = (char *)app; - argv[argc++] = pid_fmt; - /* - * Convert the map string into a list of argument that - * newuidmap/newgidmap can understand. - */ - - while (argc < MAX_ARGV) { - if (*map == '\0') { - argv[argc++] = NULL; - break; - } - argv[argc++] = map; - next = strpbrk(map, "\n "); - if (next == NULL) - break; - *next++ = '\0'; - map = next + strspn(next, "\n "); - } - - execve(app, argv, envp); - bail("failed to execv"); - } else { - int status; - - while (true) { - if (waitpid(child, &status, 0) < 0) { - if (errno == EINTR) - continue; - bail("failed to waitpid"); - } - if (WIFEXITED(status) || WIFSIGNALED(status)) - return WEXITSTATUS(status); - } - } - - return -1; -} - -static void update_uidmap(const char *path, int pid, char *map, size_t map_len) -{ - if (map == NULL || map_len <= 0) - return; - - if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { - if (errno != EPERM) - bail("failed to update /proc/%d/uid_map", pid); - if (try_mapping_tool(path, pid, map, map_len)) - bail("failed to use newuid map on %d", pid); - } -} - -static void update_gidmap(const char *path, int pid, char *map, size_t map_len) -{ - if (map == NULL || map_len <= 0) - return; - - if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { - if (errno != EPERM) - bail("failed to update /proc/%d/gid_map", pid); - if (try_mapping_tool(path, pid, map, map_len)) - bail("failed to use newgid map on %d", pid); - } -} - -static void update_oom_score_adj(char *data, size_t len) -{ - if (data == NULL || len <= 0) - return; - - if (write_file(data, len, "/proc/self/oom_score_adj") < 0) - bail("failed to update /proc/self/oom_score_adj"); -} - -/* A dummy function that just jumps to the given jumpval. */ -static int child_func(void *arg) __attribute__ ((noinline)); -static int child_func(void *arg) -{ - struct clone_t *ca = (struct clone_t *)arg; - longjmp(*ca->env, ca->jmpval); -} - -static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); -static int clone_parent(jmp_buf *env, int jmpval) -{ - struct clone_t ca = { - .env = env, - .jmpval = jmpval, - }; - - return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); -} - -/* - * Gets the init pipe fd from the environment, which is used to read the - * bootstrap data and tell the parent what the new pid is after we finish - * setting up the environment. - */ -static int initpipe(void) -{ - int pipenum; - char *initpipe, *endptr; - - initpipe = getenv("_LIBCONTAINER_INITPIPE"); - if (initpipe == NULL || *initpipe == '\0') - return -1; - - pipenum = strtol(initpipe, &endptr, 10); - if (*endptr != '\0') - bail("unable to parse _LIBCONTAINER_INITPIPE"); - - return pipenum; -} - -/* Returns the clone(2) flag for a namespace, given the name of a namespace. */ -static int nsflag(char *name) -{ - if (!strcmp(name, "cgroup")) - return CLONE_NEWCGROUP; - else if (!strcmp(name, "ipc")) - return CLONE_NEWIPC; - else if (!strcmp(name, "mnt")) - return CLONE_NEWNS; - else if (!strcmp(name, "net")) - return CLONE_NEWNET; - else if (!strcmp(name, "pid")) - return CLONE_NEWPID; - else if (!strcmp(name, "user")) - return CLONE_NEWUSER; - else if (!strcmp(name, "uts")) - return CLONE_NEWUTS; - - /* If we don't recognise a name, fallback to 0. */ - return 0; -} - -static uint32_t readint32(char *buf) -{ - return *(uint32_t *) buf; -} - -static uint8_t readint8(char *buf) -{ - return *(uint8_t *) buf; -} - -static void nl_parse(int fd, struct nlconfig_t *config) -{ - size_t len, size; - struct nlmsghdr hdr; - char *data, *current; - - /* Retrieve the netlink header. */ - len = read(fd, &hdr, NLMSG_HDRLEN); - if (len != NLMSG_HDRLEN) - bail("invalid netlink header length %zu", len); - - if (hdr.nlmsg_type == NLMSG_ERROR) - bail("failed to read netlink message"); - - if (hdr.nlmsg_type != INIT_MSG) - bail("unexpected msg type %d", hdr.nlmsg_type); - - /* Retrieve data. */ - size = NLMSG_PAYLOAD(&hdr, 0); - current = data = malloc(size); - if (!data) - bail("failed to allocate %zu bytes of memory for nl_payload", size); - - len = read(fd, data, size); - if (len != size) - bail("failed to read netlink payload, %zu != %zu", len, size); - - /* Parse the netlink payload. */ - config->data = data; - while (current < data + size) { - struct nlattr *nlattr = (struct nlattr *)current; - size_t payload_len = nlattr->nla_len - NLA_HDRLEN; - - /* Advance to payload. */ - current += NLA_HDRLEN; - - /* Handle payload. */ - switch (nlattr->nla_type) { - case CLONE_FLAGS_ATTR: - config->cloneflags = readint32(current); - break; - case ROOTLESS_EUID_ATTR: - config->is_rootless_euid = readint8(current); /* boolean */ - break; - case OOM_SCORE_ADJ_ATTR: - config->oom_score_adj = current; - config->oom_score_adj_len = payload_len; - break; - case NS_PATHS_ATTR: - config->namespaces = current; - config->namespaces_len = payload_len; - break; - case UIDMAP_ATTR: - config->uidmap = current; - config->uidmap_len = payload_len; - break; - case GIDMAP_ATTR: - config->gidmap = current; - config->gidmap_len = payload_len; - break; - case UIDMAPPATH_ATTR: - config->uidmappath = current; - config->uidmappath_len = payload_len; - break; - case GIDMAPPATH_ATTR: - config->gidmappath = current; - config->gidmappath_len = payload_len; - break; - case SETGROUP_ATTR: - config->is_setgroup = readint8(current); - break; - default: - bail("unknown netlink message type %d", nlattr->nla_type); - } - - current += NLA_ALIGN(payload_len); - } -} - -void nl_free(struct nlconfig_t *config) -{ - free(config->data); -} - -void join_namespaces(char *nslist) -{ - int num = 0, i; - char *saveptr = NULL; - char *namespace = strtok_r(nslist, ",", &saveptr); - struct namespace_t { - int fd; - int ns; - char type[PATH_MAX]; - char path[PATH_MAX]; - } *namespaces = NULL; - - if (!namespace || !strlen(namespace) || !strlen(nslist)) - bail("ns paths are empty"); - - /* - * We have to open the file descriptors first, since after - * we join the mnt namespace we might no longer be able to - * access the paths. - */ - do { - int fd; - char *path; - struct namespace_t *ns; - - /* Resize the namespace array. */ - namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); - if (!namespaces) - bail("failed to reallocate namespace array"); - ns = &namespaces[num - 1]; - - /* Split 'ns:path'. */ - path = strstr(namespace, ":"); - if (!path) - bail("failed to parse %s", namespace); - *path++ = '\0'; - - fd = open(path, O_RDONLY); - if (fd < 0) - bail("failed to open %s", path); - - ns->fd = fd; - ns->ns = nsflag(namespace); - strncpy(ns->path, path, PATH_MAX - 1); - ns->path[PATH_MAX - 1] = '\0'; - } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); - - /* - * The ordering in which we join namespaces is important. We should - * always join the user namespace *first*. This is all guaranteed - * from the container_linux.go side of this, so we're just going to - * follow the order given to us. - */ - - for (i = 0; i < num; i++) { - struct namespace_t ns = namespaces[i]; - - if (setns(ns.fd, ns.ns) < 0) - bail("failed to setns to %s", ns.path); - - close(ns.fd); - } - - free(namespaces); -} - -void nsexec(void) -{ - int pipenum; - jmp_buf env; - int sync_child_pipe[2], sync_grandchild_pipe[2]; - struct nlconfig_t config = { 0 }; - - /* - * If we don't have an init pipe, just return to the go routine. - * We'll only get an init pipe for start or exec. - */ - pipenum = initpipe(); - if (pipenum == -1) - return; - - /* Parse all of the netlink configuration. */ - nl_parse(pipenum, &config); - - /* Set oom_score_adj. This has to be done before !dumpable because - * /proc/self/oom_score_adj is not writeable unless you're an privileged - * user (if !dumpable is set). All children inherit their parent's - * oom_score_adj value on fork(2) so this will always be propagated - * properly. - */ - update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len); - - /* - * Make the process non-dumpable, to avoid various race conditions that - * could cause processes in namespaces we're joining to access host - * resources (or potentially execute code). - * - * However, if the number of namespaces we are joining is 0, we are not - * going to be switching to a different security context. Thus setting - * ourselves to be non-dumpable only breaks things (like rootless - * containers), which is the recommendation from the kernel folks. - */ - if (config.namespaces) { - if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) - bail("failed to set process as non-dumpable"); - } - - /* Pipe so we can tell the child when we've finished setting up. */ - if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0) - bail("failed to setup sync pipe between parent and child"); - - /* - * We need a new socketpair to sync with grandchild so we don't have - * race condition with child. - */ - if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0) - bail("failed to setup sync pipe between parent and grandchild"); - - /* TODO: Currently we aren't dealing with child deaths properly. */ - - /* - * Okay, so this is quite annoying. - * - * In order for this unsharing code to be more extensible we need to split - * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case - * would be if we did clone(CLONE_NEWUSER) and the other namespaces - * separately, but because of SELinux issues we cannot really do that. But - * we cannot just dump the namespace flags into clone(...) because several - * usecases (such as rootless containers) require more granularity around - * the namespace setup. In addition, some older kernels had issues where - * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot - * handle this while also dealing with SELinux so we choose SELinux support - * over broken kernel support). - * - * However, if we unshare(2) the user namespace *before* we clone(2), then - * all hell breaks loose. - * - * The parent no longer has permissions to do many things (unshare(2) drops - * all capabilities in your old namespace), and the container cannot be set - * up to have more than one {uid,gid} mapping. This is obviously less than - * ideal. In order to fix this, we have to first clone(2) and then unshare. - * - * Unfortunately, it's not as simple as that. We have to fork to enter the - * PID namespace (the PID namespace only applies to children). Since we'll - * have to double-fork, this clone_parent() call won't be able to get the - * PID of the _actual_ init process (without doing more synchronisation than - * I can deal with at the moment). So we'll just get the parent to send it - * for us, the only job of this process is to update - * /proc/pid/{setgroups,uid_map,gid_map}. - * - * And as a result of the above, we also need to setns(2) in the first child - * because if we join a PID namespace in the topmost parent then our child - * will be in that namespace (and it will not be able to give us a PID value - * that makes sense without resorting to sending things with cmsg). - * - * This also deals with an older issue caused by dumping cloneflags into - * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so - * we have to unshare(2) before clone(2) in order to do this. This was fixed - * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was - * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're - * aware, the last mainline kernel which had this bug was Linux 3.12. - * However, we cannot comment on which kernels the broken patch was - * backported to. - * - * -- Aleksa "what has my life come to?" Sarai - */ - - switch (setjmp(env)) { - /* - * Stage 0: We're in the parent. Our job is just to create a new child - * (stage 1: JUMP_CHILD) process and write its uid_map and - * gid_map. That process will go on to create a new process, then - * it will send us its PID which we will send to the bootstrap - * process. - */ - case JUMP_PARENT:{ - int len; - pid_t child, first_child = -1; - bool ready = false; - - /* For debugging. */ - prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); - - /* Start the process of getting a container. */ - child = clone_parent(&env, JUMP_CHILD); - if (child < 0) - bail("unable to fork: child_func"); - - /* - * State machine for synchronisation with the children. - * - * Father only return when both child and grandchild are - * ready, so we can receive all possible error codes - * generated by children. - */ - while (!ready) { - enum sync_t s; - int ret; - - syncfd = sync_child_pipe[1]; - close(sync_child_pipe[0]); - - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with child: next state"); - - switch (s) { - case SYNC_ERR: - /* We have to mirror the error code of the child. */ - if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) - bail("failed to sync with child: read(error code)"); - - exit(ret); - case SYNC_USERMAP_PLS: - /* - * Enable setgroups(2) if we've been asked to. But we also - * have to explicitly disable setgroups(2) if we're - * creating a rootless container for single-entry mapping. - * i.e. config.is_setgroup == false. - * (this is required since Linux 3.19). - * - * For rootless multi-entry mapping, config.is_setgroup shall be true and - * newuidmap/newgidmap shall be used. - */ - - if (config.is_rootless_euid && !config.is_setgroup) - update_setgroups(child, SETGROUPS_DENY); - - /* Set up mappings. */ - update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); - update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); - - s = SYNC_USERMAP_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); - } - break; - case SYNC_RECVPID_PLS:{ - first_child = child; - - /* Get the init_func pid. */ - if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(first_child, SIGKILL); - bail("failed to sync with child: read(childpid)"); - } - - /* Send ACK. */ - s = SYNC_RECVPID_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(first_child, SIGKILL); - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); - } - - /* Send the init_func pid back to our parent. - * - * Send the init_func pid and the pid of the first child back to our parent. - * We need to send both back because we can't reap the first child we created (CLONE_PARENT). - * It becomes the responsibility of our parent to reap the first child. - */ - len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); - if (len < 0) { - kill(child, SIGKILL); - bail("unable to generate JSON for child pid"); - } - } - break; - case SYNC_CHILD_READY: - ready = true; - break; - default: - bail("unexpected sync value: %u", s); - } - } - - /* Now sync with grandchild. */ - - ready = false; - while (!ready) { - enum sync_t s; - int ret; - - syncfd = sync_grandchild_pipe[1]; - close(sync_grandchild_pipe[0]); - - s = SYNC_GRANDCHILD; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_GRANDCHILD)"); - } - - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with child: next state"); - - switch (s) { - case SYNC_ERR: - /* We have to mirror the error code of the child. */ - if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) - bail("failed to sync with child: read(error code)"); - - exit(ret); - case SYNC_CHILD_READY: - ready = true; - break; - default: - bail("unexpected sync value: %u", s); - } - } - exit(0); - } - - /* - * Stage 1: We're in the first child process. Our job is to join any - * provided namespaces in the netlink payload and unshare all - * of the requested namespaces. If we've been asked to - * CLONE_NEWUSER, we will ask our parent (stage 0) to set up - * our user mappings for us. Then, we create a new child - * (stage 2: JUMP_INIT) for PID namespace. We then send the - * child's PID to our parent (stage 0). - */ - case JUMP_CHILD:{ - pid_t child; - enum sync_t s; - - /* We're in a child and thus need to tell the parent if we die. */ - syncfd = sync_child_pipe[0]; - close(sync_child_pipe[1]); - - /* For debugging. */ - prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); - - /* - * We need to setns first. We cannot do this earlier (in stage 0) - * because of the fact that we forked to get here (the PID of - * [stage 2: JUMP_INIT]) would be meaningless). We could send it - * using cmsg(3) but that's just annoying. - */ - if (config.namespaces) - join_namespaces(config.namespaces); - - /* - * Deal with user namespaces first. They are quite special, as they - * affect our ability to unshare other namespaces and are used as - * context for privilege checks. - * - * We don't unshare all namespaces in one go. The reason for this - * is that, while the kernel documentation may claim otherwise, - * there are certain cases where unsharing all namespaces at once - * will result in namespace objects being owned incorrectly. - * Ideally we should just fix these kernel bugs, but it's better to - * be safe than sorry, and fix them separately. - * - * A specific case of this is that the SELinux label of the - * internal kern-mount that mqueue uses will be incorrect if the - * UTS namespace is cloned before the USER namespace is mapped. - * I've also heard of similar problems with the network namespace - * in some scenarios. This also mirrors how LXC deals with this - * problem. - */ - if (config.cloneflags & CLONE_NEWUSER) { - if (unshare(CLONE_NEWUSER) < 0) - bail("failed to unshare user namespace"); - config.cloneflags &= ~CLONE_NEWUSER; - - /* - * We don't have the privileges to do any mapping here (see the - * clone_parent rant). So signal our parent to hook us up. - */ - - /* Switching is only necessary if we joined namespaces. */ - if (config.namespaces) { - if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) - bail("failed to set process as dumpable"); - } - s = SYNC_USERMAP_PLS; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); - - /* ... wait for mapping ... */ - - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); - if (s != SYNC_USERMAP_ACK) - bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); - /* Switching is only necessary if we joined namespaces. */ - if (config.namespaces) { - if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) - bail("failed to set process as dumpable"); - } - - /* Become root in the namespace proper. */ - if (setresuid(0, 0, 0) < 0) - bail("failed to become root in user namespace"); - } - /* - * Unshare all of the namespaces. Now, it should be noted that this - * ordering might break in the future (especially with rootless - * containers). But for now, it's not possible to split this into - * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. - * - * Note that we don't merge this with clone() because there were - * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) - * was broken, so we'll just do it the long way anyway. - */ - if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) - bail("failed to unshare namespaces"); - - /* - * TODO: What about non-namespace clone flags that we're dropping here? - * - * We fork again because of PID namespace, setns(2) or unshare(2) don't - * change the PID namespace of the calling process, because doing so - * would change the caller's idea of its own PID (as reported by getpid()), - * which would break many applications and libraries, so we must fork - * to actually enter the new PID namespace. - */ - child = clone_parent(&env, JUMP_INIT); - if (child < 0) - bail("unable to fork: init_func"); - - /* Send the child to our parent, which knows what it's doing. */ - s = SYNC_RECVPID_PLS; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); - } - if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(childpid)"); - } - - /* ... wait for parent to get the pid ... */ - - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); - } - if (s != SYNC_RECVPID_ACK) { - kill(child, SIGKILL); - bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); - } - - s = SYNC_CHILD_READY; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(SYNC_CHILD_READY)"); - } - - /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ - exit(0); - } - - /* - * Stage 2: We're the final child process, and the only process that will - * actually return to the Go runtime. Our job is to just do the - * final cleanup steps and then return to the Go runtime to allow - * init_linux.go to run. - */ - case JUMP_INIT:{ - /* - * We're inside the child now, having jumped from the - * start_child() code after forking in the parent. - */ - enum sync_t s; - - /* We're in a child and thus need to tell the parent if we die. */ - syncfd = sync_grandchild_pipe[0]; - close(sync_grandchild_pipe[1]); - close(sync_child_pipe[0]); - close(sync_child_pipe[1]); - - /* For debugging. */ - prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); - - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); - if (s != SYNC_GRANDCHILD) - bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s); - - if (setsid() < 0) - bail("setsid failed"); - - if (setuid(0) < 0) - bail("setuid failed"); - - if (setgid(0) < 0) - bail("setgid failed"); - - if (!config.is_rootless_euid && config.is_setgroup) { - if (setgroups(0, NULL) < 0) - bail("setgroups failed"); - } - - /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ - if (config.cloneflags & CLONE_NEWCGROUP) { - uint8_t value; - if (read(pipenum, &value, sizeof(value)) != sizeof(value)) - bail("read synchronisation value failed"); - if (value == CREATECGROUPNS) { - if (unshare(CLONE_NEWCGROUP) < 0) - bail("failed to unshare cgroup namespace"); - } else - bail("received unknown synchronisation value"); - } - - s = SYNC_CHILD_READY; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with patent: write(SYNC_CHILD_READY)"); - - /* Close sync pipes. */ - close(sync_grandchild_pipe[0]); - - /* Free netlink data. */ - nl_free(&config); - - /* Finish executing, let the Go runtime take over. */ - return; - } - default: - bail("unexpected jump value"); - } - - /* Should never be reached. */ - bail("should never be reached"); -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/MAINTAINERS b/vendor/github.com/opencontainers/runc/libcontainer/user/MAINTAINERS new file mode 100644 index 000000000..edbe20066 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/MAINTAINERS @@ -0,0 +1,2 @@ +Tianon Gravi <admwiggin@gmail.com> (@tianon) +Aleksa Sarai <cyphar@cyphar.com> (@cyphar) |