diff options
author | baude <bbaude@redhat.com> | 2018-02-14 12:51:06 -0600 |
---|---|---|
committer | Atomic Bot <atomic-devel@projectatomic.io> | 2018-02-15 00:20:47 +0000 |
commit | be9ed1cfacc19d1ad3c09e10481da445615b8b8e (patch) | |
tree | 1c0c01daf5b43c6139e37408be601475c1dcea41 /vendor/github.com/opencontainers/runc/libcontainer/nsenter | |
parent | d051dc38d81920c94c37b20ceba0d33b35299bca (diff) | |
download | podman-be9ed1cfacc19d1ad3c09e10481da445615b8b8e.tar.gz podman-be9ed1cfacc19d1ad3c09e10481da445615b8b8e.tar.bz2 podman-be9ed1cfacc19d1ad3c09e10481da445615b8b8e.zip |
Privileged containers should inherit host devices
When running a privileged container, it should inherit the same
devices the host has.
Signed-off-by: baude <bbaude@redhat.com>
Closes: #330
Approved by: mheon
Diffstat (limited to 'vendor/github.com/opencontainers/runc/libcontainer/nsenter')
-rw-r--r-- | vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c | 231 |
1 files changed, 166 insertions, 65 deletions
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c index 197e6d08e..2c69cee5d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -1,3 +1,4 @@ + #define _GNU_SOURCE #include <endian.h> #include <errno.h> @@ -19,6 +20,7 @@ #include <sys/prctl.h> #include <sys/socket.h> #include <sys/types.h> +#include <sys/wait.h> #include <linux/limits.h> #include <linux/netlink.h> @@ -29,15 +31,15 @@ /* Synchronisation values. */ enum sync_t { - SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ - SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ - SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ - SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ - SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ - SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ + SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ + SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ + SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ + SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ + SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ + SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ /* XXX: This doesn't help with segfaults and other such issues. */ - SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ + SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ }; /* longjmp() arguments. */ @@ -64,7 +66,13 @@ struct clone_t { struct nlconfig_t { char *data; + + /* Process settings. */ uint32_t cloneflags; + char *oom_score_adj; + size_t oom_score_adj_len; + + /* User namespace settings. */ char *uidmap; size_t uidmap_len; char *gidmap; @@ -72,9 +80,13 @@ struct nlconfig_t { char *namespaces; size_t namespaces_len; uint8_t is_setgroup; + + /* Rootless container settings. */ uint8_t is_rootless; - char *oom_score_adj; - size_t oom_score_adj_len; + char *uidmappath; + size_t uidmappath_len; + char *gidmappath; + size_t gidmappath_len; }; /* @@ -89,6 +101,8 @@ struct nlconfig_t { #define SETGROUP_ATTR 27285 #define OOM_SCORE_ADJ_ATTR 27286 #define ROOTLESS_ATTR 27287 +#define UIDMAPPATH_ATTR 27288 +#define GIDMAPPATH_ATTR 27289 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -152,7 +166,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...) goto out; } -out: + out: close(fd); return ret; } @@ -169,16 +183,16 @@ static void update_setgroups(int pid, enum policy_t setgroup) char *policy; switch (setgroup) { - case SETGROUPS_ALLOW: - policy = "allow"; - break; - case SETGROUPS_DENY: - policy = "deny"; - break; - case SETGROUPS_DEFAULT: - default: - /* Nothing to do. */ - return; + case SETGROUPS_ALLOW: + policy = "allow"; + break; + case SETGROUPS_DENY: + policy = "deny"; + break; + case SETGROUPS_DEFAULT: + default: + /* Nothing to do. */ + return; } if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { @@ -191,22 +205,96 @@ static void update_setgroups(int pid, enum policy_t setgroup) } } -static void update_uidmap(int pid, char *map, size_t map_len) +static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) +{ + int child; + + /* + * If @app is NULL, execve will segfault. Just check it here and bail (if + * we're in this path, the caller is already getting desparate and there + * isn't a backup to this failing). This usually would be a configuration + * or programming issue. + */ + if (!app) + bail("mapping tool not present"); + + child = fork(); + if (child < 0) + bail("failed to fork"); + + if (!child) { +#define MAX_ARGV 20 + char *argv[MAX_ARGV]; + char *envp[] = { NULL }; + char pid_fmt[16]; + int argc = 0; + char *next; + + snprintf(pid_fmt, 16, "%d", pid); + + argv[argc++] = (char *)app; + argv[argc++] = pid_fmt; + /* + * Convert the map string into a list of argument that + * newuidmap/newgidmap can understand. + */ + + while (argc < MAX_ARGV) { + if (*map == '\0') { + argv[argc++] = NULL; + break; + } + argv[argc++] = map; + next = strpbrk(map, "\n "); + if (next == NULL) + break; + *next++ = '\0'; + map = next + strspn(next, "\n "); + } + + execve(app, argv, envp); + bail("failed to execv"); + } else { + int status; + + while (true) { + if (waitpid(child, &status, 0) < 0) { + if (errno == EINTR) + continue; + bail("failed to waitpid"); + } + if (WIFEXITED(status) || WIFSIGNALED(status)) + return WEXITSTATUS(status); + } + } + + return -1; +} + +static void update_uidmap(const char *path, int pid, char *map, size_t map_len) { if (map == NULL || map_len <= 0) return; - if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) - bail("failed to update /proc/%d/uid_map", pid); + if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { + if (errno != EPERM) + bail("failed to update /proc/%d/uid_map", pid); + if (try_mapping_tool(path, pid, map, map_len)) + bail("failed to use newuid map on %d", pid); + } } -static void update_gidmap(int pid, char *map, size_t map_len) +static void update_gidmap(const char *path, int pid, char *map, size_t map_len) { if (map == NULL || map_len <= 0) return; - if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) - bail("failed to update /proc/%d/gid_map", pid); + if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { + if (errno != EPERM) + bail("failed to update /proc/%d/gid_map", pid); + if (try_mapping_tool(path, pid, map, map_len)) + bail("failed to use newgid map on %d", pid); + } } static void update_oom_score_adj(char *data, size_t len) @@ -230,7 +318,7 @@ static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); static int clone_parent(jmp_buf *env, int jmpval) { struct clone_t ca = { - .env = env, + .env = env, .jmpval = jmpval, }; @@ -350,6 +438,14 @@ static void nl_parse(int fd, struct nlconfig_t *config) config->gidmap = current; config->gidmap_len = payload_len; break; + case UIDMAPPATH_ATTR: + config->uidmappath = current; + config->uidmappath_len = payload_len; + break; + case GIDMAPPATH_ATTR: + config->gidmappath = current; + config->gidmappath_len = payload_len; + break; case SETGROUP_ATTR: config->is_setgroup = readint8(current); break; @@ -436,7 +532,7 @@ void nsexec(void) int pipenum; jmp_buf env; int sync_child_pipe[2], sync_grandchild_pipe[2]; - struct nlconfig_t config = {0}; + struct nlconfig_t config = { 0 }; /* * If we don't have an init pipe, just return to the go routine. @@ -533,21 +629,21 @@ void nsexec(void) */ switch (setjmp(env)) { - /* - * Stage 0: We're in the parent. Our job is just to create a new child - * (stage 1: JUMP_CHILD) process and write its uid_map and - * gid_map. That process will go on to create a new process, then - * it will send us its PID which we will send to the bootstrap - * process. - */ - case JUMP_PARENT: { + /* + * Stage 0: We're in the parent. Our job is just to create a new child + * (stage 1: JUMP_CHILD) process and write its uid_map and + * gid_map. That process will go on to create a new process, then + * it will send us its PID which we will send to the bootstrap + * process. + */ + case JUMP_PARENT:{ int len; - pid_t child; + pid_t child, first_child = -1; char buf[JSON_MAX]; bool ready = false; /* For debugging. */ - prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0); + prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); /* Start the process of getting a container. */ child = clone_parent(&env, JUMP_CHILD); @@ -596,8 +692,8 @@ void nsexec(void) update_setgroups(child, SETGROUPS_DENY); /* Set up mappings. */ - update_uidmap(child, config.uidmap, config.uidmap_len); - update_gidmap(child, config.gidmap, config.gidmap_len); + update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); + update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); s = SYNC_USERMAP_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { @@ -605,19 +701,19 @@ void nsexec(void) bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); } break; - case SYNC_RECVPID_PLS: { - pid_t old = child; + case SYNC_RECVPID_PLS:{ + first_child = child; /* Get the init_func pid. */ if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(old, SIGKILL); + kill(first_child, SIGKILL); bail("failed to sync with child: read(childpid)"); } /* Send ACK. */ s = SYNC_RECVPID_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(old, SIGKILL); + kill(first_child, SIGKILL); kill(child, SIGKILL); bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); } @@ -665,8 +761,13 @@ void nsexec(void) } } - /* Send the init_func pid back to our parent. */ - len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child); + /* + * Send the init_func pid and the pid of the first child back to our parent. + * + * We need to send both back because we can't reap the first child we created (CLONE_PARENT). + * It becomes the responsibility of our parent to reap the first child. + */ + len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); if (len < 0) { kill(child, SIGKILL); bail("unable to generate JSON for child pid"); @@ -679,16 +780,16 @@ void nsexec(void) exit(0); } - /* - * Stage 1: We're in the first child process. Our job is to join any - * provided namespaces in the netlink payload and unshare all - * of the requested namespaces. If we've been asked to - * CLONE_NEWUSER, we will ask our parent (stage 0) to set up - * our user mappings for us. Then, we create a new child - * (stage 2: JUMP_INIT) for PID namespace. We then send the - * child's PID to our parent (stage 0). - */ - case JUMP_CHILD: { + /* + * Stage 1: We're in the first child process. Our job is to join any + * provided namespaces in the netlink payload and unshare all + * of the requested namespaces. If we've been asked to + * CLONE_NEWUSER, we will ask our parent (stage 0) to set up + * our user mappings for us. Then, we create a new child + * (stage 2: JUMP_INIT) for PID namespace. We then send the + * child's PID to our parent (stage 0). + */ + case JUMP_CHILD:{ pid_t child; enum sync_t s; @@ -697,7 +798,7 @@ void nsexec(void) close(sync_child_pipe[1]); /* For debugging. */ - prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0); + prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); /* * We need to setns first. We cannot do this earlier (in stage 0) @@ -799,13 +900,13 @@ void nsexec(void) exit(0); } - /* - * Stage 2: We're the final child process, and the only process that will - * actually return to the Go runtime. Our job is to just do the - * final cleanup steps and then return to the Go runtime to allow - * init_linux.go to run. - */ - case JUMP_INIT: { + /* + * Stage 2: We're the final child process, and the only process that will + * actually return to the Go runtime. Our job is to just do the + * final cleanup steps and then return to the Go runtime to allow + * init_linux.go to run. + */ + case JUMP_INIT:{ /* * We're inside the child now, having jumped from the * start_child() code after forking in the parent. @@ -819,7 +920,7 @@ void nsexec(void) close(sync_child_pipe[1]); /* For debugging. */ - prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0); + prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); |