summary refs log tree commit diff
path: root/vendor/github.com/opencontainers/runc/libcontainer/nsenter
diff options
context:
space:
mode:
author baude <bbaude@redhat.com> 2018-02-14 12:51:06 -0600
committer Atomic Bot <atomic-devel@projectatomic.io> 2018-02-15 00:20:47 +0000
commit be9ed1cfacc19d1ad3c09e10481da445615b8b8e (patch)
tree 1c0c01daf5b43c6139e37408be601475c1dcea41 /vendor/github.com/opencontainers/runc/libcontainer/nsenter
parent d051dc38d81920c94c37b20ceba0d33b35299bca (diff)
download podman-be9ed1cfacc19d1ad3c09e10481da445615b8b8e.tar.gz
podman-be9ed1cfacc19d1ad3c09e10481da445615b8b8e.tar.bz2
podman-be9ed1cfacc19d1ad3c09e10481da445615b8b8e.zip
Privileged containers should inherit host devices
When running a privileged container, it should inherit the same devices the host has.

Signed-off-by: baude <bbaude@redhat.com>
Closes: #330
Approved by: mheon
Diffstat (limited to 'vendor/github.com/opencontainers/runc/libcontainer/nsenter')
-rw-r--r-- vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c 231
1 files changed, 166 insertions, 65 deletions
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
index 197e6d08e..2c69cee5d 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
@@ -1,3 +1,4 @@
+
#define _GNU_SOURCE
#include <endian.h>
#include <errno.h>
@@ -19,6 +20,7 @@
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/types.h>
+#include <sys/wait.h>
#include <linux/limits.h>
#include <linux/netlink.h>
@@ -29,15 +31,15 @@
/* Synchronisation values. */
enum sync_t {
- SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
- SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
- SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
- SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
- SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
- SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
+ SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
+ SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
+ SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
+ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
+ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
+ SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
/* XXX: This doesn't help with segfaults and other such issues. */
- SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
+ SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};
/* longjmp() arguments. */
@@ -64,7 +66,13 @@ struct clone_t {
struct nlconfig_t {
char *data;
+
+ /* Process settings. */
uint32_t cloneflags;
+ char *oom_score_adj;
+ size_t oom_score_adj_len;
+
+ /* User namespace settings. */
char *uidmap;
size_t uidmap_len;
char *gidmap;
@@ -72,9 +80,13 @@ struct nlconfig_t {
char *namespaces;
size_t namespaces_len;
uint8_t is_setgroup;
+
+ /* Rootless container settings. */
uint8_t is_rootless;
- char *oom_score_adj;
- size_t oom_score_adj_len;
+ char *uidmappath;
+ size_t uidmappath_len;
+ char *gidmappath;
+ size_t gidmappath_len;
};
/*
@@ -89,6 +101,8 @@ struct nlconfig_t {
#define SETGROUP_ATTR 27285
#define OOM_SCORE_ADJ_ATTR 27286
#define ROOTLESS_ATTR 27287
+#define UIDMAPPATH_ATTR 27288
+#define GIDMAPPATH_ATTR 27289
/*
* Use the raw syscall for versions of glibc which don't include a function for
@@ -152,7 +166,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
goto out;
}
-out:
+ out:
close(fd);
return ret;
}
@@ -169,16 +183,16 @@ static void update_setgroups(int pid, enum policy_t setgroup)
char *policy;
switch (setgroup) {
- case SETGROUPS_ALLOW:
- policy = "allow";
- break;
- case SETGROUPS_DENY:
- policy = "deny";
- break;
- case SETGROUPS_DEFAULT:
- default:
- /* Nothing to do. */
- return;
+ case SETGROUPS_ALLOW:
+ policy = "allow";
+ break;
+ case SETGROUPS_DENY:
+ policy = "deny";
+ break;
+ case SETGROUPS_DEFAULT:
+ default:
+ /* Nothing to do. */
+ return;
}
if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
@@ -191,22 +205,96 @@ static void update_setgroups(int pid, enum policy_t setgroup)
}
}
-static void update_uidmap(int pid, char *map, size_t map_len)
+/*
+ * Run an external mapping helper (@app) against process @pid, passing each
+ * whitespace/newline-separated token of @map as a separate argument:
+ *
+ *     app <pid> <tok1> <tok2> ...
+ *
+ * @map is modified in place (separators are overwritten with '\0').
+ * @map_len is accepted for symmetry with the callers but is not consulted;
+ * @map must be NUL-terminated.
+ *
+ * Returns the helper's exit status if it exited normally, or -1 if it was
+ * terminated by a signal. Callers treat any non-zero value as failure.
+ * NOTE(review): bail() is defined outside this hunk; this code assumes it
+ * does not return.
+ */
+static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
+{
+	int child;
+
+	/*
+	 * If @app is NULL, execve will segfault. Just check it here and bail (if
+	 * we're in this path, the caller is already getting desperate and there
+	 * isn't a backup to this failing). This usually would be a configuration
+	 * or programming issue.
+	 */
+	if (!app)
+		bail("mapping tool not present");
+
+	child = fork();
+	if (child < 0)
+		bail("failed to fork");
+
+	if (!child) {
+#define MAX_ARGV 20
+		char *argv[MAX_ARGV];
+		char *envp[] = { NULL };
+		char pid_fmt[16];
+		int argc = 0;
+		char *next;
+
+		snprintf(pid_fmt, sizeof(pid_fmt), "%d", pid);
+
+		argv[argc++] = (char *)app;
+		argv[argc++] = pid_fmt;
+		/*
+		 * Convert the map string into a list of arguments that
+		 * newuidmap/newgidmap can understand.
+		 *
+		 * The loop stops one slot short of MAX_ARGV so that there is
+		 * always room for the terminating NULL that execve requires.
+		 * Tokens beyond that limit are dropped.
+		 */
+		while (argc < MAX_ARGV - 1 && *map != '\0') {
+			argv[argc++] = map;
+			next = strpbrk(map, "\n ");
+			if (next == NULL)
+				break;
+			*next++ = '\0';
+			map = next + strspn(next, "\n ");
+		}
+		/* Always terminate, including when the last token has no trailing separator. */
+		argv[argc] = NULL;
+
+		execve(app, argv, envp);
+		bail("failed to execv");
+	}
+
+	for (;;) {
+		int status;
+
+		if (waitpid(child, &status, 0) < 0) {
+			if (errno == EINTR)
+				continue;
+			bail("failed to waitpid");
+		}
+		if (WIFEXITED(status))
+			return WEXITSTATUS(status);
+		/*
+		 * WEXITSTATUS is only meaningful after a normal exit; a helper
+		 * killed by a signal must be reported as a failure, not as 0.
+		 */
+		if (WIFSIGNALED(status))
+			return -1;
+	}
+}
+
+static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
{
if (map == NULL || map_len <= 0)
return;
- if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0)
- bail("failed to update /proc/%d/uid_map", pid);
+ if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
+ if (errno != EPERM)
+ bail("failed to update /proc/%d/uid_map", pid);
+ if (try_mapping_tool(path, pid, map, map_len))
+ bail("failed to use newuid map on %d", pid);
+ }
}
-static void update_gidmap(int pid, char *map, size_t map_len)
+static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
{
if (map == NULL || map_len <= 0)
return;
- if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0)
- bail("failed to update /proc/%d/gid_map", pid);
+ if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
+ if (errno != EPERM)
+ bail("failed to update /proc/%d/gid_map", pid);
+ if (try_mapping_tool(path, pid, map, map_len))
+ bail("failed to use newgid map on %d", pid);
+ }
}
static void update_oom_score_adj(char *data, size_t len)
@@ -230,7 +318,7 @@ static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
struct clone_t ca = {
- .env = env,
+ .env = env,
.jmpval = jmpval,
};
@@ -350,6 +438,14 @@ static void nl_parse(int fd, struct nlconfig_t *config)
config->gidmap = current;
config->gidmap_len = payload_len;
break;
+ case UIDMAPPATH_ATTR:
+ config->uidmappath = current;
+ config->uidmappath_len = payload_len;
+ break;
+ case GIDMAPPATH_ATTR:
+ config->gidmappath = current;
+ config->gidmappath_len = payload_len;
+ break;
case SETGROUP_ATTR:
config->is_setgroup = readint8(current);
break;
@@ -436,7 +532,7 @@ void nsexec(void)
int pipenum;
jmp_buf env;
int sync_child_pipe[2], sync_grandchild_pipe[2];
- struct nlconfig_t config = {0};
+ struct nlconfig_t config = { 0 };
/*
* If we don't have an init pipe, just return to the go routine.
@@ -533,21 +629,21 @@ void nsexec(void)
*/
switch (setjmp(env)) {
- /*
- * Stage 0: We're in the parent. Our job is just to create a new child
- * (stage 1: JUMP_CHILD) process and write its uid_map and
- * gid_map. That process will go on to create a new process, then
- * it will send us its PID which we will send to the bootstrap
- * process.
- */
- case JUMP_PARENT: {
+ /*
+ * Stage 0: We're in the parent. Our job is just to create a new child
+ * (stage 1: JUMP_CHILD) process and write its uid_map and
+ * gid_map. That process will go on to create a new process, then
+ * it will send us its PID which we will send to the bootstrap
+ * process.
+ */
+ case JUMP_PARENT:{
int len;
- pid_t child;
+ pid_t child, first_child = -1;
char buf[JSON_MAX];
bool ready = false;
/* For debugging. */
- prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
+ prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
/* Start the process of getting a container. */
child = clone_parent(&env, JUMP_CHILD);
@@ -596,8 +692,8 @@ void nsexec(void)
update_setgroups(child, SETGROUPS_DENY);
/* Set up mappings. */
- update_uidmap(child, config.uidmap, config.uidmap_len);
- update_gidmap(child, config.gidmap, config.gidmap_len);
+ update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
+ update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
s = SYNC_USERMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
@@ -605,19 +701,19 @@ void nsexec(void)
bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
}
break;
- case SYNC_RECVPID_PLS: {
- pid_t old = child;
+ case SYNC_RECVPID_PLS:{
+ first_child = child;
/* Get the init_func pid. */
if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
- kill(old, SIGKILL);
+ kill(first_child, SIGKILL);
bail("failed to sync with child: read(childpid)");
}
/* Send ACK. */
s = SYNC_RECVPID_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
- kill(old, SIGKILL);
+ kill(first_child, SIGKILL);
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
}
@@ -665,8 +761,13 @@ void nsexec(void)
}
}
- /* Send the init_func pid back to our parent. */
- len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
+ /*
+ * Send the init_func pid and the pid of the first child back to our parent.
+ *
+ * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
+ * It becomes the responsibility of our parent to reap the first child.
+ */
+ len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
@@ -679,16 +780,16 @@ void nsexec(void)
exit(0);
}
- /*
- * Stage 1: We're in the first child process. Our job is to join any
- * provided namespaces in the netlink payload and unshare all
- * of the requested namespaces. If we've been asked to
- * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
- * our user mappings for us. Then, we create a new child
- * (stage 2: JUMP_INIT) for PID namespace. We then send the
- * child's PID to our parent (stage 0).
- */
- case JUMP_CHILD: {
+ /*
+ * Stage 1: We're in the first child process. Our job is to join any
+ * provided namespaces in the netlink payload and unshare all
+ * of the requested namespaces. If we've been asked to
+ * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
+ * our user mappings for us. Then, we create a new child
+ * (stage 2: JUMP_INIT) for PID namespace. We then send the
+ * child's PID to our parent (stage 0).
+ */
+ case JUMP_CHILD:{
pid_t child;
enum sync_t s;
@@ -697,7 +798,7 @@ void nsexec(void)
close(sync_child_pipe[1]);
/* For debugging. */
- prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
+ prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
/*
* We need to setns first. We cannot do this earlier (in stage 0)
@@ -799,13 +900,13 @@ void nsexec(void)
exit(0);
}
- /*
- * Stage 2: We're the final child process, and the only process that will
- * actually return to the Go runtime. Our job is to just do the
- * final cleanup steps and then return to the Go runtime to allow
- * init_linux.go to run.
- */
- case JUMP_INIT: {
+ /*
+ * Stage 2: We're the final child process, and the only process that will
+ * actually return to the Go runtime. Our job is to just do the
+ * final cleanup steps and then return to the Go runtime to allow
+ * init_linux.go to run.
+ */
+ case JUMP_INIT:{
/*
* We're inside the child now, having jumped from the
* start_child() code after forking in the parent.
@@ -819,7 +920,7 @@ void nsexec(void)
close(sync_child_pipe[1]);
/* For debugging. */
- prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);
+ prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_GRANDCHILD)");