summaryrefslogtreecommitdiff
path: root/pkg/rootless/rootless_linux.c
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/rootless/rootless_linux.c')
-rw-r--r--pkg/rootless/rootless_linux.c412
1 files changed, 365 insertions, 47 deletions
diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c
index 1d32b1adb..a08cfd36a 100644
--- a/pkg/rootless/rootless_linux.c
+++ b/pkg/rootless/rootless_linux.c
@@ -17,38 +17,29 @@
#include <sys/prctl.h>
#include <dirent.h>
#include <sys/select.h>
+#include <stdio.h>
+
+#ifndef RENAME_NOREPLACE
+# define RENAME_NOREPLACE (1 << 0)
+
+int renameat2 (int olddirfd, const char *oldpath, int newdirfd, const char *newpath, unsigned int flags)
+{
+# ifdef __NR_renameat2
+ return (int) syscall (__NR_renameat2, olddirfd, oldpath, newdirfd, newpath, flags);
+# else
+ /* no way to implement it atomically. */
+ errno = ENOSYS;
+ return -1;
+# endif
+}
+#endif
static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces";
static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone";
static int open_files_max_fd;
fd_set open_files_set;
-
-static void __attribute__((constructor)) init()
-{
- DIR *d;
-
- /* Store how many FDs were open before the Go runtime kicked in. */
- d = opendir ("/proc/self/fd");
- if (d)
- {
- struct dirent *ent;
-
- FD_ZERO (&open_files_set);
- for (ent = readdir (d); ent; ent = readdir (d))
- {
- int fd = atoi (ent->d_name);
- if (fd != dirfd (d))
- {
- if (fd > open_files_max_fd)
- open_files_max_fd = fd;
- FD_SET (fd, &open_files_set);
- }
- }
- closedir (d);
- }
-}
-
+static uid_t rootless_uid_init;
static int
syscall_setresuid (uid_t ruid, uid_t euid, uid_t suid)
@@ -62,14 +53,18 @@ syscall_setresgid (gid_t rgid, gid_t egid, gid_t sgid)
return (int) syscall (__NR_setresgid, rgid, egid, sgid);
}
-static int
-syscall_clone (unsigned long flags, void *child_stack)
+uid_t
+rootless_uid ()
{
-#if defined(__s390__) || defined(__CRIS__)
- return (int) syscall (__NR_clone, child_stack, flags);
-#else
- return (int) syscall (__NR_clone, flags, child_stack);
-#endif
+ return rootless_uid_init;
+}
+
+static void
+do_pause ()
+{
+ prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL);
+ while (1)
+ pause ();
}
static char **
@@ -84,7 +79,10 @@ get_cmd_line_args (pid_t pid)
int i, argc = 0;
char **argv;
- sprintf (path, "/proc/%d/cmdline", pid);
+ if (pid)
+ sprintf (path, "/proc/%d/cmdline", pid);
+ else
+ strcpy (path, "/proc/self/cmdline");
fd = open (path, O_RDONLY);
if (fd < 0)
return NULL;
@@ -99,7 +97,10 @@ get_cmd_line_args (pid_t pid)
ret = read (fd, buffer + used, allocated - used);
while (ret < 0 && errno == EINTR);
if (ret < 0)
- return NULL;
+ {
+ free (buffer);
+ return NULL;
+ }
if (ret == 0)
break;
@@ -109,11 +110,12 @@ get_cmd_line_args (pid_t pid)
{
allocated += 512;
char *tmp = realloc (buffer, allocated);
- if (buffer == NULL) {
- free(buffer);
- return NULL;
- }
- buffer=tmp;
+ if (buffer == NULL)
+ {
+ free (buffer);
+ return NULL;
+ }
+ buffer = tmp;
}
}
close (fd);
@@ -122,11 +124,17 @@ get_cmd_line_args (pid_t pid)
if (buffer[i] == '\0')
argc++;
if (argc == 0)
- return NULL;
+ {
+ free (buffer);
+ return NULL;
+ }
argv = malloc (sizeof (char *) * (argc + 1));
if (argv == NULL)
- return NULL;
+ {
+ free (buffer);
+ return NULL;
+ }
argc = 0;
argv[argc++] = buffer;
@@ -139,8 +147,296 @@ get_cmd_line_args (pid_t pid)
return argv;
}
+static bool
+can_use_shortcut ()
+{
+ int argc;
+ char **argv;
+ bool ret = true;
+
+#ifdef DISABLE_JOIN_SHORTCUT
+ return false;
+#endif
+
+ argv = get_cmd_line_args (0);
+ if (argv == NULL)
+ return NULL;
+
+ for (argc = 0; argv[argc]; argc++)
+ {
+ if (argc == 0 || argv[argc][0] == '-')
+ continue;
+
+ if (strcmp (argv[argc], "mount") == 0
+ || strcmp (argv[argc], "search") == 0
+ || strcmp (argv[argc], "system") == 0)
+ {
+ ret = false;
+ break;
+ }
+ }
+
+ free (argv[0]);
+ free (argv);
+ return ret;
+}
+
+static void __attribute__((constructor)) init()
+{
+ const char *xdg_runtime_dir;
+ const char *pause;
+ DIR *d;
+
+ pause = getenv ("_PODMAN_PAUSE");
+ if (pause && pause[0])
+ {
+ do_pause ();
+ _exit (EXIT_FAILURE);
+ }
+
+ /* Store how many FDs were open before the Go runtime kicked in. */
+ d = opendir ("/proc/self/fd");
+ if (d)
+ {
+ struct dirent *ent;
+
+ FD_ZERO (&open_files_set);
+ for (ent = readdir (d); ent; ent = readdir (d))
+ {
+ int fd = atoi (ent->d_name);
+ if (fd != dirfd (d))
+ {
+ if (fd > open_files_max_fd)
+ open_files_max_fd = fd;
+ FD_SET (fd, &open_files_set);
+ }
+ }
+ closedir (d);
+ }
+
+ /* Shortcut. If we are able to join the pause pid file, do it now so we don't
+ need to re-exec. */
+ xdg_runtime_dir = getenv ("XDG_RUNTIME_DIR");
+ if (xdg_runtime_dir && xdg_runtime_dir[0] && can_use_shortcut ())
+ {
+ int r;
+ int fd;
+ long pid;
+ char buf[12];
+ uid_t uid;
+ char path[PATH_MAX];
+ const char *const suffix = "/libpod/pause.pid";
+ char *cwd = getcwd (NULL, 0);
+
+ if (cwd == NULL)
+ {
+ fprintf (stderr, "error getting current working directory: %s\n", strerror (errno));
+ _exit (EXIT_FAILURE);
+ }
+
+ if (strlen (xdg_runtime_dir) >= PATH_MAX - strlen (suffix))
+ {
+ fprintf (stderr, "invalid value for XDG_RUNTIME_DIR: %s", strerror (ENAMETOOLONG));
+ exit (EXIT_FAILURE);
+ }
+
+ sprintf (path, "%s%s", xdg_runtime_dir, suffix);
+ fd = open (path, O_RDONLY);
+ if (fd < 0)
+ {
+ free (cwd);
+ return;
+ }
+
+ r = read (fd, buf, sizeof (buf));
+ close (fd);
+ if (r < 0)
+ {
+ free (cwd);
+ return;
+ }
+ pid = strtol (buf, NULL, 10);
+ if (pid == LONG_MAX)
+ {
+ free (cwd);
+ return;
+ }
+
+ uid = geteuid ();
+
+ sprintf (path, "/proc/%d/ns/user", pid);
+ fd = open (path, O_RDONLY);
+ if (fd < 0 || setns (fd, 0) < 0)
+ {
+ free (cwd);
+ return;
+ }
+ close (fd);
+
+ /* Errors here cannot be ignored as we already joined a ns. */
+ sprintf (path, "/proc/%d/ns/mnt", pid);
+ fd = open (path, O_RDONLY);
+ if (fd < 0)
+ {
+ fprintf (stderr, "cannot open %s: %s", path, strerror (errno));
+ exit (EXIT_FAILURE);
+ }
+
+ r = setns (fd, 0);
+ if (r < 0)
+ {
+ fprintf (stderr, "cannot join mount namespace for %d: %s", pid, strerror (errno));
+ exit (EXIT_FAILURE);
+ }
+ close (fd);
+
+ if (syscall_setresgid (0, 0, 0) < 0)
+ {
+ fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
+ _exit (EXIT_FAILURE);
+ }
+
+ if (syscall_setresuid (0, 0, 0) < 0)
+ {
+ fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
+ _exit (EXIT_FAILURE);
+ }
+
+ if (chdir (cwd) < 0)
+ {
+ fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
+ _exit (EXIT_FAILURE);
+ }
+
+ free (cwd);
+ rootless_uid_init = uid;
+ }
+}
+
+static int
+syscall_clone (unsigned long flags, void *child_stack)
+{
+#if defined(__s390__) || defined(__CRIS__)
+ return (int) syscall (__NR_clone, child_stack, flags);
+#else
+ return (int) syscall (__NR_clone, flags, child_stack);
+#endif
+}
+
+static int
+create_pause_process (const char *pause_pid_file_path, char **argv)
+{
+ int r, p[2];
+
+ if (pipe (p) < 0)
+ _exit (EXIT_FAILURE);
+
+ r = fork ();
+ if (r < 0)
+ _exit (EXIT_FAILURE);
+
+ if (r)
+ {
+ char b;
+
+ close (p[1]);
+ /* Block until we write the pid file. */
+ do
+ r = read (p[0], &b, 1);
+ while (r < 0 && errno == EINTR);
+ close (p[0]);
+
+ return r == 1 && b == '0' ? 0 : -1;
+ }
+ else
+ {
+ int fd;
+ pid_t pid;
+
+ close (p[0]);
+
+ setsid ();
+ pid = fork ();
+ if (r < 0)
+ _exit (EXIT_FAILURE);
+
+ if (pid)
+ {
+ char pid_str[12];
+ char *tmp_file_path = NULL;
+
+ sprintf (pid_str, "%d", pid);
+
+ asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path);
+ if (tmp_file_path == NULL)
+ {
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+
+ fd = mkstemp (tmp_file_path);
+ if (fd < 0)
+ {
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+
+ do
+ r = write (fd, pid_str, strlen (pid_str));
+ while (r < 0 && errno == EINTR);
+ if (r < 0)
+ {
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+ close (fd);
+
+ /* There can be another process at this point trying to configure the user namespace and the pause
+ process, do not override the pid file if it already exists. */
+ if (renameat2 (AT_FDCWD, tmp_file_path, AT_FDCWD, pause_pid_file_path, RENAME_NOREPLACE) < 0)
+ {
+ unlink (tmp_file_path);
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+
+ do
+ r = write (p[1], "0", 1);
+ while (r < 0 && errno == EINTR);
+ close (p[1]);
+
+ _exit (EXIT_SUCCESS);
+ }
+ else
+ {
+ int null;
+
+ close (p[1]);
+
+ null = open ("/dev/null", O_RDWR);
+ if (null >= 0)
+ {
+ dup2 (null, 0);
+ dup2 (null, 1);
+ dup2 (null, 2);
+ close (null);
+ }
+
+ for (fd = 3; fd < open_files_max_fd + 16; fd++)
+ close (fd);
+
+ setenv ("_PODMAN_PAUSE", "1", 1);
+ execlp (argv[0], NULL);
+
+ /* If the execve fails, then do the pause here. */
+ do_pause ();
+ _exit (EXIT_FAILURE);
+ }
+ }
+}
+
int
-reexec_userns_join (int userns, int mountns)
+reexec_userns_join (int userns, int mountns, char *pause_pid_file_path)
{
pid_t ppid = getpid ();
char uid[16];
@@ -200,7 +496,7 @@ reexec_userns_join (int userns, int mountns)
fprintf (stderr, "cannot setns: %s\n", strerror (errno));
_exit (EXIT_FAILURE);
}
- close (userns);
+ close (mountns);
if (syscall_setresgid (0, 0, 0) < 0)
{
@@ -221,6 +517,12 @@ reexec_userns_join (int userns, int mountns)
}
free (cwd);
+ if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
+ {
+ /* We ignore errors here as we didn't create the namespace anyway. */
+ create_pause_process (pause_pid_file_path, argv);
+ }
+
execvp (argv[0], argv);
_exit (EXIT_FAILURE);
@@ -246,7 +548,7 @@ check_proc_sys_userns_file (const char *path)
}
int
-reexec_in_user_namespace (int ready)
+reexec_in_user_namespace (int ready, char *pause_pid_file_path)
{
int ret;
pid_t pid;
@@ -328,29 +630,45 @@ reexec_in_user_namespace (int ready)
fprintf (stderr, "cannot read from sync pipe: %s\n", strerror (errno));
_exit (EXIT_FAILURE);
}
- close (ready);
- if (b != '1')
+ if (b != '0')
_exit (EXIT_FAILURE);
if (syscall_setresgid (0, 0, 0) < 0)
{
fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
if (syscall_setresuid (0, 0, 0) < 0)
{
fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
if (chdir (cwd) < 0)
{
fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
free (cwd);
+ if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
+ {
+ if (create_pause_process (pause_pid_file_path, argv) < 0)
+ {
+ write (ready, "2", 1);
+ _exit (EXIT_FAILURE);
+ }
+ }
+
+ do
+ ret = write (ready, "0", 1) < 0;
+ while (ret < 0 && errno == EINTR);
+ close (ready);
+
execvp (argv[0], argv);
_exit (EXIT_FAILURE);