diff options
Diffstat (limited to 'pkg/rootless/rootless_linux.c')
-rw-r--r-- | pkg/rootless/rootless_linux.c | 240 |
1 files changed, 203 insertions, 37 deletions
diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c
index 1d32b1adb..b87deb86e 100644
--- a/pkg/rootless/rootless_linux.c
+++ b/pkg/rootless/rootless_linux.c
@@ -17,6 +17,22 @@
 #include <sys/prctl.h>
 #include <dirent.h>
 #include <sys/select.h>
+#include <stdio.h>
+
+#ifndef RENAME_NOREPLACE
+# define RENAME_NOREPLACE (1 << 0)
+
+int renameat2 (int olddirfd, const char *oldpath, int newdirfd, const char *newpath, unsigned int flags)
+{
+# ifdef __NR_renameat2
+  return (int) syscall (__NR_renameat2, olddirfd, oldpath, newdirfd, newpath, flags);
+# else
+  /* no way to implement it atomically. */
+  errno = ENOSYS;
+  return -1;
+# endif
+}
+#endif
 
 static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces";
 static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone";
@@ -24,32 +40,6 @@ static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivilege
 static int open_files_max_fd;
 fd_set open_files_set;
 
-static void __attribute__((constructor)) init()
-{
-  DIR *d;
-
-  /* Store how many FDs were open before the Go runtime kicked in.  */
-  d = opendir ("/proc/self/fd");
-  if (d)
-    {
-      struct dirent *ent;
-
-      FD_ZERO (&open_files_set);
-      for (ent = readdir (d); ent; ent = readdir (d))
-        {
-          int fd = atoi (ent->d_name);
-          if (fd != dirfd (d))
-            {
-              if (fd > open_files_max_fd)
-                open_files_max_fd = fd;
-              FD_SET (fd, &open_files_set);
-            }
-        }
-      closedir (d);
-    }
-}
-
-
 static int
 syscall_setresuid (uid_t ruid, uid_t euid, uid_t suid)
 {
@@ -62,14 +52,12 @@ syscall_setresgid (gid_t rgid, gid_t egid, gid_t sgid)
   return (int) syscall (__NR_setresgid, rgid, egid, sgid);
 }
 
-static int
-syscall_clone (unsigned long flags, void *child_stack)
+static void
+do_pause ()
 {
-#if defined(__s390__) || defined(__CRIS__)
-  return (int) syscall (__NR_clone, child_stack, flags);
-#else
-  return (int) syscall (__NR_clone, flags, child_stack);
-#endif
+  prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL);
+  while (1)
+    pause ();
 }
 
 static char **
@@ -139,8 +127,164 @@ get_cmd_line_args (pid_t pid)
   return argv;
 }
 
+static void __attribute__((constructor)) init()
+{
+  const char *xdg_runtime_dir;
+  const char *pause;
+  DIR *d;
+
+  pause = getenv ("_PODMAN_PAUSE");
+  if (pause && pause[0])
+    {
+      do_pause ();
+      _exit (EXIT_FAILURE);
+    }
+
+  /* Store how many FDs were open before the Go runtime kicked in.  */
+  d = opendir ("/proc/self/fd");
+  if (d)
+    {
+      struct dirent *ent;
+
+      FD_ZERO (&open_files_set);
+      for (ent = readdir (d); ent; ent = readdir (d))
+        {
+          int fd = atoi (ent->d_name);
+          if (fd != dirfd (d))
+            {
+              if (fd > open_files_max_fd)
+                open_files_max_fd = fd;
+              FD_SET (fd, &open_files_set);
+            }
+        }
+      closedir (d);
+    }
+}
+
+static int
+syscall_clone (unsigned long flags, void *child_stack)
+{
+#if defined(__s390__) || defined(__CRIS__)
+  return (int) syscall (__NR_clone, child_stack, flags);
+#else
+  return (int) syscall (__NR_clone, flags, child_stack);
+#endif
+}
+
+static int
+create_pause_process (const char *pause_pid_file_path, char **argv)
+{
+  int r, p[2];
+
+  if (pipe (p) < 0)
+    _exit (EXIT_FAILURE);
+
+  r = fork ();
+  if (r < 0)
+    _exit (EXIT_FAILURE);
+
+  if (r)
+    {
+      char b;
+
+      close (p[1]);
+      /* Block until we write the pid file. */
+      do
+        r = read (p[0], &b, 1);
+      while (r < 0 && errno == EINTR);
+      close (p[0]);
+
+      return r == 1 && b == '0' ? 0 : -1;
+    }
+  else
+    {
+      int fd;
+      pid_t pid;
+
+      close (p[0]);
+
+      setsid ();
+      pid = fork ();
+      if (pid < 0) /* test the second fork's result, not the stale r */
+        _exit (EXIT_FAILURE);
+
+      if (pid)
+        {
+          char pid_str[12];
+          char *tmp_file_path = NULL;
+
+          sprintf (pid_str, "%d", pid);
+
+          asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path);
+          if (tmp_file_path == NULL)
+            {
+              kill (pid, SIGKILL);
+              _exit (EXIT_FAILURE);
+            }
+
+          fd = mkstemp (tmp_file_path);
+          if (fd < 0)
+            {
+              kill (pid, SIGKILL);
+              _exit (EXIT_FAILURE);
+            }
+
+          do
+            r = write (fd, pid_str, strlen (pid_str));
+          while (r < 0 && errno == EINTR);
+          if (r < 0)
+            {
+              kill (pid, SIGKILL);
+              _exit (EXIT_FAILURE);
+            }
+          close (fd);
+
+          /* There can be another process at this point trying to configure the user namespace and the pause
+           process, do not override the pid file if it already exists.  */
+          if (renameat2 (AT_FDCWD, tmp_file_path, AT_FDCWD, pause_pid_file_path, RENAME_NOREPLACE) < 0)
+            {
+              unlink (tmp_file_path);
+              kill (pid, SIGKILL);
+              _exit (EXIT_FAILURE);
+            }
+
+          do
+            r = write (p[1], "0", 1);
+          while (r < 0 && errno == EINTR);
+          close (p[1]);
+
+          _exit (EXIT_SUCCESS);
+        }
+      else
+        {
+          int null;
+
+          close (p[1]);
+
+          null = open ("/dev/null", O_RDWR);
+          if (null >= 0)
+            {
+              dup2 (null, 0);
+              dup2 (null, 1);
+              dup2 (null, 2);
+              close (null);
+            }
+
+          for (fd = 3; fd < open_files_max_fd + 16; fd++)
+            close (fd);
+
+          setenv ("_PODMAN_PAUSE", "1", 1);
+          execlp (argv[0], argv[0], (char *) NULL); /* arg0 must be a valid string, not NULL */
+
+          /* If the execve fails, then do the pause here. */
+          do_pause ();
+          _exit (EXIT_FAILURE);
+        }
+    }
+}
+
 int
-reexec_userns_join (int userns, int mountns)
+reexec_userns_join (int userns, int mountns, char *pause_pid_file_path)
 {
   pid_t ppid = getpid ();
   char uid[16];
@@ -221,6 +365,12 @@ reexec_userns_join (int userns, int mountns)
     }
   free (cwd);
 
+  if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
+    {
+      /* We ignore errors here as we didn't create the namespace anyway.  */
+      create_pause_process (pause_pid_file_path, argv);
+    }
+
   execvp (argv[0], argv);
 
   _exit (EXIT_FAILURE);
@@ -246,7 +396,7 @@ check_proc_sys_userns_file (const char *path)
 }
 
 int
-reexec_in_user_namespace (int ready)
+reexec_in_user_namespace (int ready, char *pause_pid_file_path)
 {
   int ret;
   pid_t pid;
@@ -328,29 +478,45 @@ reexec_in_user_namespace (int ready)
       fprintf (stderr, "cannot read from sync pipe: %s\n", strerror (errno));
       _exit (EXIT_FAILURE);
     }
-  close (ready);
-  if (b != '1')
+  if (b != '0')
     _exit (EXIT_FAILURE);
 
   if (syscall_setresgid (0, 0, 0) < 0)
     {
       fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
+      write (ready, "1", 1);
       _exit (EXIT_FAILURE);
     }
 
   if (syscall_setresuid (0, 0, 0) < 0)
     {
       fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
+      write (ready, "1", 1);
       _exit (EXIT_FAILURE);
     }
 
   if (chdir (cwd) < 0)
     {
       fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
+      write (ready, "1", 1);
       _exit (EXIT_FAILURE);
     }
   free (cwd);
 
+  if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
+    {
+      if (create_pause_process (pause_pid_file_path, argv) < 0)
+        {
+          write (ready, "2", 1);
+          _exit (EXIT_FAILURE);
+        }
+    }
+
+  do
+    ret = write (ready, "0", 1); /* keep the raw result so EINTR is actually retried */
+  while (ret < 0 && errno == EINTR);
+  close (ready);
+
   execvp (argv[0], argv);
 
   _exit (EXIT_FAILURE);