summaryrefslogtreecommitdiff
path: root/pkg/rootless/rootless_linux.c
diff options
context:
space:
mode:
authorGiuseppe Scrivano <gscrivan@redhat.com>2019-05-08 13:49:07 +0200
committerGiuseppe Scrivano <gscrivan@redhat.com>2019-05-17 20:48:24 +0200
commit791d53a21421fba249156ea3a503e9e04a4912e4 (patch)
treed56e5f5ec94837075fb006b79891c9eabbe3b651 /pkg/rootless/rootless_linux.c
parent2e0fef51b3928337ef46629b4627ff1700a918d1 (diff)
downloadpodman-791d53a21421fba249156ea3a503e9e04a4912e4.tar.gz
podman-791d53a21421fba249156ea3a503e9e04a4912e4.tar.bz2
podman-791d53a21421fba249156ea3a503e9e04a4912e4.zip
rootless: use a pause process
use a pause process to keep the user and mount namespace alive. The pause process is created immediately on reload, and all successive Podman processes will refer to it for joining the user&mount namespace. This solves all the race conditions we had on joining the correct namespaces using the conmon processes. As a fallback if the join fails for any reason (e.g. the pause process was killed), then we try to join the running containers as we were doing before. Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
Diffstat (limited to 'pkg/rootless/rootless_linux.c')
-rw-r--r--pkg/rootless/rootless_linux.c240
1 files changed, 203 insertions, 37 deletions
diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c
index 1d32b1adb..b87deb86e 100644
--- a/pkg/rootless/rootless_linux.c
+++ b/pkg/rootless/rootless_linux.c
@@ -17,6 +17,22 @@
#include <sys/prctl.h>
#include <dirent.h>
#include <sys/select.h>
+#include <stdio.h>
+
+#ifndef RENAME_NOREPLACE
+# define RENAME_NOREPLACE (1 << 0)
+
+int renameat2 (int olddirfd, const char *oldpath, int newdirfd, const char *newpath, unsigned int flags)
+{
+# ifdef __NR_renameat2
+ return (int) syscall (__NR_renameat2, olddirfd, oldpath, newdirfd, newpath, flags);
+# else
+ /* no way to implement it atomically. */
+ errno = ENOSYS;
+ return -1;
+# endif
+}
+#endif
static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces";
static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone";
@@ -24,32 +40,6 @@ static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivilege
static int open_files_max_fd;
fd_set open_files_set;
-static void __attribute__((constructor)) init()
-{
- DIR *d;
-
- /* Store how many FDs were open before the Go runtime kicked in. */
- d = opendir ("/proc/self/fd");
- if (d)
- {
- struct dirent *ent;
-
- FD_ZERO (&open_files_set);
- for (ent = readdir (d); ent; ent = readdir (d))
- {
- int fd = atoi (ent->d_name);
- if (fd != dirfd (d))
- {
- if (fd > open_files_max_fd)
- open_files_max_fd = fd;
- FD_SET (fd, &open_files_set);
- }
- }
- closedir (d);
- }
-}
-
-
static int
syscall_setresuid (uid_t ruid, uid_t euid, uid_t suid)
{
@@ -62,14 +52,12 @@ syscall_setresgid (gid_t rgid, gid_t egid, gid_t sgid)
return (int) syscall (__NR_setresgid, rgid, egid, sgid);
}
-static int
-syscall_clone (unsigned long flags, void *child_stack)
+static void
+do_pause ()
{
-#if defined(__s390__) || defined(__CRIS__)
- return (int) syscall (__NR_clone, child_stack, flags);
-#else
- return (int) syscall (__NR_clone, flags, child_stack);
-#endif
+ prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL);
+ while (1)
+ pause ();
}
static char **
@@ -139,8 +127,164 @@ get_cmd_line_args (pid_t pid)
return argv;
}
+static void __attribute__((constructor)) init()
+{
+ const char *xdg_runtime_dir;
+ const char *pause;
+ DIR *d;
+
+ pause = getenv ("_PODMAN_PAUSE");
+ if (pause && pause[0])
+ {
+ do_pause ();
+ _exit (EXIT_FAILURE);
+ }
+
+ /* Store how many FDs were open before the Go runtime kicked in. */
+ d = opendir ("/proc/self/fd");
+ if (d)
+ {
+ struct dirent *ent;
+
+ FD_ZERO (&open_files_set);
+ for (ent = readdir (d); ent; ent = readdir (d))
+ {
+ int fd = atoi (ent->d_name);
+ if (fd != dirfd (d))
+ {
+ if (fd > open_files_max_fd)
+ open_files_max_fd = fd;
+ FD_SET (fd, &open_files_set);
+ }
+ }
+ closedir (d);
+ }
+}
+
+static int
+syscall_clone (unsigned long flags, void *child_stack)
+{
+#if defined(__s390__) || defined(__CRIS__)
+ return (int) syscall (__NR_clone, child_stack, flags);
+#else
+ return (int) syscall (__NR_clone, flags, child_stack);
+#endif
+}
+
+static int
+create_pause_process (const char *pause_pid_file_path, char **argv)
+{
+ int r, p[2];
+
+ if (pipe (p) < 0)
+ _exit (EXIT_FAILURE);
+
+ r = fork ();
+ if (r < 0)
+ _exit (EXIT_FAILURE);
+
+ if (r)
+ {
+ char b;
+
+ close (p[1]);
+ /* Block until we write the pid file. */
+ do
+ r = read (p[0], &b, 1);
+ while (r < 0 && errno == EINTR);
+ close (p[0]);
+
+ return r == 1 && b == '0' ? 0 : -1;
+ }
+ else
+ {
+ int fd;
+ pid_t pid;
+
+ close (p[0]);
+
+ setsid ();
+ pid = fork ();
+ if (r < 0)
+ _exit (EXIT_FAILURE);
+
+ if (pid)
+ {
+ char pid_str[12];
+ char *tmp_file_path = NULL;
+
+ sprintf (pid_str, "%d", pid);
+
+ asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path);
+ if (tmp_file_path == NULL)
+ {
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+
+ fd = mkstemp (tmp_file_path);
+ if (fd < 0)
+ {
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+
+ do
+ r = write (fd, pid_str, strlen (pid_str));
+ while (r < 0 && errno == EINTR);
+ if (r < 0)
+ {
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+ close (fd);
+
+ /* There can be another process at this point trying to configure the user namespace and the pause
+ process, do not override the pid file if it already exists. */
+ if (renameat2 (AT_FDCWD, tmp_file_path, AT_FDCWD, pause_pid_file_path, RENAME_NOREPLACE) < 0)
+ {
+ unlink (tmp_file_path);
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+
+ do
+ r = write (p[1], "0", 1);
+ while (r < 0 && errno == EINTR);
+ close (p[1]);
+
+ _exit (EXIT_SUCCESS);
+ }
+ else
+ {
+ int null;
+
+ close (p[1]);
+
+ null = open ("/dev/null", O_RDWR);
+ if (null >= 0)
+ {
+ dup2 (null, 0);
+ dup2 (null, 1);
+ dup2 (null, 2);
+ close (null);
+ }
+
+ for (fd = 3; fd < open_files_max_fd + 16; fd++)
+ close (fd);
+
+ setenv ("_PODMAN_PAUSE", "1", 1);
+ execlp (argv[0], NULL);
+
+ /* If the execve fails, then do the pause here. */
+ do_pause ();
+ _exit (EXIT_FAILURE);
+ }
+ }
+}
+
int
-reexec_userns_join (int userns, int mountns)
+reexec_userns_join (int userns, int mountns, char *pause_pid_file_path)
{
pid_t ppid = getpid ();
char uid[16];
@@ -221,6 +365,12 @@ reexec_userns_join (int userns, int mountns)
}
free (cwd);
+ if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
+ {
+ /* We ignore errors here as we didn't create the namespace anyway. */
+ create_pause_process (pause_pid_file_path, argv);
+ }
+
execvp (argv[0], argv);
_exit (EXIT_FAILURE);
@@ -246,7 +396,7 @@ check_proc_sys_userns_file (const char *path)
}
int
-reexec_in_user_namespace (int ready)
+reexec_in_user_namespace (int ready, char *pause_pid_file_path)
{
int ret;
pid_t pid;
@@ -328,29 +478,45 @@ reexec_in_user_namespace (int ready)
fprintf (stderr, "cannot read from sync pipe: %s\n", strerror (errno));
_exit (EXIT_FAILURE);
}
- close (ready);
- if (b != '1')
+ if (b != '0')
_exit (EXIT_FAILURE);
if (syscall_setresgid (0, 0, 0) < 0)
{
fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
if (syscall_setresuid (0, 0, 0) < 0)
{
fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
if (chdir (cwd) < 0)
{
fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
free (cwd);
+ if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
+ {
+ if (create_pause_process (pause_pid_file_path, argv) < 0)
+ {
+ write (ready, "2", 1);
+ _exit (EXIT_FAILURE);
+ }
+ }
+
+ do
+ ret = write (ready, "0", 1) < 0;
+ while (ret < 0 && errno == EINTR);
+ close (ready);
+
execvp (argv[0], argv);
_exit (EXIT_FAILURE);