summaryrefslogtreecommitdiff
path: root/pkg/rootless
diff options
context:
space:
mode:
authorGiuseppe Scrivano <gscrivan@redhat.com>2019-05-08 13:49:07 +0200
committerGiuseppe Scrivano <gscrivan@redhat.com>2019-05-17 20:48:24 +0200
commit791d53a21421fba249156ea3a503e9e04a4912e4 (patch)
treed56e5f5ec94837075fb006b79891c9eabbe3b651 /pkg/rootless
parent2e0fef51b3928337ef46629b4627ff1700a918d1 (diff)
downloadpodman-791d53a21421fba249156ea3a503e9e04a4912e4.tar.gz
podman-791d53a21421fba249156ea3a503e9e04a4912e4.tar.bz2
podman-791d53a21421fba249156ea3a503e9e04a4912e4.zip
rootless: use a pause process
use a pause process to keep the user and mount namespace alive. The pause process is created immediately on reload, and all successive Podman processes will refer to it for joining the user&mount namespace. This solves all the race conditions we had on joining the correct namespaces using the conmon processes. As a fallback if the join fails for any reason (e.g. the pause process was killed), then we try to join the running containers as we were doing before. Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
Diffstat (limited to 'pkg/rootless')
-rw-r--r--pkg/rootless/rootless_linux.c240
-rw-r--r--pkg/rootless/rootless_linux.go49
-rw-r--r--pkg/rootless/rootless_unsupported.go4
3 files changed, 246 insertions, 47 deletions
diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c
index 1d32b1adb..b87deb86e 100644
--- a/pkg/rootless/rootless_linux.c
+++ b/pkg/rootless/rootless_linux.c
@@ -17,6 +17,22 @@
#include <sys/prctl.h>
#include <dirent.h>
#include <sys/select.h>
+#include <stdio.h>
+
+#ifndef RENAME_NOREPLACE
+# define RENAME_NOREPLACE (1 << 0)
+
+int renameat2 (int olddirfd, const char *oldpath, int newdirfd, const char *newpath, unsigned int flags)
+{
+# ifdef __NR_renameat2
+ return (int) syscall (__NR_renameat2, olddirfd, oldpath, newdirfd, newpath, flags);
+# else
+ /* no way to implement it atomically. */
+ errno = ENOSYS;
+ return -1;
+# endif
+}
+#endif
static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces";
static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone";
@@ -24,32 +40,6 @@ static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivilege
static int open_files_max_fd;
fd_set open_files_set;
-static void __attribute__((constructor)) init()
-{
- DIR *d;
-
- /* Store how many FDs were open before the Go runtime kicked in. */
- d = opendir ("/proc/self/fd");
- if (d)
- {
- struct dirent *ent;
-
- FD_ZERO (&open_files_set);
- for (ent = readdir (d); ent; ent = readdir (d))
- {
- int fd = atoi (ent->d_name);
- if (fd != dirfd (d))
- {
- if (fd > open_files_max_fd)
- open_files_max_fd = fd;
- FD_SET (fd, &open_files_set);
- }
- }
- closedir (d);
- }
-}
-
-
static int
syscall_setresuid (uid_t ruid, uid_t euid, uid_t suid)
{
@@ -62,14 +52,12 @@ syscall_setresgid (gid_t rgid, gid_t egid, gid_t sgid)
return (int) syscall (__NR_setresgid, rgid, egid, sgid);
}
-static int
-syscall_clone (unsigned long flags, void *child_stack)
+static void
+do_pause ()
{
-#if defined(__s390__) || defined(__CRIS__)
- return (int) syscall (__NR_clone, child_stack, flags);
-#else
- return (int) syscall (__NR_clone, flags, child_stack);
-#endif
+ prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL);
+ while (1)
+ pause ();
}
static char **
@@ -139,8 +127,164 @@ get_cmd_line_args (pid_t pid)
return argv;
}
+static void __attribute__((constructor)) init()
+{
+ const char *xdg_runtime_dir;
+ const char *pause;
+ DIR *d;
+
+ pause = getenv ("_PODMAN_PAUSE");
+ if (pause && pause[0])
+ {
+ do_pause ();
+ _exit (EXIT_FAILURE);
+ }
+
+ /* Store how many FDs were open before the Go runtime kicked in. */
+ d = opendir ("/proc/self/fd");
+ if (d)
+ {
+ struct dirent *ent;
+
+ FD_ZERO (&open_files_set);
+ for (ent = readdir (d); ent; ent = readdir (d))
+ {
+ int fd = atoi (ent->d_name);
+ if (fd != dirfd (d))
+ {
+ if (fd > open_files_max_fd)
+ open_files_max_fd = fd;
+ FD_SET (fd, &open_files_set);
+ }
+ }
+ closedir (d);
+ }
+}
+
+static int
+syscall_clone (unsigned long flags, void *child_stack)
+{
+#if defined(__s390__) || defined(__CRIS__)
+ return (int) syscall (__NR_clone, child_stack, flags);
+#else
+ return (int) syscall (__NR_clone, flags, child_stack);
+#endif
+}
+
+static int
+create_pause_process (const char *pause_pid_file_path, char **argv)
+{
+ int r, p[2];
+
+ if (pipe (p) < 0)
+ _exit (EXIT_FAILURE);
+
+ r = fork ();
+ if (r < 0)
+ _exit (EXIT_FAILURE);
+
+ if (r)
+ {
+ char b;
+
+ close (p[1]);
+ /* Block until we write the pid file. */
+ do
+ r = read (p[0], &b, 1);
+ while (r < 0 && errno == EINTR);
+ close (p[0]);
+
+ return r == 1 && b == '0' ? 0 : -1;
+ }
+ else
+ {
+ int fd;
+ pid_t pid;
+
+ close (p[0]);
+
+ setsid ();
+ pid = fork ();
+ if (r < 0)
+ _exit (EXIT_FAILURE);
+
+ if (pid)
+ {
+ char pid_str[12];
+ char *tmp_file_path = NULL;
+
+ sprintf (pid_str, "%d", pid);
+
+ asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path);
+ if (tmp_file_path == NULL)
+ {
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+
+ fd = mkstemp (tmp_file_path);
+ if (fd < 0)
+ {
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+
+ do
+ r = write (fd, pid_str, strlen (pid_str));
+ while (r < 0 && errno == EINTR);
+ if (r < 0)
+ {
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+ close (fd);
+
+ /* There can be another process at this point trying to configure the user namespace and the pause
+ process, do not override the pid file if it already exists. */
+ if (renameat2 (AT_FDCWD, tmp_file_path, AT_FDCWD, pause_pid_file_path, RENAME_NOREPLACE) < 0)
+ {
+ unlink (tmp_file_path);
+ kill (pid, SIGKILL);
+ _exit (EXIT_FAILURE);
+ }
+
+ do
+ r = write (p[1], "0", 1);
+ while (r < 0 && errno == EINTR);
+ close (p[1]);
+
+ _exit (EXIT_SUCCESS);
+ }
+ else
+ {
+ int null;
+
+ close (p[1]);
+
+ null = open ("/dev/null", O_RDWR);
+ if (null >= 0)
+ {
+ dup2 (null, 0);
+ dup2 (null, 1);
+ dup2 (null, 2);
+ close (null);
+ }
+
+ for (fd = 3; fd < open_files_max_fd + 16; fd++)
+ close (fd);
+
+ setenv ("_PODMAN_PAUSE", "1", 1);
+ execlp (argv[0], NULL);
+
+ /* If the execve fails, then do the pause here. */
+ do_pause ();
+ _exit (EXIT_FAILURE);
+ }
+ }
+}
+
int
-reexec_userns_join (int userns, int mountns)
+reexec_userns_join (int userns, int mountns, char *pause_pid_file_path)
{
pid_t ppid = getpid ();
char uid[16];
@@ -221,6 +365,12 @@ reexec_userns_join (int userns, int mountns)
}
free (cwd);
+ if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
+ {
+ /* We ignore errors here as we didn't create the namespace anyway. */
+ create_pause_process (pause_pid_file_path, argv);
+ }
+
execvp (argv[0], argv);
_exit (EXIT_FAILURE);
@@ -246,7 +396,7 @@ check_proc_sys_userns_file (const char *path)
}
int
-reexec_in_user_namespace (int ready)
+reexec_in_user_namespace (int ready, char *pause_pid_file_path)
{
int ret;
pid_t pid;
@@ -328,29 +478,45 @@ reexec_in_user_namespace (int ready)
fprintf (stderr, "cannot read from sync pipe: %s\n", strerror (errno));
_exit (EXIT_FAILURE);
}
- close (ready);
- if (b != '1')
+ if (b != '0')
_exit (EXIT_FAILURE);
if (syscall_setresgid (0, 0, 0) < 0)
{
fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
if (syscall_setresuid (0, 0, 0) < 0)
{
fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
if (chdir (cwd) < 0)
{
fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
free (cwd);
+ if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
+ {
+ if (create_pause_process (pause_pid_file_path, argv) < 0)
+ {
+ write (ready, "2", 1);
+ _exit (EXIT_FAILURE);
+ }
+ }
+
+ do
+ ret = write (ready, "0", 1) < 0;
+ while (ret < 0 && errno == EINTR);
+ close (ready);
+
execvp (argv[0], argv);
_exit (EXIT_FAILURE);
diff --git a/pkg/rootless/rootless_linux.go b/pkg/rootless/rootless_linux.go
index 2c99f41a4..0390bbb6a 100644
--- a/pkg/rootless/rootless_linux.go
+++ b/pkg/rootless/rootless_linux.go
@@ -22,9 +22,10 @@ import (
)
/*
-extern int reexec_in_user_namespace(int ready);
+#include <stdlib.h>
+extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path);
extern int reexec_in_user_namespace_wait(int pid);
-extern int reexec_userns_join(int userns, int mountns);
+extern int reexec_userns_join(int userns, int mountns, char *pause_pid_file_path);
*/
import "C"
@@ -168,11 +169,14 @@ func getUserNSFirstChild(fd uintptr) (*os.File, error) {
// JoinUserAndMountNS re-exec podman in a new userNS and join the user and mount
// namespace of the specified PID without looking up its parent. Useful to join directly
// the conmon process.
-func JoinUserAndMountNS(pid uint) (bool, int, error) {
+func JoinUserAndMountNS(pid uint, pausePid string) (bool, int, error) {
if os.Geteuid() == 0 || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" {
return false, -1, nil
}
+ cPausePid := C.CString(pausePid)
+ defer C.free(unsafe.Pointer(cPausePid))
+
userNS, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", pid))
if err != nil {
return false, -1, err
@@ -189,7 +193,7 @@ func JoinUserAndMountNS(pid uint) (bool, int, error) {
if err != nil {
return false, -1, err
}
- pidC := C.reexec_userns_join(C.int(fd.Fd()), C.int(mountNS.Fd()))
+ pidC := C.reexec_userns_join(C.int(fd.Fd()), C.int(mountNS.Fd()), cPausePid)
if int(pidC) < 0 {
return false, -1, errors.Errorf("cannot re-exec process")
}
@@ -206,7 +210,7 @@ func JoinUserAndMountNS(pid uint) (bool, int, error) {
// into a new user namespace and the return code from the re-executed podman process.
// If podman was re-executed the caller needs to propagate the error code returned by the child
// process.
-func BecomeRootInUserNS() (bool, int, error) {
+func BecomeRootInUserNS(pausePid string) (bool, int, error) {
if os.Geteuid() == 0 || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" {
if os.Getenv("_CONTAINERS_USERNS_CONFIGURED") == "init" {
return false, 0, runInUser()
@@ -214,18 +218,23 @@ func BecomeRootInUserNS() (bool, int, error) {
return false, 0, nil
}
+ cPausePid := C.CString(pausePid)
+ defer C.free(unsafe.Pointer(cPausePid))
+
runtime.LockOSThread()
defer runtime.UnlockOSThread()
- r, w, err := os.Pipe()
+ fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0)
if err != nil {
return false, -1, err
}
+ r, w := os.NewFile(uintptr(fds[0]), "sync host"), os.NewFile(uintptr(fds[1]), "sync child")
+
defer r.Close()
defer w.Close()
defer w.Write([]byte("0"))
- pidC := C.reexec_in_user_namespace(C.int(r.Fd()))
+ pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid)
pid := int(pidC)
if pid < 0 {
return false, -1, errors.Errorf("cannot re-exec process")
@@ -280,11 +289,35 @@ func BecomeRootInUserNS() (bool, int, error) {
}
}
- _, err = w.Write([]byte("1"))
+ _, err = w.Write([]byte("0"))
if err != nil {
return false, -1, errors.Wrapf(err, "write to sync pipe")
}
+ b := make([]byte, 1, 1)
+ _, err = w.Read(b)
+ if err != nil {
+ return false, -1, errors.Wrapf(err, "read from sync pipe")
+ }
+
+ if b[0] == '2' {
+ // We have lost the race for writing the PID file, as probably another
+ // process created a namespace and wrote the PID.
+ // Try to join it.
+ data, err := ioutil.ReadFile(pausePid)
+ if err == nil {
+ pid, err := strconv.ParseUint(string(data), 10, 0)
+ if err == nil {
+ return JoinUserAndMountNS(uint(pid), "")
+ }
+ }
+ return false, -1, errors.Wrapf(err, "error setting up the process")
+ }
+
+ if b[0] != '0' {
+ return false, -1, errors.Wrapf(err, "error setting up the process")
+ }
+
c := make(chan os.Signal, 1)
signals := []os.Signal{}
diff --git a/pkg/rootless/rootless_unsupported.go b/pkg/rootless/rootless_unsupported.go
index 47b5dd7cc..42f8f3aec 100644
--- a/pkg/rootless/rootless_unsupported.go
+++ b/pkg/rootless/rootless_unsupported.go
@@ -15,7 +15,7 @@ func IsRootless() bool {
// into a new user namespace and the return code from the re-executed podman process.
// If podman was re-executed the caller needs to propagate the error code returned by the child
// process. It is a convenience function for BecomeRootInUserNSWithOpts with a default configuration.
-func BecomeRootInUserNS() (bool, int, error) {
+func BecomeRootInUserNS(pausePid string) (bool, int, error) {
return false, -1, errors.New("this function is not supported on this os")
}
@@ -28,6 +28,6 @@ func GetRootlessUID() int {
// namespace of the specified PID without looking up its parent. Useful to join directly
// the conmon process. It is a convenience function for JoinUserAndMountNSWithOpts
// with a default configuration.
-func JoinUserAndMountNS(pid uint) (bool, int, error) {
+func JoinUserAndMountNS(pid uint, pausePid string) (bool, int, error) {
return false, -1, errors.New("this function is not supported on this os")
}