diff options
Diffstat (limited to 'pkg/rootless')
-rw-r--r-- | pkg/rootless/rootless_linux.c | 211 | ||||
-rw-r--r-- | pkg/rootless/rootless_linux.go | 136 | ||||
-rw-r--r-- | pkg/rootless/rootless_unsupported.go | 14 |
3 files changed, 289 insertions, 72 deletions
diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c index 098ca7830..eb62d55e9 100644 --- a/pkg/rootless/rootless_linux.c +++ b/pkg/rootless/rootless_linux.c @@ -34,6 +34,15 @@ int renameat2 (int olddirfd, const char *oldpath, int newdirfd, const char *newp } #endif +#ifndef TEMP_FAILURE_RETRY +#define TEMP_FAILURE_RETRY(expression) \ + (__extension__ \ + ({ long int __result; \ + do __result = (long int) (expression); \ + while (__result == -1L && errno == EINTR); \ + __result; })) +#endif + static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces"; static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone"; @@ -69,6 +78,19 @@ rootless_gid () static void do_pause () { + int i; + struct sigaction act; + int const sig[] = + { + SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM, SIGPOLL, + SIGPROF, SIGVTALRM, SIGXCPU, SIGXFSZ, 0 + }; + + act.sa_handler = SIG_IGN; + + for (i = 0; sig[i]; i++) + sigaction (sig[i], &act, NULL); + prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL); while (1) pause (); @@ -100,9 +122,7 @@ get_cmd_line_args (pid_t pid) return NULL; for (;;) { - do - ret = read (fd, buffer + used, allocated - used); - while (ret < 0 && errno == EINTR); + ret = TEMP_FAILURE_RETRY (read (fd, buffer + used, allocated - used)); if (ret < 0) { free (buffer); @@ -167,7 +187,7 @@ can_use_shortcut () argv = get_cmd_line_args (0); if (argv == NULL) - return NULL; + return false; for (argc = 0; argv[argc]; argc++) { @@ -256,13 +276,15 @@ static void __attribute__((constructor)) init() return; } - r = read (fd, buf, sizeof (buf)); + r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof (buf))); close (fd); if (r < 0) { free (cwd); return; } + buf[r] = '\0'; + pid = strtol (buf, NULL, 10); if (pid == LONG_MAX) { @@ -333,6 +355,23 @@ syscall_clone (unsigned long flags, void *child_stack) #endif } +int +reexec_in_user_namespace_wait (int pid, int options) +{ + pid_t p; + int status; + + p = TEMP_FAILURE_RETRY (waitpid (pid, &status, 0)); + if (p < 0) + return -1; + + if (WIFEXITED (status)) + return WEXITSTATUS (status); + if (WIFSIGNALED (status)) + return 128 + WTERMSIG (status); + return -1; +} + static int create_pause_process (const char *pause_pid_file_path, char **argv) { @@ -351,11 +390,11 @@ create_pause_process (const char *pause_pid_file_path, char **argv) close (p[1]); /* Block until we write the pid file. */ - do - r = read (p[0], &b, 1); - while (r < 0 && errno == EINTR); + r = TEMP_FAILURE_RETRY (read (p[0], &b, 1)); close (p[0]); + reexec_in_user_namespace_wait (r, 0); + return r == 1 && b == '0' ? 0 : -1; } else @@ -391,9 +430,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv) _exit (EXIT_FAILURE); } - do - r = write (fd, pid_str, strlen (pid_str)); - while (r < 0 && errno == EINTR); + r = TEMP_FAILURE_RETRY (write (fd, pid_str, strlen (pid_str))); if (r < 0) { kill (pid, SIGKILL); @@ -410,9 +447,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv) _exit (EXIT_FAILURE); } - do - r = write (p[1], "0", 1); - while (r < 0 && errno == EINTR); + r = TEMP_FAILURE_RETRY (write (p[1], "0", 1)); close (p[1]); _exit (EXIT_SUCCESS); @@ -454,6 +489,7 @@ reexec_userns_join (int userns, int mountns, char *pause_pid_file_path) char **argv; int pid; char *cwd = getcwd (NULL, 0); + sigset_t sigset, oldsigset; if (cwd == NULL) { @@ -487,6 +523,22 @@ reexec_userns_join (int userns, int mountns, char *pause_pid_file_path) return pid; } + if (sigfillset (&sigset) < 0) + { + fprintf (stderr, "cannot fill sigset: %s\n", strerror (errno)); + _exit (EXIT_FAILURE); + } + if (sigdelset (&sigset, SIGCHLD) < 0) + { + fprintf (stderr, "cannot sigdelset(SIGCHLD): %s\n", strerror (errno)); + _exit (EXIT_FAILURE); + } + if (sigprocmask (SIG_BLOCK, &sigset, &oldsigset) < 0) + { + fprintf (stderr, "cannot block signals: %s\n", strerror (errno)); + _exit (EXIT_FAILURE); + } + setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1); setenv ("_CONTAINERS_ROOTLESS_UID", uid, 1); setenv ("_CONTAINERS_ROOTLESS_GID", gid, 1); @@ -535,6 +587,11 @@ reexec_userns_join (int userns, int mountns, char *pause_pid_file_path) /* We ignore errors here as we didn't create the namespace anyway. */ create_pause_process (pause_pid_file_path, argv); } + if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0) + { + fprintf (stderr, "cannot block signals: %s\n", strerror (errno)); + _exit (EXIT_FAILURE); + } execvp (argv[0], argv); @@ -560,8 +617,47 @@ check_proc_sys_userns_file (const char *path) } } +static int +copy_file_to_fd (const char *file_to_read, int outfd) +{ + char buf[512]; + int fd; + + fd = open (file_to_read, O_RDONLY); + if (fd < 0) + return fd; + + for (;;) + { + ssize_t r, w, t = 0; + + r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof buf)); + if (r < 0) + { + close (fd); + return r; + } + + if (r == 0) + break; + + while (t < r) + { + w = TEMP_FAILURE_RETRY (write (outfd, &buf[t], r - t)); + if (w < 0) + { + close (fd); + return w; + } + t += w; + } + } + close (fd); + return 0; +} + int -reexec_in_user_namespace (int ready, char *pause_pid_file_path) +reexec_in_user_namespace (int ready, char *pause_pid_file_path, char *file_to_read, int outputfd) { int ret; pid_t pid; @@ -574,6 +670,7 @@ reexec_in_user_namespace (int ready, char *pause_pid_file_path) char *listen_pid = NULL; bool do_socket_activation = false; char *cwd = getcwd (NULL, 0); + sigset_t sigset, oldsigset; if (cwd == NULL) { @@ -584,11 +681,11 @@ reexec_in_user_namespace (int ready, char *pause_pid_file_path) listen_pid = getenv("LISTEN_PID"); listen_fds = getenv("LISTEN_FDS"); - if (listen_pid != NULL && listen_fds != NULL) { - if (strtol(listen_pid, NULL, 10) == getpid()) { - do_socket_activation = true; + if (listen_pid != NULL && listen_fds != NULL) + { + if (strtol(listen_pid, NULL, 10) == getpid()) + do_socket_activation = true; } - } sprintf (uid, "%d", geteuid ()); sprintf (gid, "%d", getegid ()); @@ -621,6 +718,22 @@ reexec_in_user_namespace (int ready, char *pause_pid_file_path) return pid; } + if (sigfillset (&sigset) < 0) + { + fprintf (stderr, "cannot fill sigset: %s\n", strerror (errno)); + _exit (EXIT_FAILURE); + } + if (sigdelset (&sigset, SIGCHLD) < 0) + { + fprintf (stderr, "cannot sigdelset(SIGCHLD): %s\n", strerror (errno)); + _exit (EXIT_FAILURE); + } + if (sigprocmask (SIG_BLOCK, &sigset, &oldsigset) < 0) + { + fprintf (stderr, "cannot block signals: %s\n", strerror (errno)); + _exit (EXIT_FAILURE); + } + argv = get_cmd_line_args (ppid); if (argv == NULL) { @@ -628,19 +741,18 @@ reexec_in_user_namespace (int ready, char *pause_pid_file_path) _exit (EXIT_FAILURE); } - if (do_socket_activation) { - char s[32]; - sprintf (s, "%d", getpid()); - setenv ("LISTEN_PID", s, true); - } + if (do_socket_activation) + { + char s[32]; + sprintf (s, "%d", getpid()); + setenv ("LISTEN_PID", s, true); + } setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1); setenv ("_CONTAINERS_ROOTLESS_UID", uid, 1); setenv ("_CONTAINERS_ROOTLESS_GID", gid, 1); - do - ret = read (ready, &b, 1) < 0; - while (ret < 0 && errno == EINTR); + ret = TEMP_FAILURE_RETRY (read (ready, &b, 1)); if (ret < 0) { fprintf (stderr, "cannot read from sync pipe: %s\n", strerror (errno)); @@ -652,21 +764,21 @@ reexec_in_user_namespace (int ready, char *pause_pid_file_path) if (syscall_setresgid (0, 0, 0) < 0) { fprintf (stderr, "cannot setresgid: %s\n", strerror (errno)); - write (ready, "1", 1); + TEMP_FAILURE_RETRY (write (ready, "1", 1)); _exit (EXIT_FAILURE); } if (syscall_setresuid (0, 0, 0) < 0) { fprintf (stderr, "cannot setresuid: %s\n", strerror (errno)); - write (ready, "1", 1); + TEMP_FAILURE_RETRY (write (ready, "1", 1)); _exit (EXIT_FAILURE); } if (chdir (cwd) < 0) { fprintf (stderr, "cannot chdir: %s\n", strerror (errno)); - write (ready, "1", 1); + TEMP_FAILURE_RETRY (write (ready, "1", 1)); _exit (EXIT_FAILURE); } free (cwd); @@ -675,37 +787,28 @@ reexec_in_user_namespace (int ready, char *pause_pid_file_path) { if (create_pause_process (pause_pid_file_path, argv) < 0) { - write (ready, "2", 1); + TEMP_FAILURE_RETRY (write (ready, "2", 1)); _exit (EXIT_FAILURE); } } - do - ret = write (ready, "0", 1) < 0; - while (ret < 0 && errno == EINTR); + ret = TEMP_FAILURE_RETRY (write (ready, "0", 1)); close (ready); - execvp (argv[0], argv); - - _exit (EXIT_FAILURE); -} - -int -reexec_in_user_namespace_wait (int pid) -{ - pid_t p; - int status; + if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0) + { + fprintf (stderr, "cannot block signals: %s\n", strerror (errno)); + _exit (EXIT_FAILURE); + } - do - p = waitpid (pid, &status, 0); - while (p < 0 && errno == EINTR); + if (file_to_read && file_to_read[0]) + { + ret = copy_file_to_fd (file_to_read, outputfd); + close (outputfd); + _exit (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE); + } - if (p < 0) - return -1; + execvp (argv[0], argv); - if (WIFEXITED (status)) - return WEXITSTATUS (status); - if (WIFSIGNALED (status)) - return 128 + WTERMSIG (status); - return -1; + _exit (EXIT_FAILURE); } diff --git a/pkg/rootless/rootless_linux.go b/pkg/rootless/rootless_linux.go index 9132c0fe5..0cac50fc0 100644 --- a/pkg/rootless/rootless_linux.go +++ b/pkg/rootless/rootless_linux.go @@ -26,8 +26,8 @@ import ( #include <stdlib.h> extern uid_t rootless_uid(); extern uid_t rootless_gid(); -extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path); -extern int reexec_in_user_namespace_wait(int pid); +extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path, char *file_to_read, int fd); +extern int reexec_in_user_namespace_wait(int pid, int options); extern int reexec_userns_join(int userns, int mountns, char *pause_pid_file_path); */ import "C" @@ -169,6 +169,9 @@ func getUserNSFirstChild(fd uintptr) (*os.File, error) { for { nextFd, err := getParentUserNs(fd) if err != nil { + if err == syscall.ENOTTY { + return os.NewFile(fd, "userns child"), nil + } return nil, errors.Wrapf(err, "cannot get parent user namespace") } @@ -194,10 +197,24 @@ func getUserNSFirstChild(fd uintptr) (*os.File, error) { } } -// JoinUserAndMountNS re-exec podman in a new userNS and join the user and mount +func enableLinger(pausePid string) { + if pausePid == "" { + return + } + // If we are trying to write a pause pid file, make sure we can leave processes + // running longer than the user session. + err := exec.Command("loginctl", "enable-linger", fmt.Sprintf("%d", GetRootlessUID())).Run() + if err != nil { + logrus.Warnf("cannot run `loginctl enable-linger` for the current user: %v", err) + } +} + +// joinUserAndMountNS re-exec podman in a new userNS and join the user and mount // namespace of the specified PID without looking up its parent. Useful to join directly // the conmon process. -func JoinUserAndMountNS(pid uint, pausePid string) (bool, int, error) { +func joinUserAndMountNS(pid uint, pausePid string) (bool, int, error) { + enableLinger(pausePid) + if os.Geteuid() == 0 || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" { return false, -1, nil } @@ -226,7 +243,7 @@ func JoinUserAndMountNS(pid uint, pausePid string) (bool, int, error) { return false, -1, errors.Errorf("cannot re-exec process") } - ret := C.reexec_in_user_namespace_wait(pidC) + ret := C.reexec_in_user_namespace_wait(pidC, 0) if ret < 0 { return false, -1, errors.New("error waiting for the re-exec process") } @@ -234,11 +251,7 @@ func JoinUserAndMountNS(pid uint, pausePid string) (bool, int, error) { return true, int(ret), nil } -// BecomeRootInUserNS re-exec podman in a new userNS. It returns whether podman was re-executed -// into a new user namespace and the return code from the re-executed podman process. -// If podman was re-executed the caller needs to propagate the error code returned by the child -// process. -func BecomeRootInUserNS(pausePid string) (bool, int, error) { +func becomeRootInUserNS(pausePid, fileToRead string, fileOutput *os.File) (bool, int, error) { if os.Geteuid() == 0 || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" { if os.Getenv("_CONTAINERS_USERNS_CONFIGURED") == "init" { return false, 0, runInUser() @@ -249,6 +262,13 @@ func BecomeRootInUserNS(pausePid string) (bool, int, error) { cPausePid := C.CString(pausePid) defer C.free(unsafe.Pointer(cPausePid)) + cFileToRead := C.CString(fileToRead) + defer C.free(unsafe.Pointer(cFileToRead)) + var fileOutputFD C.int + if fileOutput != nil { + fileOutputFD = C.int(fileOutput.Fd()) + } + runtime.LockOSThread() defer runtime.UnlockOSThread() @@ -262,7 +282,7 @@ func BecomeRootInUserNS(pausePid string) (bool, int, error) { defer w.Close() defer w.Write([]byte("0")) - pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid) + pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid, cFileToRead, fileOutputFD) pid := int(pidC) if pid < 0 { return false, -1, errors.Errorf("cannot re-exec process") @@ -328,6 +348,10 @@ func BecomeRootInUserNS(pausePid string) (bool, int, error) { return false, -1, errors.Wrapf(err, "read from sync pipe") } + if fileOutput != nil { + return true, 0, nil + } + if b[0] == '2' { // We have lost the race for writing the PID file, as probably another // process created a namespace and wrote the PID. @@ -336,7 +360,7 @@ func BecomeRootInUserNS(pausePid string) (bool, int, error) { if err == nil { pid, err := strconv.ParseUint(string(data), 10, 0) if err == nil { - return JoinUserAndMountNS(uint(pid), "") + return joinUserAndMountNS(uint(pid), "") } } return false, -1, errors.Wrapf(err, "error setting up the process") @@ -368,10 +392,96 @@ func BecomeRootInUserNS(pausePid string) (bool, int, error) { } }() - ret := C.reexec_in_user_namespace_wait(pidC) + ret := C.reexec_in_user_namespace_wait(pidC, 0) if ret < 0 { return false, -1, errors.New("error waiting for the re-exec process") } return true, int(ret), nil } + +// BecomeRootInUserNS re-exec podman in a new userNS. It returns whether podman was re-executed +// into a new user namespace and the return code from the re-executed podman process. +// If podman was re-executed the caller needs to propagate the error code returned by the child +// process. +func BecomeRootInUserNS(pausePid string) (bool, int, error) { + enableLinger(pausePid) + return becomeRootInUserNS(pausePid, "", nil) +} + +// TryJoinFromFilePaths attempts to join the namespaces of the pid files in paths. +// This is useful when there are already running containers and we +// don't have a pause process yet. We can use the paths to the conmon +// processes to attempt joining their namespaces. +// If needNewNamespace is set, the file is read from a temporary user +// namespace, this is useful for containers that are running with a +// different uidmap and the unprivileged user has no way to read the +// file owned by the root in the container. +func TryJoinFromFilePaths(pausePidPath string, needNewNamespace bool, paths []string) (bool, int, error) { + if len(paths) == 0 { + return BecomeRootInUserNS(pausePidPath) + } + + var lastErr error + var pausePid int + + for _, path := range paths { + if !needNewNamespace { + data, err := ioutil.ReadFile(path) + if err != nil { + lastErr = err + continue + } + + pausePid, err = strconv.Atoi(string(data)) + if err != nil { + lastErr = errors.Wrapf(err, "cannot parse file %s", path) + continue + } + + lastErr = nil + break + } else { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) + if err != nil { + lastErr = err + continue + } + + r, w := os.NewFile(uintptr(fds[0]), "read file"), os.NewFile(uintptr(fds[1]), "write file") + + defer w.Close() + defer r.Close() + + if _, _, err := becomeRootInUserNS("", path, w); err != nil { + lastErr = err + continue + } + + w.Close() + defer func() { + r.Close() + C.reexec_in_user_namespace_wait(-1, 0) + }() + + b := make([]byte, 32) + + n, err := r.Read(b) + if err != nil { + lastErr = errors.Wrapf(err, "cannot read %s\n", path) + continue + } + + pausePid, err = strconv.Atoi(string(b[:n])) + if err == nil { + lastErr = nil + break + } + } + } + if lastErr != nil { + return false, 0, lastErr + } + + return joinUserAndMountNS(uint(pausePid), pausePidPath) +} diff --git a/pkg/rootless/rootless_unsupported.go b/pkg/rootless/rootless_unsupported.go index 221baff97..c063adee5 100644 --- a/pkg/rootless/rootless_unsupported.go +++ b/pkg/rootless/rootless_unsupported.go @@ -29,10 +29,14 @@ func GetRootlessGID() int { return -1 } -// JoinUserAndMountNS re-exec podman in a new userNS and join the user and mount -// namespace of the specified PID without looking up its parent. Useful to join directly -// the conmon process. It is a convenience function for JoinUserAndMountNSWithOpts -// with a default configuration. -func JoinUserAndMountNS(pid uint, pausePid string) (bool, int, error) { +// TryJoinFromFilePaths attempts to join the namespaces of the pid files in paths. +// This is useful when there are already running containers and we +// don't have a pause process yet. We can use the paths to the conmon +// processes to attempt joining their namespaces. +// If needNewNamespace is set, the file is read from a temporary user +// namespace, this is useful for containers that are running with a +// different uidmap and the unprivileged user has no way to read the +// file owned by the root in the container. +func TryJoinFromFilePaths(pausePidPath string, needNewNamespace bool, paths []string) (bool, int, error) { return false, -1, errors.New("this function is not supported on this os") } |