diff options
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/rootless/rootless_linux.c | 240 | ||||
-rw-r--r-- | pkg/rootless/rootless_linux.go | 49 | ||||
-rw-r--r-- | pkg/rootless/rootless_unsupported.go | 4 | ||||
-rw-r--r-- | pkg/util/utils_supported.go | 10 | ||||
-rw-r--r-- | pkg/util/utils_windows.go | 6 |
5 files changed, 262 insertions, 47 deletions
diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c index 1d32b1adb..b87deb86e 100644 --- a/pkg/rootless/rootless_linux.c +++ b/pkg/rootless/rootless_linux.c @@ -17,6 +17,22 @@ #include <sys/prctl.h> #include <dirent.h> #include <sys/select.h> +#include <stdio.h> + +#ifndef RENAME_NOREPLACE +# define RENAME_NOREPLACE (1 << 0) + +int renameat2 (int olddirfd, const char *oldpath, int newdirfd, const char *newpath, unsigned int flags) +{ +# ifdef __NR_renameat2 + return (int) syscall (__NR_renameat2, olddirfd, oldpath, newdirfd, newpath, flags); +# else + /* no way to implement it atomically. */ + errno = ENOSYS; + return -1; +# endif +} +#endif static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces"; static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone"; @@ -24,32 +40,6 @@ static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivilege static int open_files_max_fd; fd_set open_files_set; -static void __attribute__((constructor)) init() -{ - DIR *d; - - /* Store how many FDs were open before the Go runtime kicked in. */ - d = opendir ("/proc/self/fd"); - if (d) - { - struct dirent *ent; - - FD_ZERO (&open_files_set); - for (ent = readdir (d); ent; ent = readdir (d)) - { - int fd = atoi (ent->d_name); - if (fd != dirfd (d)) - { - if (fd > open_files_max_fd) - open_files_max_fd = fd; - FD_SET (fd, &open_files_set); - } - } - closedir (d); - } -} - - static int syscall_setresuid (uid_t ruid, uid_t euid, uid_t suid) { @@ -62,14 +52,12 @@ syscall_setresgid (gid_t rgid, gid_t egid, gid_t sgid) return (int) syscall (__NR_setresgid, rgid, egid, sgid); } -static int -syscall_clone (unsigned long flags, void *child_stack) +static void +do_pause () { -#if defined(__s390__) || defined(__CRIS__) - return (int) syscall (__NR_clone, child_stack, flags); -#else - return (int) syscall (__NR_clone, flags, child_stack); -#endif + prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL); + while (1) + pause (); } static char ** @@ -139,8 +127,164 @@ get_cmd_line_args (pid_t pid) return argv; } +static void __attribute__((constructor)) init() +{ + const char *xdg_runtime_dir; + const char *pause; + DIR *d; + + pause = getenv ("_PODMAN_PAUSE"); + if (pause && pause[0]) + { + do_pause (); + _exit (EXIT_FAILURE); + } + + /* Store how many FDs were open before the Go runtime kicked in. */ + d = opendir ("/proc/self/fd"); + if (d) + { + struct dirent *ent; + + FD_ZERO (&open_files_set); + for (ent = readdir (d); ent; ent = readdir (d)) + { + int fd = atoi (ent->d_name); + if (fd != dirfd (d)) + { + if (fd > open_files_max_fd) + open_files_max_fd = fd; + FD_SET (fd, &open_files_set); + } + } + closedir (d); + } +} + +static int +syscall_clone (unsigned long flags, void *child_stack) +{ +#if defined(__s390__) || defined(__CRIS__) + return (int) syscall (__NR_clone, child_stack, flags); +#else + return (int) syscall (__NR_clone, flags, child_stack); +#endif +} + +static int +create_pause_process (const char *pause_pid_file_path, char **argv) +{ + int r, p[2]; + + if (pipe (p) < 0) + _exit (EXIT_FAILURE); + + r = fork (); + if (r < 0) + _exit (EXIT_FAILURE); + + if (r) + { + char b; + + close (p[1]); + /* Block until we write the pid file. */ + do + r = read (p[0], &b, 1); + while (r < 0 && errno == EINTR); + close (p[0]); + + return r == 1 && b == '0' ? 0 : -1; + } + else + { + int fd; + pid_t pid; + + close (p[0]); + + setsid (); + pid = fork (); + if (r < 0) + _exit (EXIT_FAILURE); + + if (pid) + { + char pid_str[12]; + char *tmp_file_path = NULL; + + sprintf (pid_str, "%d", pid); + + asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path); + if (tmp_file_path == NULL) + { + kill (pid, SIGKILL); + _exit (EXIT_FAILURE); + } + + fd = mkstemp (tmp_file_path); + if (fd < 0) + { + kill (pid, SIGKILL); + _exit (EXIT_FAILURE); + } + + do + r = write (fd, pid_str, strlen (pid_str)); + while (r < 0 && errno == EINTR); + if (r < 0) + { + kill (pid, SIGKILL); + _exit (EXIT_FAILURE); + } + close (fd); + + /* There can be another process at this point trying to configure the user namespace and the pause + process, do not override the pid file if it already exists. */ + if (renameat2 (AT_FDCWD, tmp_file_path, AT_FDCWD, pause_pid_file_path, RENAME_NOREPLACE) < 0) + { + unlink (tmp_file_path); + kill (pid, SIGKILL); + _exit (EXIT_FAILURE); + } + + do + r = write (p[1], "0", 1); + while (r < 0 && errno == EINTR); + close (p[1]); + + _exit (EXIT_SUCCESS); + } + else + { + int null; + + close (p[1]); + + null = open ("/dev/null", O_RDWR); + if (null >= 0) + { + dup2 (null, 0); + dup2 (null, 1); + dup2 (null, 2); + close (null); + } + + for (fd = 3; fd < open_files_max_fd + 16; fd++) + close (fd); + + setenv ("_PODMAN_PAUSE", "1", 1); + execlp (argv[0], NULL); + + /* If the execve fails, then do the pause here. */ + do_pause (); + _exit (EXIT_FAILURE); + } + } +} + int -reexec_userns_join (int userns, int mountns) +reexec_userns_join (int userns, int mountns, char *pause_pid_file_path) { pid_t ppid = getpid (); char uid[16]; @@ -221,6 +365,12 @@ reexec_userns_join (int userns, int mountns) } free (cwd); + if (pause_pid_file_path && pause_pid_file_path[0] != '\0') + { + /* We ignore errors here as we didn't create the namespace anyway. */ + create_pause_process (pause_pid_file_path, argv); + } + execvp (argv[0], argv); _exit (EXIT_FAILURE); @@ -246,7 +396,7 @@ check_proc_sys_userns_file (const char *path) } int -reexec_in_user_namespace (int ready) +reexec_in_user_namespace (int ready, char *pause_pid_file_path) { int ret; pid_t pid; @@ -328,29 +478,45 @@ reexec_in_user_namespace (int ready) fprintf (stderr, "cannot read from sync pipe: %s\n", strerror (errno)); _exit (EXIT_FAILURE); } - close (ready); - if (b != '1') + if (b != '0') _exit (EXIT_FAILURE); if (syscall_setresgid (0, 0, 0) < 0) { fprintf (stderr, "cannot setresgid: %s\n", strerror (errno)); + write (ready, "1", 1); _exit (EXIT_FAILURE); } if (syscall_setresuid (0, 0, 0) < 0) { fprintf (stderr, "cannot setresuid: %s\n", strerror (errno)); + write (ready, "1", 1); _exit (EXIT_FAILURE); } if (chdir (cwd) < 0) { fprintf (stderr, "cannot chdir: %s\n", strerror (errno)); + write (ready, "1", 1); _exit (EXIT_FAILURE); } free (cwd); + if (pause_pid_file_path && pause_pid_file_path[0] != '\0') + { + if (create_pause_process (pause_pid_file_path, argv) < 0) + { + write (ready, "2", 1); + _exit (EXIT_FAILURE); + } + } + + do + ret = write (ready, "0", 1) < 0; + while (ret < 0 && errno == EINTR); + close (ready); + execvp (argv[0], argv); _exit (EXIT_FAILURE); diff --git a/pkg/rootless/rootless_linux.go b/pkg/rootless/rootless_linux.go index 2c99f41a4..0390bbb6a 100644 --- a/pkg/rootless/rootless_linux.go +++ b/pkg/rootless/rootless_linux.go @@ -22,9 +22,10 @@ import ( ) /* -extern int reexec_in_user_namespace(int ready); +#include <stdlib.h> +extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path); extern int reexec_in_user_namespace_wait(int pid); -extern int reexec_userns_join(int userns, int mountns); +extern int reexec_userns_join(int userns, int mountns, char *pause_pid_file_path); */ import "C" @@ -168,11 +169,14 @@ func getUserNSFirstChild(fd uintptr) (*os.File, error) { // JoinUserAndMountNS re-exec podman in a new userNS and join the user and mount // namespace of the specified PID without looking up its parent. Useful to join directly // the conmon process. -func JoinUserAndMountNS(pid uint) (bool, int, error) { +func JoinUserAndMountNS(pid uint, pausePid string) (bool, int, error) { if os.Geteuid() == 0 || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" { return false, -1, nil } + cPausePid := C.CString(pausePid) + defer C.free(unsafe.Pointer(cPausePid)) + userNS, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", pid)) if err != nil { return false, -1, err @@ -189,7 +193,7 @@ func JoinUserAndMountNS(pid uint) (bool, int, error) { if err != nil { return false, -1, err } - pidC := C.reexec_userns_join(C.int(fd.Fd()), C.int(mountNS.Fd())) + pidC := C.reexec_userns_join(C.int(fd.Fd()), C.int(mountNS.Fd()), cPausePid) if int(pidC) < 0 { return false, -1, errors.Errorf("cannot re-exec process") } @@ -206,7 +210,7 @@ func JoinUserAndMountNS(pid uint) (bool, int, error) { // into a new user namespace and the return code from the re-executed podman process. // If podman was re-executed the caller needs to propagate the error code returned by the child // process. -func BecomeRootInUserNS() (bool, int, error) { +func BecomeRootInUserNS(pausePid string) (bool, int, error) { if os.Geteuid() == 0 || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" { if os.Getenv("_CONTAINERS_USERNS_CONFIGURED") == "init" { return false, 0, runInUser() @@ -214,18 +218,23 @@ func BecomeRootInUserNS() (bool, int, error) { return false, 0, nil } + cPausePid := C.CString(pausePid) + defer C.free(unsafe.Pointer(cPausePid)) + runtime.LockOSThread() defer runtime.UnlockOSThread() - r, w, err := os.Pipe() + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) if err != nil { return false, -1, err } + r, w := os.NewFile(uintptr(fds[0]), "sync host"), os.NewFile(uintptr(fds[1]), "sync child") + defer r.Close() defer w.Close() defer w.Write([]byte("0")) - pidC := C.reexec_in_user_namespace(C.int(r.Fd())) + pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid) pid := int(pidC) if pid < 0 { return false, -1, errors.Errorf("cannot re-exec process") @@ -280,11 +289,35 @@ func BecomeRootInUserNS() (bool, int, error) { } } - _, err = w.Write([]byte("1")) + _, err = w.Write([]byte("0")) if err != nil { return false, -1, errors.Wrapf(err, "write to sync pipe") } + b := make([]byte, 1, 1) + _, err = w.Read(b) + if err != nil { + return false, -1, errors.Wrapf(err, "read from sync pipe") + } + + if b[0] == '2' { + // We have lost the race for writing the PID file, as probably another + // process created a namespace and wrote the PID. + // Try to join it. + data, err := ioutil.ReadFile(pausePid) + if err == nil { + pid, err := strconv.ParseUint(string(data), 10, 0) + if err == nil { + return JoinUserAndMountNS(uint(pid), "") + } + } + return false, -1, errors.Wrapf(err, "error setting up the process") + } + + if b[0] != '0' { + return false, -1, errors.Wrapf(err, "error setting up the process") + } + c := make(chan os.Signal, 1) signals := []os.Signal{} diff --git a/pkg/rootless/rootless_unsupported.go b/pkg/rootless/rootless_unsupported.go index 47b5dd7cc..42f8f3aec 100644 --- a/pkg/rootless/rootless_unsupported.go +++ b/pkg/rootless/rootless_unsupported.go @@ -15,7 +15,7 @@ func IsRootless() bool { // into a new user namespace and the return code from the re-executed podman process. // If podman was re-executed the caller needs to propagate the error code returned by the child // process. It is a convenience function for BecomeRootInUserNSWithOpts with a default configuration. -func BecomeRootInUserNS() (bool, int, error) { +func BecomeRootInUserNS(pausePid string) (bool, int, error) { return false, -1, errors.New("this function is not supported on this os") } @@ -28,6 +28,6 @@ func GetRootlessUID() int { // namespace of the specified PID without looking up its parent. Useful to join directly // the conmon process. It is a convenience function for JoinUserAndMountNSWithOpts // with a default configuration. -func JoinUserAndMountNS(pid uint) (bool, int, error) { +func JoinUserAndMountNS(pid uint, pausePid string) (bool, int, error) { return false, -1, errors.New("this function is not supported on this os") } diff --git a/pkg/util/utils_supported.go b/pkg/util/utils_supported.go index 8b98658c2..3d9140a23 100644 --- a/pkg/util/utils_supported.go +++ b/pkg/util/utils_supported.go @@ -82,3 +82,13 @@ func GetRootlessRuntimeDir() (string, error) { } return rootlessRuntimeDir, nil } + +// GetRootlessPauseProcessPidPath returns the path to the file that holds the pid for +// the pause process +func GetRootlessPauseProcessPidPath() (string, error) { + runtimeDir, err := GetRootlessRuntimeDir() + if err != nil { + return "", err + } + return filepath.Join(runtimeDir, "libpod", "pause.pid"), nil +} diff --git a/pkg/util/utils_windows.go b/pkg/util/utils_windows.go index b33733da9..3faa6f10c 100644 --- a/pkg/util/utils_windows.go +++ b/pkg/util/utils_windows.go @@ -15,3 +15,9 @@ func GetRootlessRuntimeDir() (string, error) { func IsCgroup2UnifiedMode() (bool, error) { return false, errors.New("this function is not implemented for windows") } + +// GetRootlessPauseProcessPidPath returns the path to the file that holds the pid for +// the pause process +func GetRootlessPauseProcessPidPath() (string, error) { + return "", errors.New("this function is not implemented for windows") +} |