aboutsummaryrefslogtreecommitdiff
path: root/pkg/rootless
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/rootless')
-rw-r--r--pkg/rootless/rootless_linux.c412
-rw-r--r--pkg/rootless/rootless_linux.go57
-rw-r--r--pkg/rootless/rootless_unsupported.go4
3 files changed, 416 insertions, 57 deletions
diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c
index 1d32b1adb..a08cfd36a 100644
--- a/pkg/rootless/rootless_linux.c
+++ b/pkg/rootless/rootless_linux.c
@@ -17,38 +17,29 @@
#include <sys/prctl.h>
#include <dirent.h>
#include <sys/select.h>
+#include <stdio.h>
+
+#ifndef RENAME_NOREPLACE
+# define RENAME_NOREPLACE (1 << 0)
+
+/* Fallback definition of renameat2 used when RENAME_NOREPLACE is not already
+   defined.  It forwards to the raw syscall when the syscall number is known
+   and otherwise fails with ENOSYS.  */
+int renameat2 (int olddirfd, const char *oldpath, int newdirfd, const char *newpath, unsigned int flags)
+{
+# ifndef __NR_renameat2
+  /* Without the syscall there is no way to implement it atomically.  */
+  errno = ENOSYS;
+  return -1;
+# else
+  long rc = syscall (__NR_renameat2, olddirfd, oldpath, newdirfd, newpath, flags);
+
+  return (int) rc;
+# endif
+}
+#endif
static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces";
static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone";
static int open_files_max_fd;
fd_set open_files_set;
-
-static void __attribute__((constructor)) init()
-{
- DIR *d;
-
- /* Store how many FDs were open before the Go runtime kicked in. */
- d = opendir ("/proc/self/fd");
- if (d)
- {
- struct dirent *ent;
-
- FD_ZERO (&open_files_set);
- for (ent = readdir (d); ent; ent = readdir (d))
- {
- int fd = atoi (ent->d_name);
- if (fd != dirfd (d))
- {
- if (fd > open_files_max_fd)
- open_files_max_fd = fd;
- FD_SET (fd, &open_files_set);
- }
- }
- closedir (d);
- }
-}
-
+static uid_t rootless_uid_init;
static int
syscall_setresuid (uid_t ruid, uid_t euid, uid_t suid)
@@ -62,14 +53,18 @@ syscall_setresgid (gid_t rgid, gid_t egid, gid_t sgid)
return (int) syscall (__NR_setresgid, rgid, egid, sgid);
}
-static int
-syscall_clone (unsigned long flags, void *child_stack)
+uid_t
+rootless_uid ()
{
-#if defined(__s390__) || defined(__CRIS__)
- return (int) syscall (__NR_clone, child_stack, flags);
-#else
- return (int) syscall (__NR_clone, flags, child_stack);
-#endif
+ return rootless_uid_init;
+}
+
+static void
+do_pause ()
+{
+ prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL);
+ while (1)
+ pause ();
}
static char **
@@ -84,7 +79,10 @@ get_cmd_line_args (pid_t pid)
int i, argc = 0;
char **argv;
- sprintf (path, "/proc/%d/cmdline", pid);
+ if (pid)
+ sprintf (path, "/proc/%d/cmdline", pid);
+ else
+ strcpy (path, "/proc/self/cmdline");
fd = open (path, O_RDONLY);
if (fd < 0)
return NULL;
@@ -99,7 +97,10 @@ get_cmd_line_args (pid_t pid)
ret = read (fd, buffer + used, allocated - used);
while (ret < 0 && errno == EINTR);
if (ret < 0)
- return NULL;
+ {
+ free (buffer);
+ return NULL;
+ }
if (ret == 0)
break;
@@ -109,11 +110,12 @@ get_cmd_line_args (pid_t pid)
{
allocated += 512;
char *tmp = realloc (buffer, allocated);
- if (buffer == NULL) {
- free(buffer);
- return NULL;
- }
- buffer=tmp;
+ if (buffer == NULL)
+ {
+ free (buffer);
+ return NULL;
+ }
+ buffer = tmp;
}
}
close (fd);
@@ -122,11 +124,17 @@ get_cmd_line_args (pid_t pid)
if (buffer[i] == '\0')
argc++;
if (argc == 0)
- return NULL;
+ {
+ free (buffer);
+ return NULL;
+ }
argv = malloc (sizeof (char *) * (argc + 1));
if (argv == NULL)
- return NULL;
+ {
+ free (buffer);
+ return NULL;
+ }
argc = 0;
argv[argc++] = buffer;
@@ -139,8 +147,296 @@ get_cmd_line_args (pid_t pid)
return argv;
}
+/* Decide whether the constructor may join the namespaces of an existing
+   pause process directly instead of leaving the work to the re-exec path.
+
+   Returns false when the shortcut is compiled out with
+   DISABLE_JOIN_SHORTCUT, when our own command line cannot be read, or when
+   any argument after argv[0] that does not start with '-' is "mount",
+   "search" or "system".  Returns true otherwise.  */
+static bool
+can_use_shortcut ()
+{
+#ifdef DISABLE_JOIN_SHORTCUT
+  return false;
+#else
+  int argc;
+  char **argv;
+  bool ret = true;
+
+  /* A pid of 0 makes get_cmd_line_args read /proc/self/cmdline.  */
+  argv = get_cmd_line_args (0);
+  if (argv == NULL)
+    return false;
+
+  for (argc = 0; argv[argc]; argc++)
+    {
+      /* Skip the program name and anything that looks like an option.  */
+      if (argc == 0 || argv[argc][0] == '-')
+        continue;
+
+      if (strcmp (argv[argc], "mount") == 0
+          || strcmp (argv[argc], "search") == 0
+          || strcmp (argv[argc], "system") == 0)
+        {
+          ret = false;
+          break;
+        }
+    }
+
+  /* argv[0] is the start of the single buffer that backs every string.  */
+  free (argv[0]);
+  free (argv);
+  return ret;
+#endif
+}
+
+/* Constructor: runs when the binary is loaded, before main.
+
+   1. If _PODMAN_PAUSE is set to a non-empty value this process is the pause
+      process: park in do_pause () and never continue.
+   2. Record the file descriptors that are already open in open_files_set and
+      open_files_max_fd.
+   3. If XDG_RUNTIME_DIR is set and can_use_shortcut () allows it, read the
+      pid stored in $XDG_RUNTIME_DIR/libpod/pause.pid, join that process' user
+      and mount namespaces, switch to uid/gid 0, restore the working directory
+      and remember the original effective uid in rootless_uid_init.
+
+   If the pid file cannot be opened, read or parsed, or the user namespace
+   cannot be joined, the function just returns.  Once the user namespace has
+   been joined every further failure terminates the process.  */
+static void __attribute__((constructor)) init()
+{
+  const char *xdg_runtime_dir;
+  const char *pause_env;
+  DIR *d;
+
+  pause_env = getenv ("_PODMAN_PAUSE");
+  if (pause_env && pause_env[0])
+    {
+      do_pause ();
+      _exit (EXIT_FAILURE);
+    }
+
+  /* Store how many FDs were open before the Go runtime kicked in. */
+  d = opendir ("/proc/self/fd");
+  if (d)
+    {
+      struct dirent *ent;
+
+      FD_ZERO (&open_files_set);
+      for (ent = readdir (d); ent; ent = readdir (d))
+        {
+          int fd = atoi (ent->d_name);
+          if (fd != dirfd (d))
+            {
+              if (fd > open_files_max_fd)
+                open_files_max_fd = fd;
+              /* FD_SET is undefined for descriptors outside [0, FD_SETSIZE).  */
+              if (fd >= 0 && fd < FD_SETSIZE)
+                FD_SET (fd, &open_files_set);
+            }
+        }
+      closedir (d);
+    }
+
+  /* Shortcut.  If we are able to join the pause pid file, do it now so we don't
+     need to re-exec.  */
+  xdg_runtime_dir = getenv ("XDG_RUNTIME_DIR");
+  if (xdg_runtime_dir && xdg_runtime_dir[0] && can_use_shortcut ())
+    {
+      int r;
+      int fd;
+      long pid;
+      char buf[12];
+      uid_t uid;
+      char path[PATH_MAX];
+      const char *const suffix = "/libpod/pause.pid";
+      char *cwd = getcwd (NULL, 0);
+
+      if (cwd == NULL)
+        {
+          fprintf (stderr, "error getting current working directory: %s\n", strerror (errno));
+          _exit (EXIT_FAILURE);
+        }
+
+      if (strlen (xdg_runtime_dir) >= PATH_MAX - strlen (suffix))
+        {
+          fprintf (stderr, "invalid value for XDG_RUNTIME_DIR: %s\n", strerror (ENAMETOOLONG));
+          _exit (EXIT_FAILURE);
+        }
+
+      snprintf (path, sizeof (path), "%s%s", xdg_runtime_dir, suffix);
+      fd = open (path, O_RDONLY);
+      if (fd < 0)
+        {
+          free (cwd);
+          return;
+        }
+
+      /* Keep one byte free so the data can be NUL terminated for strtol.  */
+      do
+        r = read (fd, buf, sizeof (buf) - 1);
+      while (r < 0 && errno == EINTR);
+      close (fd);
+      if (r < 0)
+        {
+          free (cwd);
+          return;
+        }
+      buf[r] = '\0';
+
+      pid = strtol (buf, NULL, 10);
+      if (pid <= 0 || pid == LONG_MAX)
+        {
+          free (cwd);
+          return;
+        }
+
+      /* Remember who we were before entering the user namespace.  */
+      uid = geteuid ();
+
+      snprintf (path, sizeof (path), "/proc/%ld/ns/user", pid);
+      fd = open (path, O_RDONLY);
+      if (fd < 0)
+        {
+          free (cwd);
+          return;
+        }
+      if (setns (fd, 0) < 0)
+        {
+          close (fd);
+          free (cwd);
+          return;
+        }
+      close (fd);
+
+      /* Errors here cannot be ignored as we already joined a ns. */
+      snprintf (path, sizeof (path), "/proc/%ld/ns/mnt", pid);
+      fd = open (path, O_RDONLY);
+      if (fd < 0)
+        {
+          fprintf (stderr, "cannot open %s: %s\n", path, strerror (errno));
+          _exit (EXIT_FAILURE);
+        }
+
+      r = setns (fd, 0);
+      if (r < 0)
+        {
+          fprintf (stderr, "cannot join mount namespace for %ld: %s\n", pid, strerror (errno));
+          _exit (EXIT_FAILURE);
+        }
+      close (fd);
+
+      if (syscall_setresgid (0, 0, 0) < 0)
+        {
+          fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
+          _exit (EXIT_FAILURE);
+        }
+
+      if (syscall_setresuid (0, 0, 0) < 0)
+        {
+          fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
+          _exit (EXIT_FAILURE);
+        }
+
+      if (chdir (cwd) < 0)
+        {
+          fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
+          _exit (EXIT_FAILURE);
+        }
+
+      free (cwd);
+      rootless_uid_init = uid;
+    }
+}
+
+/* Invoke the raw clone syscall with FLAGS and CHILD_STACK and return its
+   result as an int.  */
+static int
+syscall_clone (unsigned long flags, void *child_stack)
+{
+#if !defined(__s390__) && !defined(__CRIS__)
+  long rc = syscall (__NR_clone, flags, child_stack);
+#else
+  /* On s390 and CRIS the stack argument is passed before the flags.  */
+  long rc = syscall (__NR_clone, child_stack, flags);
+#endif
+
+  return (int) rc;
+}
+
+/* Fork a detached pause process and record its pid in PAUSE_PID_FILE_PATH.
+
+   Process layout:
+     caller -> child (setsid, writes the pid file) -> grandchild (pause)
+
+   The caller blocks on a pipe until the child reports that the pid file is
+   in place.  The grandchild redirects stdin/stdout/stderr to /dev/null,
+   closes the remaining descriptors and re-executes ARGV[0] with
+   _PODMAN_PAUSE=1 in the environment; if the exec fails it pauses in place.
+
+   Returns 0 when the child signalled success and -1 otherwise.  The pid file
+   is installed with RENAME_NOREPLACE, so an already existing file also ends
+   up as -1.  If the pipe or the first fork fails the calling process is
+   terminated with _exit.  */
+static int
+create_pause_process (const char *pause_pid_file_path, char **argv)
+{
+  int r, p[2];
+
+  if (pipe (p) < 0)
+    _exit (EXIT_FAILURE);
+
+  r = fork ();
+  if (r < 0)
+    _exit (EXIT_FAILURE);
+
+  if (r)
+    {
+      char b;
+
+      close (p[1]);
+      /* Block until we write the pid file. */
+      do
+        r = read (p[0], &b, 1);
+      while (r < 0 && errno == EINTR);
+      close (p[0]);
+
+      return r == 1 && b == '0' ? 0 : -1;
+    }
+  else
+    {
+      int fd;
+      pid_t pid;
+
+      close (p[0]);
+
+      setsid ();
+      pid = fork ();
+      /* Must test pid, not r: a pid of -1 would otherwise reach the
+         kill (pid, SIGKILL) calls below and signal every process we own.  */
+      if (pid < 0)
+        _exit (EXIT_FAILURE);
+
+      if (pid)
+        {
+          char pid_str[12];
+          char *tmp_file_path = NULL;
+
+          sprintf (pid_str, "%d", pid);
+
+          /* On failure the content of tmp_file_path is not reliable, so
+             look at the return value.  */
+          if (asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path) < 0)
+            {
+              kill (pid, SIGKILL);
+              _exit (EXIT_FAILURE);
+            }
+
+          fd = mkstemp (tmp_file_path);
+          if (fd < 0)
+            {
+              kill (pid, SIGKILL);
+              _exit (EXIT_FAILURE);
+            }
+
+          do
+            r = write (fd, pid_str, strlen (pid_str));
+          while (r < 0 && errno == EINTR);
+          if (r < 0)
+            {
+              /* Do not leave the temporary file behind.  */
+              close (fd);
+              unlink (tmp_file_path);
+              kill (pid, SIGKILL);
+              _exit (EXIT_FAILURE);
+            }
+          close (fd);
+
+          /* There can be another process at this point trying to configure the user namespace and the pause
+             process, do not override the pid file if it already exists. */
+          if (renameat2 (AT_FDCWD, tmp_file_path, AT_FDCWD, pause_pid_file_path, RENAME_NOREPLACE) < 0)
+            {
+              unlink (tmp_file_path);
+              kill (pid, SIGKILL);
+              _exit (EXIT_FAILURE);
+            }
+
+          /* Tell the waiting caller that the pid file is ready.  */
+          do
+            r = write (p[1], "0", 1);
+          while (r < 0 && errno == EINTR);
+          close (p[1]);
+
+          _exit (EXIT_SUCCESS);
+        }
+      else
+        {
+          int null;
+
+          close (p[1]);
+
+          null = open ("/dev/null", O_RDWR);
+          if (null >= 0)
+            {
+              dup2 (null, 0);
+              dup2 (null, 1);
+              dup2 (null, 2);
+              close (null);
+            }
+
+          for (fd = 3; fd < open_files_max_fd + 16; fd++)
+            close (fd);
+
+          setenv ("_PODMAN_PAUSE", "1", 1);
+          /* The first variadic argument becomes argv[0] of the new image and
+             the list must end with a null pointer of type char *.  */
+          execlp (argv[0], argv[0], (char *) NULL);
+
+          /* If the execve fails, then do the pause here. */
+          do_pause ();
+          _exit (EXIT_FAILURE);
+        }
+    }
+}
+
int
-reexec_userns_join (int userns, int mountns)
+reexec_userns_join (int userns, int mountns, char *pause_pid_file_path)
{
pid_t ppid = getpid ();
char uid[16];
@@ -200,7 +496,7 @@ reexec_userns_join (int userns, int mountns)
fprintf (stderr, "cannot setns: %s\n", strerror (errno));
_exit (EXIT_FAILURE);
}
- close (userns);
+ close (mountns);
if (syscall_setresgid (0, 0, 0) < 0)
{
@@ -221,6 +517,12 @@ reexec_userns_join (int userns, int mountns)
}
free (cwd);
+ if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
+ {
+ /* We ignore errors here as we didn't create the namespace anyway. */
+ create_pause_process (pause_pid_file_path, argv);
+ }
+
execvp (argv[0], argv);
_exit (EXIT_FAILURE);
@@ -246,7 +548,7 @@ check_proc_sys_userns_file (const char *path)
}
int
-reexec_in_user_namespace (int ready)
+reexec_in_user_namespace (int ready, char *pause_pid_file_path)
{
int ret;
pid_t pid;
@@ -328,29 +630,45 @@ reexec_in_user_namespace (int ready)
fprintf (stderr, "cannot read from sync pipe: %s\n", strerror (errno));
_exit (EXIT_FAILURE);
}
- close (ready);
- if (b != '1')
+ if (b != '0')
_exit (EXIT_FAILURE);
if (syscall_setresgid (0, 0, 0) < 0)
{
fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
if (syscall_setresuid (0, 0, 0) < 0)
{
fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
if (chdir (cwd) < 0)
{
fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
+ write (ready, "1", 1);
_exit (EXIT_FAILURE);
}
free (cwd);
+ if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
+ {
+ if (create_pause_process (pause_pid_file_path, argv) < 0)
+ {
+ write (ready, "2", 1);
+ _exit (EXIT_FAILURE);
+ }
+ }
+
+ do
+ ret = write (ready, "0", 1) < 0;
+ while (ret < 0 && errno == EINTR);
+ close (ready);
+
execvp (argv[0], argv);
_exit (EXIT_FAILURE);
diff --git a/pkg/rootless/rootless_linux.go b/pkg/rootless/rootless_linux.go
index 2c99f41a4..ddf881368 100644
--- a/pkg/rootless/rootless_linux.go
+++ b/pkg/rootless/rootless_linux.go
@@ -22,9 +22,12 @@ import (
)
/*
-extern int reexec_in_user_namespace(int ready);
+#cgo remoteclient CFLAGS: -DDISABLE_JOIN_SHORTCUT
+#include <stdlib.h>
+extern uid_t rootless_uid();
+extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path);
extern int reexec_in_user_namespace_wait(int pid);
-extern int reexec_userns_join(int userns, int mountns);
+extern int reexec_userns_join(int userns, int mountns, char *pause_pid_file_path);
*/
import "C"
@@ -45,6 +48,12 @@ var (
// IsRootless tells us if we are running in rootless mode
func IsRootless() bool {
isRootlessOnce.Do(func() {
+ rootlessUIDInit := int(C.rootless_uid())
+ if rootlessUIDInit != 0 {
+ // This happens if we joined the user+mount namespace as part of the C constructor in rootless_linux.c.
+ os.Setenv("_CONTAINERS_USERNS_CONFIGURED", "done")
+ os.Setenv("_CONTAINERS_ROOTLESS_UID", fmt.Sprintf("%d", rootlessUIDInit))
+ }
isRootless = os.Geteuid() != 0 || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != ""
})
return isRootless
@@ -168,11 +177,14 @@ func getUserNSFirstChild(fd uintptr) (*os.File, error) {
// JoinUserAndMountNS re-exec podman in a new userNS and join the user and mount
// namespace of the specified PID without looking up its parent. Useful to join directly
// the conmon process.
-func JoinUserAndMountNS(pid uint) (bool, int, error) {
+func JoinUserAndMountNS(pid uint, pausePid string) (bool, int, error) {
if os.Geteuid() == 0 || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" {
return false, -1, nil
}
+ cPausePid := C.CString(pausePid)
+ defer C.free(unsafe.Pointer(cPausePid))
+
userNS, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", pid))
if err != nil {
return false, -1, err
@@ -189,7 +201,7 @@ func JoinUserAndMountNS(pid uint) (bool, int, error) {
if err != nil {
return false, -1, err
}
- pidC := C.reexec_userns_join(C.int(fd.Fd()), C.int(mountNS.Fd()))
+ pidC := C.reexec_userns_join(C.int(fd.Fd()), C.int(mountNS.Fd()), cPausePid)
if int(pidC) < 0 {
return false, -1, errors.Errorf("cannot re-exec process")
}
@@ -206,7 +218,7 @@ func JoinUserAndMountNS(pid uint) (bool, int, error) {
// into a new user namespace and the return code from the re-executed podman process.
// If podman was re-executed the caller needs to propagate the error code returned by the child
// process.
-func BecomeRootInUserNS() (bool, int, error) {
+func BecomeRootInUserNS(pausePid string) (bool, int, error) {
if os.Geteuid() == 0 || os.Getenv("_CONTAINERS_USERNS_CONFIGURED") != "" {
if os.Getenv("_CONTAINERS_USERNS_CONFIGURED") == "init" {
return false, 0, runInUser()
@@ -214,18 +226,23 @@ func BecomeRootInUserNS() (bool, int, error) {
return false, 0, nil
}
+ cPausePid := C.CString(pausePid)
+ defer C.free(unsafe.Pointer(cPausePid))
+
runtime.LockOSThread()
defer runtime.UnlockOSThread()
- r, w, err := os.Pipe()
+ fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0)
if err != nil {
return false, -1, err
}
+ r, w := os.NewFile(uintptr(fds[0]), "sync host"), os.NewFile(uintptr(fds[1]), "sync child")
+
defer r.Close()
defer w.Close()
defer w.Write([]byte("0"))
- pidC := C.reexec_in_user_namespace(C.int(r.Fd()))
+ pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid)
pid := int(pidC)
if pid < 0 {
return false, -1, errors.Errorf("cannot re-exec process")
@@ -280,11 +297,35 @@ func BecomeRootInUserNS() (bool, int, error) {
}
}
- _, err = w.Write([]byte("1"))
+ _, err = w.Write([]byte("0"))
if err != nil {
return false, -1, errors.Wrapf(err, "write to sync pipe")
}
+ b := make([]byte, 1, 1)
+ _, err = w.Read(b)
+ if err != nil {
+ return false, -1, errors.Wrapf(err, "read from sync pipe")
+ }
+
+ if b[0] == '2' {
+ // We have lost the race for writing the PID file, as probably another
+ // process created a namespace and wrote the PID.
+ // Try to join it.
+ data, err := ioutil.ReadFile(pausePid)
+ if err == nil {
+ pid, err := strconv.ParseUint(string(data), 10, 0)
+ if err == nil {
+ return JoinUserAndMountNS(uint(pid), "")
+ }
+ }
+ return false, -1, errors.Wrapf(err, "error setting up the process")
+ }
+
+ if b[0] != '0' {
+ return false, -1, errors.Wrapf(err, "error setting up the process")
+ }
+
c := make(chan os.Signal, 1)
signals := []os.Signal{}
diff --git a/pkg/rootless/rootless_unsupported.go b/pkg/rootless/rootless_unsupported.go
index 47b5dd7cc..42f8f3aec 100644
--- a/pkg/rootless/rootless_unsupported.go
+++ b/pkg/rootless/rootless_unsupported.go
@@ -15,7 +15,7 @@ func IsRootless() bool {
// into a new user namespace and the return code from the re-executed podman process.
// If podman was re-executed the caller needs to propagate the error code returned by the child
// process. It is a convenience function for BecomeRootInUserNSWithOpts with a default configuration.
-func BecomeRootInUserNS() (bool, int, error) {
+func BecomeRootInUserNS(pausePid string) (bool, int, error) {
return false, -1, errors.New("this function is not supported on this os")
}
@@ -28,6 +28,6 @@ func GetRootlessUID() int {
// namespace of the specified PID without looking up its parent. Useful to join directly
// the conmon process. It is a convenience function for JoinUserAndMountNSWithOpts
// with a default configuration.
-func JoinUserAndMountNS(pid uint) (bool, int, error) {
+func JoinUserAndMountNS(pid uint, pausePid string) (bool, int, error) {
return false, -1, errors.New("this function is not supported on this os")
}