aboutsummaryrefslogtreecommitdiff
path: root/libpod
diff options
context:
space:
mode:
authorGiuseppe Scrivano <gscrivan@redhat.com>2019-04-11 11:51:02 +0200
committerGiuseppe Scrivano <gscrivan@redhat.com>2019-04-11 15:40:00 +0200
commitb7800889fbd0d411d15ba8d1ddab58d02fd105ae (patch)
tree3d2e96a588b1c54016ea32a989d3c982751046c8 /libpod
parent6cd6eb6768bb936e87309c61d9cf131350274700 (diff)
downloadpodman-b7800889fbd0d411d15ba8d1ddab58d02fd105ae.tar.gz
podman-b7800889fbd0d411d15ba8d1ddab58d02fd105ae.tar.bz2
podman-b7800889fbd0d411d15ba8d1ddab58d02fd105ae.zip
userns: prevent /sys/kernel/* paths in the container
when we run in a user namespace, there are cases where we have not enough privileges to mount a fresh sysfs on /sys. To circumvent this limitation, we rbind /sys from the host. This carries inside of the container also some mounts we probably don't want to. We are also limited by the kernel to use rbind instead of bind, as allowing a bind would uncover paths that were not previously visible. This is a slimmed down version of the intermediate mount namespace logic we had before, where we only set /sys to slave, so the umounts done to the storage by the cleanup process are propagated back to the host. We also don't setup any new directory, so there is no additional cleanup to do. Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
Diffstat (limited to 'libpod')
-rw-r--r--libpod/oci_linux.go53
1 files changed, 53 insertions, 0 deletions
diff --git a/libpod/oci_linux.go b/libpod/oci_linux.go
index 8c0abad80..01f7c3649 100644
--- a/libpod/oci_linux.go
+++ b/libpod/oci_linux.go
@@ -3,15 +3,20 @@
package libpod
import (
+ "fmt"
"os"
"os/exec"
"path/filepath"
+ "runtime"
"strings"
"syscall"
"github.com/containerd/cgroups"
+ "github.com/containers/libpod/pkg/rootless"
"github.com/containers/libpod/utils"
+ pmount "github.com/containers/storage/pkg/mount"
spec "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -91,6 +96,54 @@ func (r *OCIRuntime) createContainer(ctr *Container, cgroupParent string, restor
return err
}
}
+
+ // if we are running a non privileged container, be sure to umount some kernel paths so they are not
+ // bind mounted inside the container at all.
+ if !ctr.config.Privileged && !rootless.IsRootless() {
+ ch := make(chan error)
+ go func() {
+ runtime.LockOSThread()
+ err := func() error {
+ fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid()))
+ if err != nil {
+ return err
+ }
+ defer fd.Close()
+
+ // create a new mountns on the current thread
+ if err = unix.Unshare(unix.CLONE_NEWNS); err != nil {
+ return err
+ }
+ defer unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS)
+
+ // don't spread our mounts around. We are setting only /sys to be slave
+ // so that the cleanup process is still able to umount the storage and the
+ // changes are propagated to the host.
+ err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "")
+ if err != nil {
+ return errors.Wrapf(err, "cannot make /sys slave")
+ }
+
+ mounts, err := pmount.GetMounts()
+ if err != nil {
+ return err
+ }
+ for _, m := range mounts {
+ if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") {
+ continue
+ }
+ err = unix.Unmount(m.Mountpoint, 0)
+ if err != nil {
+ return errors.Wrapf(err, "cannot unmount %s", m.Mountpoint)
+ }
+ }
+ return r.createOCIContainer(ctr, cgroupParent, restoreOptions)
+ }()
+ ch <- err
+ }()
+ err := <-ch
+ return err
+ }
}
return r.createOCIContainer(ctr, cgroupParent, restoreOptions)
}