diff options
author | baude <bbaude@redhat.com> | 2018-02-14 12:51:06 -0600 |
---|---|---|
committer | Atomic Bot <atomic-devel@projectatomic.io> | 2018-02-15 00:20:47 +0000 |
commit | be9ed1cfacc19d1ad3c09e10481da445615b8b8e (patch) | |
tree | 1c0c01daf5b43c6139e37408be601475c1dcea41 /vendor/github.com/opencontainers/runc/libcontainer | |
parent | d051dc38d81920c94c37b20ceba0d33b35299bca (diff) | |
download | podman-be9ed1cfacc19d1ad3c09e10481da445615b8b8e.tar.gz podman-be9ed1cfacc19d1ad3c09e10481da445615b8b8e.tar.bz2 podman-be9ed1cfacc19d1ad3c09e10481da445615b8b8e.zip |
Privileged containers should inherit host devices
When running a privileged container, it should inherit the same
devices the host has.
Signed-off-by: baude <bbaude@redhat.com>
Closes: #330
Approved by: mheon
Diffstat (limited to 'vendor/github.com/opencontainers/runc/libcontainer')
56 files changed, 1670 insertions, 840 deletions
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go index 82ed1a68a..7fff0627f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go @@ -2,15 +2,10 @@ package apparmor -// #cgo LDFLAGS: -lapparmor -// #include <sys/apparmor.h> -// #include <stdlib.h> -import "C" import ( "fmt" "io/ioutil" "os" - "unsafe" ) // IsEnabled returns true if apparmor is enabled for the host. @@ -24,16 +19,36 @@ func IsEnabled() bool { return false } +func setprocattr(attr, value string) error { + // Under AppArmor you can only change your own attr, so use /proc/self/ + // instead of /proc/<tid>/ like libapparmor does + path := fmt.Sprintf("/proc/self/attr/%s", attr) + + f, err := os.OpenFile(path, os.O_WRONLY, 0) + if err != nil { + return err + } + defer f.Close() + + _, err = fmt.Fprintf(f, "%s", value) + return err +} + +// changeOnExec reimplements aa_change_onexec from libapparmor in Go +func changeOnExec(name string) error { + value := "exec " + name + if err := setprocattr("exec", value); err != nil { + return fmt.Errorf("apparmor failed to apply profile: %s", err) + } + return nil +} + // ApplyProfile will apply the profile with the specified name to the process after // the next exec. func ApplyProfile(name string) error { if name == "" { return nil } - cName := C.CString(name) - defer C.free(unsafe.Pointer(cName)) - if _, err := C.aa_change_onexec(cName); err != nil { - return fmt.Errorf("apparmor failed to apply profile: %s", err) - } - return nil + + return changeOnExec(name) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go index 22d82acb4..43bdccf3e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -145,8 +145,17 @@ func (m *Manager) Apply(pid int) (err error) { m.Paths[sys.Name()] = p if err := sys.Apply(d); err != nil { + if os.IsPermission(err) && m.Cgroups.Path == "" { + // If we didn't set a cgroup path, then let's defer the error here + // until we know whether we have set limits or not. + // If we hadn't set limits, then it's ok that we couldn't join this cgroup, because + // it will have the same limits as its parent. + delete(m.Paths, sys.Name()) + continue + } return err } + } return nil } @@ -198,6 +207,10 @@ func (m *Manager) Set(container *configs.Config) error { for _, sys := range subsystems { path := paths[sys.Name()] if err := sys.Set(path, container.Cgroups); err != nil { + if path == "" { + // cgroup never applied + return fmt.Errorf("cannot set limits on the %s cgroup, as the container has not joined it", sys.Name()) + } return err } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go index e70dfe3b9..4b19f8a97 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go @@ -29,11 +29,15 @@ func (s *FreezerGroup) Apply(d *cgroupData) error { func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { switch cgroup.Resources.Freezer { case configs.Frozen, configs.Thawed: - if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil { - return err - } - for { + // In case this loop does not exit because it doesn't get the expected + // state, let's write again this state, hoping it's going to be properly + // set this time. Otherwise, this loop could run infinitely, waiting for + // a state change that would never happen. + if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil { + return err + } + state, err := readFile(path, "freezer.state") if err != nil { return err @@ -41,6 +45,7 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) { break } + time.Sleep(1 * time.Millisecond) } case configs.Undefined: diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go deleted file mode 100644 index b1efbfd99..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go +++ /dev/null @@ -1,128 +0,0 @@ -// +build linux - -package rootless - -import ( - "fmt" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fs" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/configs/validate" -) - -// TODO: This is copied from libcontainer/cgroups/fs, which duplicates this code -// needlessly. We should probably export this list. - -var subsystems = []subsystem{ - &fs.CpusetGroup{}, - &fs.DevicesGroup{}, - &fs.MemoryGroup{}, - &fs.CpuGroup{}, - &fs.CpuacctGroup{}, - &fs.PidsGroup{}, - &fs.BlkioGroup{}, - &fs.HugetlbGroup{}, - &fs.NetClsGroup{}, - &fs.NetPrioGroup{}, - &fs.PerfEventGroup{}, - &fs.FreezerGroup{}, - &fs.NameGroup{GroupName: "name=systemd"}, -} - -type subsystem interface { - // Name returns the name of the subsystem. - Name() string - - // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. - GetStats(path string, stats *cgroups.Stats) error -} - -// The noop cgroup manager is used for rootless containers, because we currently -// cannot manage cgroups if we are in a rootless setup. This manager is chosen -// by factory if we are in rootless mode. We error out if any cgroup options are -// set in the config -- this may change in the future with upcoming kernel features -// like the cgroup namespace. - -type Manager struct { - Cgroups *configs.Cgroup - Paths map[string]string -} - -func (m *Manager) Apply(pid int) error { - // If there are no cgroup settings, there's nothing to do. - if m.Cgroups == nil { - return nil - } - - // We can't set paths. - // TODO(cyphar): Implement the case where the runner of a rootless container - // owns their own cgroup, which would allow us to set up a - // cgroup for each path. - if m.Cgroups.Paths != nil { - return fmt.Errorf("cannot change cgroup path in rootless container") - } - - // We load the paths into the manager. - paths := make(map[string]string) - for _, sys := range subsystems { - name := sys.Name() - - path, err := cgroups.GetOwnCgroupPath(name) - if err != nil { - // Ignore paths we couldn't resolve. - continue - } - - paths[name] = path - } - - m.Paths = paths - return nil -} - -func (m *Manager) GetPaths() map[string]string { - return m.Paths -} - -func (m *Manager) Set(container *configs.Config) error { - // We have to re-do the validation here, since someone might decide to - // update a rootless container. - return validate.New().Validate(container) -} - -func (m *Manager) GetPids() ([]int, error) { - dir, err := cgroups.GetOwnCgroupPath("devices") - if err != nil { - return nil, err - } - return cgroups.GetPids(dir) -} - -func (m *Manager) GetAllPids() ([]int, error) { - dir, err := cgroups.GetOwnCgroupPath("devices") - if err != nil { - return nil, err - } - return cgroups.GetAllPids(dir) -} - -func (m *Manager) GetStats() (*cgroups.Stats, error) { - // TODO(cyphar): We can make this work if we figure out a way to allow usage - // of cgroups with a rootless container. While this doesn't - // actually require write access to a cgroup directory, the - // statistics are not useful if they can be affected by - // non-container processes. - return nil, fmt.Errorf("cannot get cgroup stats in rootless container") -} - -func (m *Manager) Freeze(state configs.FreezerState) error { - // TODO(cyphar): We can make this work if we figure out a way to allow usage - // of cgroups with a rootless container. - return fmt.Errorf("cannot use freezer cgroup in rootless container") -} - -func (m *Manager) Destroy() error { - // We don't have to do anything here because we didn't do any setup. - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go index 7de9ae605..a65d8e443 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go @@ -1,4 +1,4 @@ -// +build !linux +// +build !linux static_build package systemd @@ -43,7 +43,7 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { - return nil, fmt.Errorf("Systemd not supported") + return fmt.Errorf("Systemd not supported") } func (m *Manager) Freeze(state configs.FreezerState) error { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go index b010b4b32..45bd3acce 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -1,4 +1,4 @@ -// +build linux +// +build linux,!static_build package systemd @@ -260,7 +260,7 @@ func (m *Manager) Apply(pid int) error { if c.Resources.Memory != 0 { properties = append(properties, - newProp("MemoryLimit", c.Resources.Memory)) + newProp("MemoryLimit", uint64(c.Resources.Memory))) } if c.Resources.CpuShares != 0 { @@ -271,6 +271,13 @@ func (m *Manager) Apply(pid int) error { // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } properties = append(properties, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) } @@ -288,10 +295,13 @@ func (m *Manager) Apply(pid int) error { } } - if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil && !isUnitExists(err) { + statusChan := make(chan string) + if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err != nil && !isUnitExists(err) { return err } + <-statusChan + if err := joinCgroups(c, pid); err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go deleted file mode 100644 index c7bdf1f60..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go +++ /dev/null @@ -1,10 +0,0 @@ -// +build linux,!go1.5 - -package libcontainer - -import "syscall" - -// GidMappingsEnableSetgroups was added in Go 1.5, so do nothing when building -// with earlier versions -func enableSetgroups(sys *syscall.SysProcAttr) { -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go deleted file mode 100644 index 95e2830a4..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go +++ /dev/null @@ -1,6 +0,0 @@ -// +build !windows,!linux,!freebsd - -package configs - -type Cgroup struct { -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 269fffff3..3cae4fd8d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -187,6 +187,10 @@ type Config struct { // Rootless specifies whether the container is a rootless container. Rootless bool `json:"rootless"` + + // IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into + // to limit the resources (e.g., L3 cache) the container has available + IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` } type Hooks struct { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go index 4d348d217..e4f423c52 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go @@ -1,4 +1,4 @@ -// +build linux freebsd +// +build linux package configs diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go new file mode 100644 index 000000000..36bd5f96a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go @@ -0,0 +1,7 @@ +package configs + +type IntelRdt struct { + // The schema for L3 cache id and capacity bitmask (CBM) + // Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." + L3CacheSchema string `json:"l3_cache_schema,omitempty"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go index 0cebfaf80..7a9f33b71 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go @@ -21,13 +21,6 @@ func (v *ConfigValidator) rootless(config *configs.Config) error { if err := rootlessMount(config); err != nil { return err } - // Currently, cgroups cannot effectively be used in rootless containers. - // The new cgroup namespace doesn't really help us either because it doesn't - // have nice interactions with the user namespace (we're working with upstream - // to fix this). - if err := rootlessCgroup(config); err != nil { - return err - } // XXX: We currently can't verify the user config at all, because // configs.Config doesn't store the user-related configs. So this @@ -36,37 +29,27 @@ func (v *ConfigValidator) rootless(config *configs.Config) error { return nil } -func rootlessMappings(config *configs.Config) error { - rootuid, err := config.HostRootUID() - if err != nil { - return fmt.Errorf("failed to get root uid from uidMappings: %v", err) +func hasIDMapping(id int, mappings []configs.IDMap) bool { + for _, m := range mappings { + if id >= m.ContainerID && id < m.ContainerID+m.Size { + return true + } } + return false +} + +func rootlessMappings(config *configs.Config) error { if euid := geteuid(); euid != 0 { if !config.Namespaces.Contains(configs.NEWUSER) { return fmt.Errorf("rootless containers require user namespaces") } - if rootuid != euid { - return fmt.Errorf("rootless containers cannot map container root to a different host user") - } - } - - rootgid, err := config.HostRootGID() - if err != nil { - return fmt.Errorf("failed to get root gid from gidMappings: %v", err) } - // Similar to the above test, we need to make sure that we aren't trying to - // map to a group ID that we don't have the right to be. - if rootgid != getegid() { - return fmt.Errorf("rootless containers cannot map container root to a different host group") + if len(config.UidMappings) == 0 { + return fmt.Errorf("rootless containers requires at least one UID mapping") } - - // We can only map one user and group inside a container (our own). - if len(config.UidMappings) != 1 || config.UidMappings[0].Size != 1 { - return fmt.Errorf("rootless containers cannot map more than one user") - } - if len(config.GidMappings) != 1 || config.GidMappings[0].Size != 1 { - return fmt.Errorf("rootless containers cannot map more than one group") + if len(config.GidMappings) == 0 { + return fmt.Errorf("rootless containers requires at least one UID mapping") } return nil @@ -104,11 +87,28 @@ func rootlessMount(config *configs.Config) error { // Check that the options list doesn't contain any uid= or gid= entries // that don't resolve to root. for _, opt := range strings.Split(mount.Data, ",") { - if strings.HasPrefix(opt, "uid=") && opt != "uid=0" { - return fmt.Errorf("cannot specify uid= mount options in rootless containers where argument isn't 0") + if strings.HasPrefix(opt, "uid=") { + var uid int + n, err := fmt.Sscanf(opt, "uid=%d", &uid) + if n != 1 || err != nil { + // Ignore unknown mount options. + continue + } + if !hasIDMapping(uid, config.UidMappings) { + return fmt.Errorf("cannot specify uid= mount options for unmapped uid in rootless containers") + } } - if strings.HasPrefix(opt, "gid=") && opt != "gid=0" { - return fmt.Errorf("cannot specify gid= mount options in rootless containers where argument isn't 0") + + if strings.HasPrefix(opt, "gid=") { + var gid int + n, err := fmt.Sscanf(opt, "gid=%d", &gid) + if n != 1 || err != nil { + // Ignore unknown mount options. + continue + } + if !hasIDMapping(gid, config.GidMappings) { + return fmt.Errorf("cannot specify gid= mount options for unmapped gid in rootless containers") + } } } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go index 828434544..cbbba9a03 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" selinux "github.com/opencontainers/selinux/go-selinux" ) @@ -40,6 +41,9 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { if err := v.sysctl(config); err != nil { return err } + if err := v.intelrdt(config); err != nil { + return err + } if config.Rootless { if err := v.rootless(config); err != nil { return err @@ -153,6 +157,19 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { return nil } +func (v *ConfigValidator) intelrdt(config *configs.Config) error { + if config.IntelRdt != nil { + if !intelrdt.IsEnabled() { + return fmt.Errorf("intelRdt is specified in config, but Intel RDT feature is not supported or enabled") + } + if config.IntelRdt.L3CacheSchema == "" { + return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + } + } + + return nil +} + func isSymbolicLink(path string) (bool, error) { fi, err := os.Lstat(path) if err != nil { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console.go b/vendor/github.com/opencontainers/runc/libcontainer/console.go deleted file mode 100644 index 917acc702..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/console.go +++ /dev/null @@ -1,17 +0,0 @@ -package libcontainer - -import ( - "io" - "os" -) - -// Console represents a pseudo TTY. -type Console interface { - io.ReadWriteCloser - - // Path returns the filesystem path to the slave side of the pty. - Path() string - - // Fd returns the fd for the master of the pty. - File() *os.File -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go b/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go deleted file mode 100644 index b7166a31f..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go +++ /dev/null @@ -1,13 +0,0 @@ -// +build freebsd - -package libcontainer - -import ( - "errors" -) - -// newConsole returns an initialized console that can be used within a container by copying bytes -// from the master side to the slave that is attached as the tty for the container's init process. -func newConsole() (Console, error) { - return nil, errors.New("libcontainer console is not supported on FreeBSD") -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go index f70de3848..9997e93ed 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go @@ -1,71 +1,14 @@ package libcontainer import ( - "fmt" "os" - "unsafe" "golang.org/x/sys/unix" ) -func ConsoleFromFile(f *os.File) Console { - return &linuxConsole{ - master: f, - } -} - -// newConsole returns an initialized console that can be used within a container by copying bytes -// from the master side to the slave that is attached as the tty for the container's init process. -func newConsole() (Console, error) { - master, err := os.OpenFile("/dev/ptmx", unix.O_RDWR|unix.O_NOCTTY|unix.O_CLOEXEC, 0) - if err != nil { - return nil, err - } - console, err := ptsname(master) - if err != nil { - return nil, err - } - if err := unlockpt(master); err != nil { - return nil, err - } - return &linuxConsole{ - slavePath: console, - master: master, - }, nil -} - -// linuxConsole is a linux pseudo TTY for use within a container. -type linuxConsole struct { - master *os.File - slavePath string -} - -func (c *linuxConsole) File() *os.File { - return c.master -} - -func (c *linuxConsole) Path() string { - return c.slavePath -} - -func (c *linuxConsole) Read(b []byte) (int, error) { - return c.master.Read(b) -} - -func (c *linuxConsole) Write(b []byte) (int, error) { - return c.master.Write(b) -} - -func (c *linuxConsole) Close() error { - if m := c.master; m != nil { - return m.Close() - } - return nil -} - // mount initializes the console inside the rootfs mounting with the specified mount label // and applying the correct ownership of the console. -func (c *linuxConsole) mount() error { +func mountConsole(slavePath string) error { oldMask := unix.Umask(0000) defer unix.Umask(oldMask) f, err := os.Create("/dev/console") @@ -75,17 +18,20 @@ func (c *linuxConsole) mount() error { if f != nil { f.Close() } - return unix.Mount(c.slavePath, "/dev/console", "bind", unix.MS_BIND, "") + return unix.Mount(slavePath, "/dev/console", "bind", unix.MS_BIND, "") } // dupStdio opens the slavePath for the console and dups the fds to the current // processes stdio, fd 0,1,2. -func (c *linuxConsole) dupStdio() error { - slave, err := c.open(unix.O_RDWR) +func dupStdio(slavePath string) error { + fd, err := unix.Open(slavePath, unix.O_RDWR, 0) if err != nil { - return err + return &os.PathError{ + Op: "open", + Path: slavePath, + Err: err, + } } - fd := int(slave.Fd()) for _, i := range []int{0, 1, 2} { if err := unix.Dup3(fd, i, 0); err != nil { return err @@ -93,60 +39,3 @@ func (c *linuxConsole) dupStdio() error { } return nil } - -// open is a clone of os.OpenFile without the O_CLOEXEC used to open the pty slave. -func (c *linuxConsole) open(flag int) (*os.File, error) { - r, e := unix.Open(c.slavePath, flag, 0) - if e != nil { - return nil, &os.PathError{ - Op: "open", - Path: c.slavePath, - Err: e, - } - } - return os.NewFile(uintptr(r), c.slavePath), nil -} - -func ioctl(fd uintptr, flag, data uintptr) error { - if _, _, err := unix.Syscall(unix.SYS_IOCTL, fd, flag, data); err != 0 { - return err - } - return nil -} - -// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. -// unlockpt should be called before opening the slave side of a pty. -func unlockpt(f *os.File) error { - var u int32 - return ioctl(f.Fd(), unix.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) -} - -// ptsname retrieves the name of the first available pts for the given master. -func ptsname(f *os.File) (string, error) { - n, err := unix.IoctlGetInt(int(f.Fd()), unix.TIOCGPTN) - if err != nil { - return "", err - } - return fmt.Sprintf("/dev/pts/%d", n), nil -} - -// SaneTerminal sets the necessary tty_ioctl(4)s to ensure that a pty pair -// created by us acts normally. In particular, a not-very-well-known default of -// Linux unix98 ptys is that they have +onlcr by default. While this isn't a -// problem for terminal emulators, because we relay data from the terminal we -// also relay that funky line discipline. -func SaneTerminal(terminal *os.File) error { - termios, err := unix.IoctlGetTermios(int(terminal.Fd()), unix.TCGETS) - if err != nil { - return fmt.Errorf("ioctl(tty, tcgets): %s", err.Error()) - } - - // Set -onlcr so we don't have to deal with \r. - termios.Oflag &^= unix.ONLCR - - if err := unix.IoctlSetTermios(int(terminal.Fd()), unix.TCSETS, termios); err != nil { - return fmt.Errorf("ioctl(tty, tcsets): %s", err.Error()) - } - - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go deleted file mode 100644 index e5ca54599..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go +++ /dev/null @@ -1,11 +0,0 @@ -package libcontainer - -import ( - "errors" -) - -// newConsole returns an initialized console that can be used within a container by copying bytes -// from the master side to the slave that is attached as the tty for the container's init process. -func newConsole() (Console, error) { - return nil, errors.New("libcontainer console is not supported on Solaris") -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go deleted file mode 100644 index c61e866a5..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go +++ /dev/null @@ -1,30 +0,0 @@ -package libcontainer - -// newConsole returns an initialized console that can be used within a container -func newConsole() (Console, error) { - return &windowsConsole{}, nil -} - -// windowsConsole is a Windows pseudo TTY for use within a container. -type windowsConsole struct { -} - -func (c *windowsConsole) Fd() uintptr { - return 0 -} - -func (c *windowsConsole) Path() string { - return "" -} - -func (c *windowsConsole) Read(b []byte) (int, error) { - return 0, nil -} - -func (c *windowsConsole) Write(b []byte) (int, error) { - return 0, nil -} - -func (c *windowsConsole) Close() error { - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go index d7e7516e5..cfb05b43a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "bytes" "encoding/json" + "errors" "fmt" "io" "io/ioutil" @@ -21,6 +22,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/criurpc" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" @@ -38,10 +40,14 @@ type linuxContainer struct { root string config *configs.Config cgroupManager cgroups.Manager + intelRdtManager intelrdt.Manager + initPath string initArgs []string initProcess parentProcess initProcessStartTime uint64 criuPath string + newuidmapPath string + newgidmapPath string m sync.Mutex criuVersion int state containerState @@ -67,6 +73,9 @@ type State struct { // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore ExternalDescriptors []string `json:"external_descriptors,omitempty"` + + // Intel RDT "resource control" filesystem path + IntelRdtPath string `json:"intel_rdt_path"` } // Container is a libcontainer container object. @@ -163,6 +172,11 @@ func (c *linuxContainer) Stats() (*Stats, error) { if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") } + if c.intelRdtManager != nil { + if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil { + return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats") + } + } for _, iface := range c.config.Networks { switch iface.Type { case "veth": @@ -186,8 +200,26 @@ func (c *linuxContainer) Set(config configs.Config) error { if status == Stopped { return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) } + if err := c.cgroupManager.Set(&config); err != nil { + // Set configs back + if err2 := c.cgroupManager.Set(c.config); err2 != nil { + logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) + } + return err + } + if c.intelRdtManager != nil { + if err := c.intelRdtManager.Set(&config); err != nil { + // Set configs back + if err2 := c.intelRdtManager.Set(c.config); err2 != nil { + logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) + } + return err + } + } + // After config setting succeed, update config and states c.config = &config - return c.cgroupManager.Set(c.config) + _, err = c.updateState(nil) + return err } func (c *linuxContainer) Start(process *Process) error { @@ -236,20 +268,71 @@ func (c *linuxContainer) Exec() error { func (c *linuxContainer) exec() error { path := filepath.Join(c.root, execFifoFilename) - f, err := os.OpenFile(path, os.O_RDONLY, 0) - if err != nil { - return newSystemErrorWithCause(err, "open exec fifo for reading") + + fifoOpen := make(chan struct{}) + select { + case <-awaitProcessExit(c.initProcess.pid(), fifoOpen): + return errors.New("container process is already dead") + case result := <-awaitFifoOpen(path): + close(fifoOpen) + if result.err != nil { + return result.err + } + f := result.file + defer f.Close() + if err := readFromExecFifo(f); err != nil { + return err + } + return os.Remove(path) } - defer f.Close() - data, err := ioutil.ReadAll(f) +} + +func readFromExecFifo(execFifo io.Reader) error { + data, err := ioutil.ReadAll(execFifo) if err != nil { return err } - if len(data) > 0 { - os.Remove(path) - return nil + if len(data) <= 0 { + return fmt.Errorf("cannot start an already running container") } - return fmt.Errorf("cannot start an already running container") + return nil +} + +func awaitProcessExit(pid int, exit <-chan struct{}) <-chan struct{} { + isDead := make(chan struct{}) + go func() { + for { + select { + case <-exit: + return + case <-time.After(time.Millisecond * 100): + stat, err := system.Stat(pid) + if err != nil || stat.State == system.Zombie { + close(isDead) + return + } + } + } + }() + return isDead +} + +func awaitFifoOpen(path string) <-chan openResult { + fifoOpened := make(chan openResult) + go func() { + f, err := os.OpenFile(path, os.O_RDONLY, 0) + if err != nil { + fifoOpened <- openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")} + return + } + fifoOpened <- openResult{file: f} + }() + return fifoOpened +} + +type openResult struct { + file *os.File + err error } func (c *linuxContainer) start(process *Process, isInit bool) error { @@ -259,7 +342,7 @@ func (c *linuxContainer) start(process *Process, isInit bool) error { } if err := parent.start(); err != nil { // terminate the process to ensure that it properly is reaped. - if err := parent.terminate(); err != nil { + if err := ignoreTerminateErrors(parent.terminate()); err != nil { logrus.Warn(err) } return newSystemErrorWithCause(err, "starting container process") @@ -277,15 +360,17 @@ func (c *linuxContainer) start(process *Process, isInit bool) error { c.initProcessStartTime = state.InitProcessStartTime if c.config.Hooks != nil { + bundle, annotations := utils.Annotations(c.config.Labels) s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Pid: parent.pid(), - Bundle: utils.SearchLabels(c.config.Labels, "bundle"), + Version: c.config.Version, + ID: c.id, + Pid: parent.pid(), + Bundle: bundle, + Annotations: annotations, } for i, hook := range c.config.Hooks.Poststart { if err := hook.Run(s); err != nil { - if err := parent.terminate(); err != nil { + if err := ignoreTerminateErrors(parent.terminate()); err != nil { logrus.Warn(err) } return newSystemErrorWithCausef(err, "running poststart hook %d", i) @@ -341,6 +426,23 @@ func (c *linuxContainer) deleteExecFifo() { os.Remove(fifoName) } +// includeExecFifo opens the container's execfifo as a pathfd, so that the +// container cannot access the statedir (and the FIFO itself remains +// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited +// fd, with _LIBCONTAINER_FIFOFD set to its fd number. +func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { + fifoName := filepath.Join(c.root, execFifoFilename) + fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return err + } + + cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName)) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + return nil +} + func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { parentPipe, childPipe, err := utils.NewSockPair("init") if err != nil { @@ -354,22 +456,20 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces return c.newSetnsProcess(p, cmd, parentPipe, childPipe) } - // We only set up rootDir if we're not doing a `runc exec`. The reason for - // this is to avoid cases where a racing, unprivileged process inside the - // container can get access to the statedir file descriptor (which would - // allow for container rootfs escape). - rootDir, err := os.Open(c.root) - if err != nil { - return nil, err + // We only set up fifoFd if we're not doing a `runc exec`. The historic + // reason for this is that previously we would pass a dirfd that allowed + // for container rootfs escape (and not doing it in `runc exec` avoided + // that problem), but we no longer do that. However, there's no need to do + // this for `runc exec` so we just keep it this way to be safe. + if err := c.includeExecFifo(cmd); err != nil { + return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup") } - cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir) - cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) - return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) + return c.newInitProcess(p, cmd, parentPipe, childPipe) } func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { - cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...) + cmd := exec.Command(c.initPath, c.initArgs[1:]...) + cmd.Args[0] = c.initArgs[0] cmd.Stdin = p.Stdin cmd.Stdout = p.Stdout cmd.Stderr = p.Stderr @@ -397,7 +497,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. return cmd, nil } -func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { +func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) nsMaps := make(map[configs.NamespaceType]string) for _, ns := range c.config.Namespaces { @@ -411,16 +511,16 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c return nil, err } return &initProcess{ - cmd: cmd, - childPipe: childPipe, - parentPipe: parentPipe, - manager: c.cgroupManager, - config: c.newInitConfig(p), - container: c, - process: p, - bootstrapData: data, - sharePidns: sharePidns, - rootDir: rootDir, + cmd: cmd, + childPipe: childPipe, + parentPipe: parentPipe, + manager: c.cgroupManager, + intelRdtManager: c.intelRdtManager, + config: c.newInitConfig(p), + container: c, + process: p, + bootstrapData: data, + sharePidns: sharePidns, }, nil } @@ -439,6 +539,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, return &setnsProcess{ cmd: cmd, cgroupPaths: c.cgroupManager.GetPaths(), + intelRdtPath: state.IntelRdtPath, childPipe: childPipe, parentPipe: parentPipe, config: c.newInitConfig(p), @@ -477,6 +578,8 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { cfg.Rlimits = process.Rlimits } cfg.CreateConsole = process.ConsoleSocket != nil + cfg.ConsoleWidth = process.ConsoleWidth + cfg.ConsoleHeight = process.ConsoleHeight return cfg } @@ -578,9 +681,24 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc. logrus.Debugf("Feature check says: %s", criuFeatures) missingFeatures := false - if *criuFeat.MemTrack && !*criuFeatures.MemTrack { - missingFeatures = true - logrus.Debugf("CRIU does not support MemTrack") + // The outer if checks if the fields actually exist + if (criuFeat.MemTrack != nil) && + (criuFeatures.MemTrack != nil) { + // The inner if checks if they are set to true + if *criuFeat.MemTrack && !*criuFeatures.MemTrack { + missingFeatures = true + logrus.Debugf("CRIU does not support MemTrack") + } + } + + // This needs to be repeated for every new feature check. + // Is there a way to put this in a function. Reflection? + if (criuFeat.LazyPages != nil) && + (criuFeatures.LazyPages != nil) { + if *criuFeat.LazyPages && !*criuFeatures.LazyPages { + missingFeatures = true + logrus.Debugf("CRIU does not support LazyPages") + } } if missingFeatures { @@ -610,9 +728,9 @@ func parseCriuVersion(path string) (int, error) { return 0, fmt.Errorf("Unable to parse the CRIU version: %s", path) } - n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2 + n, err := fmt.Sscanf(version, "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2 if err != nil { - n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6 + n, err = fmt.Sscanf(version, "GitID: v%d.%d", &x, &y) // 1.6 y++ } else { z++ @@ -736,6 +854,25 @@ func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error { } req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) } + return nil +} + +func waitForCriuLazyServer(r *os.File, status string) error { + + data := make([]byte, 1) + _, err := r.Read(data) + if err != nil { + return err + } + fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend) + if err != nil { + return err + } + _, err = fd.Write(data) + if err != nil { + return err + } + fd.Close() return nil } @@ -802,6 +939,8 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { FileLocks: proto.Bool(criuOpts.FileLocks), EmptyNs: proto.Uint32(criuOpts.EmptyNs), OrphanPtsMaster: proto.Bool(true), + AutoDedup: proto.Bool(criuOpts.AutoDedup), + LazyPages: proto.Bool(criuOpts.LazyPages), } fcg := c.cgroupManager.GetPaths()["freezer"] @@ -852,6 +991,24 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { Opts: &rpcOpts, } + if criuOpts.LazyPages { + // lazy migration requested; check if criu supports it + feat := criurpc.CriuFeatures{ + LazyPages: proto.Bool(true), + } + + if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { + return err + } + + statusRead, statusWrite, err := os.Pipe() + if err != nil { + return err + } + rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd())) + go waitForCriuLazyServer(statusRead, criuOpts.StatusFd) + } + //no need to dump these information in pre-dump if !criuOpts.PreDump { for _, m := range c.config.Mounts { @@ -1003,6 +1160,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { FileLocks: proto.Bool(criuOpts.FileLocks), EmptyNs: proto.Uint32(criuOpts.EmptyNs), OrphanPtsMaster: proto.Bool(true), + AutoDedup: proto.Bool(criuOpts.AutoDedup), + LazyPages: proto.Bool(criuOpts.LazyPages), }, } @@ -1331,11 +1490,13 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc } case notify.GetScript() == "setup-namespaces": if c.config.Hooks != nil { + bundle, annotations := utils.Annotations(c.config.Labels) s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Pid: int(notify.GetPid()), - Bundle: utils.SearchLabels(c.config.Labels, "bundle"), + Version: c.config.Version, + ID: c.id, + Pid: int(notify.GetPid()), + Bundle: bundle, + Annotations: annotations, } for i, hook := range c.config.Hooks.Prestart { if err := hook.Run(s); err != nil { @@ -1380,7 +1541,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc defer master.Close() // While we can access console.master, using the API is a good idea. - if err := utils.SendFd(process.ConsoleSocket, master); err != nil { + if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil { return err } } @@ -1388,7 +1549,9 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc } func (c *linuxContainer) updateState(process parentProcess) (*State, error) { - c.initProcess = process + if process != nil { + c.initProcess = process + } state, err := c.currentState() if err != nil { return nil, err @@ -1493,6 +1656,10 @@ func (c *linuxContainer) currentState() (*State, error) { startTime, _ = c.initProcess.startTime() externalDescriptors = c.initProcess.externalDescriptors() } + intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID()) + if err != nil { + intelRdtPath = "" + } state := &State{ BaseState: BaseState{ ID: c.ID(), @@ -1503,6 +1670,7 @@ func (c *linuxContainer) currentState() (*State, error) { }, Rootless: c.config.Rootless, CgroupPaths: c.cgroupManager.GetPaths(), + IntelRdtPath: intelRdtPath, NamespacePaths: make(map[configs.NamespaceType]string), ExternalDescriptors: externalDescriptors, } @@ -1601,6 +1769,12 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na if !joinExistingUser { // write uid mappings if len(c.config.UidMappings) > 0 { + if c.config.Rootless && c.newuidmapPath != "" { + r.AddData(&Bytemsg{ + Type: UidmapPathAttr, + Value: []byte(c.newuidmapPath), + }) + } b, err := encodeIDMapping(c.config.UidMappings) if err != nil { return nil, err @@ -1621,6 +1795,12 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na Type: GidmapAttr, Value: b, }) + if c.config.Rootless && c.newgidmapPath != "" { + r.AddData(&Bytemsg{ + Type: GidmapPathAttr, + Value: []byte(c.newgidmapPath), + }) + } // The following only applies if we are root. if !c.config.Rootless { // check if we have CAP_SETGID to setgroup properly @@ -1652,3 +1832,18 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na return bytes.NewReader(r.Serialize()), nil } + +// ignoreTerminateErrors returns nil if the given err matches an error known +// to indicate that the terminate occurred successfully or err was nil, otherwise +// err is returned unaltered. +func ignoreTerminateErrors(err error) error { + if err == nil { + return nil + } + s := err.Error() + switch { + case strings.Contains(s, "process already finished"), strings.Contains(s, "Wait was already called"): + return nil + } + return err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go deleted file mode 100644 index bb84ff740..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go +++ /dev/null @@ -1,20 +0,0 @@ -package libcontainer - -// State represents a running container's state -type State struct { - BaseState - - // Platform specific fields below here -} - -// A libcontainer container object. -// -// Each container is thread-safe within the same process. Since a container can -// be destroyed by a separate process, any function may return that the container -// was not found. -type Container interface { - BaseContainer - - // Methods below here are platform specific - -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/container_windows.go deleted file mode 100644 index bb84ff740..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_windows.go +++ /dev/null @@ -1,20 +0,0 @@ -package libcontainer - -// State represents a running container's state -type State struct { - BaseState - - // Platform specific fields below here -} - -// A libcontainer container object. -// -// Each container is thread-safe within the same process. Since a container can -// be destroyed by a separate process, any function may return that the container -// was not found. -type Container interface { - BaseContainer - - // Methods below here are platform specific - -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go index 9423d2464..a2e344fc4 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go @@ -23,7 +23,7 @@ type VethPairName struct { type CriuOpts struct { ImagesDirectory string // directory for storing image files WorkDirectory string // directory to cd and write logs/pidfiles/stats to - ParentImage string // direcotry for storing parent image files in pre-dump and dump + ParentImage string // directory for storing parent image files in pre-dump and dump LeaveRunning bool // leave container in running state after checkpoint TcpEstablished bool // checkpoint/restore established TCP connections ExternalUnixConnections bool // allow external unix connections @@ -34,4 +34,7 @@ type CriuOpts struct { VethPairs []VethPairName // pass the veth to criu when restore ManageCgroupsMode cgMode // dump or restore cgroup mode EmptyNs uint32 // don't c/r properties for namespace from this mask + AutoDedup bool // auto deduplication for incremental dumps + LazyPages bool // restore memory pages lazily using userfaultfd + StatusFd string // fd for feedback when lazy server is ready } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go deleted file mode 100644 index bc9207703..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go +++ /dev/null @@ -1,6 +0,0 @@ -package libcontainer - -// TODO Windows: This can ultimately be entirely factored out as criu is -// a Unix concept not relevant on Windows. -type CriuOpts struct { -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go index 461dc097c..361925890 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go @@ -28,6 +28,15 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) { if err != nil { return nil, err } + + var ( + devNumber = stat.Rdev + major = unix.Major(devNumber) + ) + if major == 0 { + return nil, ErrNotADevice + } + var ( devType rune mode = stat.Mode @@ -37,21 +46,16 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) { devType = 'b' case mode&unix.S_IFCHR == unix.S_IFCHR: devType = 'c' - default: - return nil, ErrNotADevice } - devNumber := int(stat.Rdev) - uid := stat.Uid - gid := stat.Gid return &configs.Device{ Type: devType, Path: path, - Major: Major(devNumber), - Minor: Minor(devNumber), + Major: int64(major), + Minor: int64(unix.Minor(devNumber)), Permissions: permissions, FileMode: os.FileMode(mode), - Uid: uid, - Gid: gid, + Uid: stat.Uid, + Gid: stat.Gid, }, nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/devices/devices_unsupported.go deleted file mode 100644 index 6649b9f2d..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices_unsupported.go +++ /dev/null @@ -1,3 +0,0 @@ -// +build !linux - -package devices diff --git a/vendor/github.com/opencontainers/runc/libcontainer/devices/number.go b/vendor/github.com/opencontainers/runc/libcontainer/devices/number.go deleted file mode 100644 index 885b6e5dd..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/devices/number.go +++ /dev/null @@ -1,24 +0,0 @@ -// +build linux freebsd - -package devices - -/* - -This code provides support for manipulating linux device numbers. It should be replaced by normal syscall functions once http://code.google.com/p/go/issues/detail?id=8106 is solved. - -You can read what they are here: - - - http://www.makelinux.net/ldd3/chp-3-sect-2 - - http://www.linux-tutorial.info/modules.php?name=MContent&pageid=94 - -Note! These are NOT the same as the MAJOR(dev_t device);, MINOR(dev_t device); and MKDEV(int major, int minor); functions as defined in <linux/kdev_t.h> as the representation of device numbers used by go is different than the one used internally to the kernel! - https://github.com/torvalds/linux/blob/master/include/linux/kdev_t.h#L9 - -*/ - -func Major(devNumber int) int64 { - return int64((devNumber >> 8) & 0xfff) -} - -func Minor(devNumber int) int64 { - return int64((devNumber & 0xff) | ((devNumber >> 12) & 0xfff00)) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go index 42b6f5a05..7d53d5e04 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -11,13 +11,13 @@ import ( "runtime/debug" "strconv" - "github.com/docker/docker/pkg/mount" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" - "github.com/opencontainers/runc/libcontainer/cgroups/rootless" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/mount" "github.com/opencontainers/runc/libcontainer/utils" "golang.org/x/sys/unix" @@ -72,15 +72,15 @@ func Cgroupfs(l *LinuxFactory) error { return nil } -// RootlessCgroups is an options func to configure a LinuxFactory to -// return containers that use the "rootless" cgroup manager, which will -// fail to do any operations not possible to do with an unprivileged user. -// It should only be used in conjunction with rootless containers. -func RootlessCgroups(l *LinuxFactory) error { - l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { - return &rootless.Manager{ - Cgroups: config, - Paths: paths, +// IntelRdtfs is an options func to configure a LinuxFactory to return +// containers that use the Intel RDT "resource control" filesystem to +// create and manage Intel Xeon platform shared resources (e.g., L3 cache). +func IntelRdtFs(l *LinuxFactory) error { + l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { + return &intelrdt.IntelRdtManager{ + Config: config, + Id: id, + Path: path, } } return nil @@ -119,12 +119,16 @@ func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { } l := &LinuxFactory{ Root: root, - InitArgs: []string{"/proc/self/exe", "init"}, + InitPath: "/proc/self/exe", + InitArgs: []string{os.Args[0], "init"}, Validator: validate.New(), CriuPath: "criu", } Cgroupfs(l) for _, opt := range options { + if opt == nil { + continue + } if err := opt(l); err != nil { return nil, err } @@ -137,6 +141,10 @@ type LinuxFactory struct { // Root directory for the factory to store state. Root string + // InitPath is the path for calling the init responsibilities for spawning + // a container. + InitPath string + // InitArgs are arguments for calling the init responsibilities for spawning // a container. InitArgs []string @@ -145,11 +153,19 @@ type LinuxFactory struct { // containers. CriuPath string + // New{u,g}uidmapPath is the path to the binaries used for mapping with + // rootless containers. + NewuidmapPath string + NewgidmapPath string + // Validator provides validation to container configurations. Validator validate.Validator // NewCgroupsManager returns an initialized cgroups manager for a single container. NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager + + // NewIntelRdtManager returns an initialized Intel RDT manager for a single container. + NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager } func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) { @@ -174,17 +190,20 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil { return nil, newGenericError(err, SystemError) } - if config.Rootless { - RootlessCgroups(l) - } c := &linuxContainer{ id: id, root: containerRoot, config: config, + initPath: l.InitPath, initArgs: l.InitArgs, criuPath: l.CriuPath, + newuidmapPath: l.NewuidmapPath, + newgidmapPath: l.NewgidmapPath, cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), } + if intelrdt.IsEnabled() { + c.intelRdtManager = l.NewIntelRdtManager(config, id, "") + } c.state = &stoppedState{c: c} return c, nil } @@ -203,17 +222,16 @@ func (l *LinuxFactory) Load(id string) (Container, error) { processStartTime: state.InitProcessStartTime, fds: state.ExternalDescriptors, } - // We have to use the RootlessManager. - if state.Rootless { - RootlessCgroups(l) - } c := &linuxContainer{ initProcess: r, initProcessStartTime: state.InitProcessStartTime, id: id, config: &state.Config, + initPath: l.InitPath, initArgs: l.InitArgs, criuPath: l.CriuPath, + newuidmapPath: l.NewuidmapPath, + newgidmapPath: l.NewgidmapPath, cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), root: containerRoot, created: state.Created, @@ -222,6 +240,9 @@ func (l *LinuxFactory) Load(id string) (Container, error) { if err := c.refreshState(); err != nil { return nil, err } + if intelrdt.IsEnabled() { + c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) + } return c, nil } @@ -233,10 +254,10 @@ func (l *LinuxFactory) Type() string { // This is a low level implementation detail of the reexec and should not be consumed externally func (l *LinuxFactory) StartInitialization() (err error) { var ( - pipefd, rootfd int + pipefd, fifofd int consoleSocket *os.File envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") - envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR") + envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD") envConsole = os.Getenv("_LIBCONTAINER_CONSOLE") ) @@ -252,11 +273,11 @@ func (l *LinuxFactory) StartInitialization() (err error) { ) defer pipe.Close() - // Only init processes have STATEDIR. - rootfd = -1 + // Only init processes have FIFOFD. + fifofd = -1 if it == initStandard { - if rootfd, err = strconv.Atoi(envStateDir); err != nil { - return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err) + if fifofd, err = strconv.Atoi(envFifoFd); err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err) } } @@ -291,7 +312,7 @@ func (l *LinuxFactory) StartInitialization() (err error) { } }() - i, err := newContainerInit(it, pipe, consoleSocket, rootfd) + i, err := newContainerInit(it, pipe, consoleSocket, fifofd) if err != nil { return err } @@ -323,3 +344,21 @@ func (l *LinuxFactory) validateID(id string) error { return nil } + +// NewuidmapPath returns an option func to configure a LinuxFactory with the +// provided .. +func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.NewuidmapPath = newuidmapPath + return nil + } +} + +// NewgidmapPath returns an option func to configure a LinuxFactory with the +// provided .. +func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.NewgidmapPath = newgidmapPath + return nil + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go index 63afd28eb..2770be307 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -12,15 +12,16 @@ import ( "syscall" // only for Errno "unsafe" + "golang.org/x/sys/unix" + + "github.com/containerd/console" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/user" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" - "golang.org/x/sys/unix" ) type initType string @@ -31,7 +32,8 @@ const ( ) type pid struct { - Pid int `json:"pid"` + Pid int `json:"pid"` + PidFirstChild int `json:"pid_first"` } // network is an internal struct used to setup container networks. @@ -60,6 +62,8 @@ type initConfig struct { ContainerId string `json:"containerid"` Rlimits []configs.Rlimit `json:"rlimits"` CreateConsole bool `json:"create_console"` + ConsoleWidth uint16 `json:"console_width"` + ConsoleHeight uint16 `json:"console_height"` Rootless bool `json:"rootless"` } @@ -67,7 +71,7 @@ type initer interface { Init() error } -func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD int) (initer, error) { +func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return nil, err @@ -88,7 +92,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDi consoleSocket: consoleSocket, parentPid: unix.Getppid(), config: config, - stateDirFD: stateDirFD, + fifoFd: fifoFd, }, nil } return nil, fmt.Errorf("unknown init type %q", t) @@ -169,29 +173,38 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { // however, that setupUser (specifically fixStdioPermissions) *will* change // the UID owner of the console to be the user the process will run as (so // they can actually control their console). - console, err := newConsole() + + pty, slavePath, err := console.NewPty() if err != nil { return err } - // After we return from here, we don't need the console anymore. - defer console.Close() - linuxConsole, ok := console.(*linuxConsole) - if !ok { - return fmt.Errorf("failed to cast console to *linuxConsole") + if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 { + err = pty.Resize(console.WinSize{ + Height: config.ConsoleHeight, + Width: config.ConsoleWidth, + }) + + if err != nil { + return err + } } + + // After we return from here, we don't need the console anymore. + defer pty.Close() + // Mount the console inside our rootfs. if mount { - if err := linuxConsole.mount(); err != nil { + if err := mountConsole(slavePath); err != nil { return err } } // While we can access console.master, using the API is a good idea. - if err := utils.SendFd(socket, linuxConsole.File()); err != nil { + if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil { return err } // Now, dup over all the things. - return linuxConsole.dupStdio() + return dupStdio(slavePath) } // syncParentReady sends to the given pipe a JSON payload which indicates that @@ -260,25 +273,27 @@ func setupUser(config *initConfig) error { } } - if config.Rootless { - if execUser.Uid != 0 { - return fmt.Errorf("cannot run as a non-root user in a rootless container") - } - - if execUser.Gid != 0 { - return fmt.Errorf("cannot run as a non-root group in a rootless container") - } + // Rather than just erroring out later in setuid(2) and setgid(2), check + // that the user is mapped here. + if _, err := config.Config.HostUID(execUser.Uid); err != nil { + return fmt.Errorf("cannot set uid to unmapped user in user namespace") + } + if _, err := config.Config.HostGID(execUser.Gid); err != nil { + return fmt.Errorf("cannot set gid to unmapped user in user namespace") + } - // We cannot set any additional groups in a rootless container and thus we - // bail if the user asked us to do so. TODO: We currently can't do this - // earlier, but if libcontainer.Process.User was typesafe this might work. + if config.Rootless { + // We cannot set any additional groups in a rootless container and thus + // we bail if the user asked us to do so. TODO: We currently can't do + // this check earlier, but if libcontainer.Process.User was typesafe + // this might work. if len(addGroups) > 0 { return fmt.Errorf("cannot set any additional groups in a rootless container") } } - // before we change to the container's user make sure that the processes STDIO - // is correctly owned by the user that we are switching to. + // Before we change to the container's user make sure that the processes + // STDIO is correctly owned by the user that we are switching to. if err := fixStdioPermissions(config, execUser); err != nil { return err } @@ -297,7 +312,6 @@ func setupUser(config *initConfig) error { if err := system.Setgid(execUser.Gid); err != nil { return err } - if err := system.Setuid(execUser.Uid); err != nil { return err } @@ -334,14 +348,6 @@ func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { continue } - // Skip chown if s.Gid is actually an unmapped gid in the host. While - // this is a bit dodgy if it just so happens that the console _is_ - // owned by overflow_gid, there's no way for us to disambiguate this as - // a userspace program. - if _, err := config.Config.HostGID(int(s.Gid)); err != nil { - continue - } - // We only change the uid owner (as it is possible for the mount to // prefer a different gid, and there's no reason for us to change it). // The reason why we don't just leave the default uid=X mount setup is @@ -349,6 +355,15 @@ func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { // this code, you couldn't effectively run as a non-root user inside a // container and also have a console set up. if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil { + // If we've hit an EINVAL then s.Gid isn't mapped in the user + // namespace. If we've hit an EPERM then the inode's current owner + // is not mapped in our user namespace (in particular, + // privileged_wrt_inode_uidgid() has failed). In either case, we + // are in a configuration where it's better for us to just not + // touch the stdio rather than bail at this point. + if err == unix.EINVAL || err == unix.EPERM { + continue + } return err } } @@ -479,6 +494,16 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error { logrus.Warn(err) } + subreaper, err := system.GetSubreaper() + if err != nil { + // The error here means that PR_GET_CHILD_SUBREAPER is not + // supported because this code might run on a kernel older + // than 3.4. We don't want to throw an error in that case, + // and we simplify things, considering there is no subreaper + // set. + subreaper = 0 + } + for _, p := range procs { if s != unix.SIGKILL { if ok, err := isWaitable(p.Pid); err != nil { @@ -492,9 +517,16 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error { } } - if _, err := p.Wait(); err != nil { - if !isNoChildren(err) { - logrus.Warn("wait: ", err) + // In case a subreaper has been setup, this code must not + // wait for the process. Otherwise, we cannot be sure the + // current process will be reaped by the subreaper, while + // the subreaper might be waiting for this process in order + // to retrieve its exit code. + if subreaper == 0 { + if _, err := p.Wait(); err != nil { + if !isNoChildren(err) { + logrus.Warn("wait: ", err) + } } } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go new file mode 100644 index 000000000..487c630af --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go @@ -0,0 +1,553 @@ +// +build linux + +package intelrdt + +import ( + "bufio" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +/* + * About Intel RDT/CAT feature: + * Intel platforms with new Xeon CPU support Resource Director Technology (RDT). + * Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3 + * Cache is the only resource that is supported in RDT. + * + * This feature provides a way for the software to restrict cache allocation to a + * defined 'subset' of L3 cache which may be overlapping with other 'subsets'. + * The different subsets are identified by class of service (CLOS) and each CLOS + * has a capacity bitmask (CBM). + * + * For more information about Intel RDT/CAT can be found in the section 17.17 + * of Intel Software Developer Manual. + * + * About Intel RDT/CAT kernel interface: + * In Linux 4.10 kernel or newer, the interface is defined and exposed via + * "resource control" filesystem, which is a "cgroup-like" interface. + * + * Comparing with cgroups, it has similar process management lifecycle and + * interfaces in a container. But unlike cgroups' hierarchy, it has single level + * filesystem layout. + * + * Intel RDT "resource control" filesystem hierarchy: + * mount -t resctrl resctrl /sys/fs/resctrl + * tree /sys/fs/resctrl + * /sys/fs/resctrl/ + * |-- info + * | |-- L3 + * | |-- cbm_mask + * | |-- min_cbm_bits + * | |-- num_closids + * |-- cpus + * |-- schemata + * |-- tasks + * |-- <container_id> + * |-- cpus + * |-- schemata + * |-- tasks + * + * For runc, we can make use of `tasks` and `schemata` configuration for L3 cache + * resource constraints. + * + * The file `tasks` has a list of tasks that belongs to this group (e.g., + * <container_id>" group). Tasks can be added to a group by writing the task ID + * to the "tasks" file (which will automatically remove them from the previous + * group to which they belonged). New tasks created by fork(2) and clone(2) are + * added to the same group as their parent. If a pid is not in any sub group, it is + * in root group. + * + * The file `schemata` has allocation bitmasks/values for L3 cache on each socket, + * which contains L3 cache id and capacity bitmask (CBM). + * Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." + * For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` + * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + * + * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can + * be set is less than the max bit. The max bits in the CBM is varied among + * supported Intel Xeon platforms. In Intel RDT "resource control" filesystem + * layout, the CBM in a group should be a subset of the CBM in root. Kernel will + * check if it is valid when writing. e.g., 0xfffff in root indicates the max bits + * of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM + * values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + * + * For more information about Intel RDT/CAT kernel interface: + * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt + * + * An example for runc: + * Consider a two-socket machine with two L3 caches where the default CBM is + * 0xfffff and the max CBM length is 20 bits. With this configuration, tasks + * inside the container only have access to the "upper" 80% of L3 cache id 0 and + * the "lower" 50% L3 cache id 1: + * + * "linux": { + * "intelRdt": { + * "l3CacheSchema": "L3:0=ffff0;1=3ff" + * } + * } + */ + +type Manager interface { + // Applies Intel RDT configuration to the process with the specified pid + Apply(pid int) error + + // Returns statistics for Intel RDT + GetStats() (*Stats, error) + + // Destroys the Intel RDT 'container_id' group + Destroy() error + + // Returns Intel RDT path to save in a state file and to be able to + // restore the object later + GetPath() string + + // Set Intel RDT "resource control" filesystem as configured. + Set(container *configs.Config) error +} + +// This implements interface Manager +type IntelRdtManager struct { + mu sync.Mutex + Config *configs.Config + Id string + Path string +} + +const ( + IntelRdtTasks = "tasks" +) + +var ( + // The absolute root path of the Intel RDT "resource control" filesystem + intelRdtRoot string + intelRdtRootLock sync.Mutex + + // The flag to indicate if Intel RDT is supported + isEnabled bool +) + +type intelRdtData struct { + root string + config *configs.Config + pid int +} + +// Check if Intel RDT is enabled in init() +func init() { + // 1. Check if hardware and kernel support Intel RDT/CAT feature + // "cat_l3" flag is set if supported + isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") + if !isFlagSet || err != nil { + isEnabled = false + return + } + + // 2. Check if Intel RDT "resource control" filesystem is mounted + // The user guarantees to mount the filesystem + isEnabled = isIntelRdtMounted() +} + +// Return the mount point path of Intel RDT "resource control" filesysem +func findIntelRdtMountpointDir() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + fields := strings.Split(text, " ") + // Safe as mountinfo encodes mountpoints with spaces as \040. + index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + numPostFields := len(postSeparatorFields) + + // This is an error as we can't detect if the mount is for "Intel RDT" + if numPostFields == 0 { + return "", fmt.Errorf("Found no fields post '-' in %q", text) + } + + if postSeparatorFields[0] == "resctrl" { + // Check that the mount is properly formated. + if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + return fields[4], nil + } + } + if err := s.Err(); err != nil { + return "", err + } + + return "", NewNotFoundError("Intel RDT") +} + +// Gets the root path of Intel RDT "resource control" filesystem +func getIntelRdtRoot() (string, error) { + intelRdtRootLock.Lock() + defer intelRdtRootLock.Unlock() + + if intelRdtRoot != "" { + return intelRdtRoot, nil + } + + root, err := findIntelRdtMountpointDir() + if err != nil { + return "", err + } + + if _, err := os.Stat(root); err != nil { + return "", err + } + + intelRdtRoot = root + return intelRdtRoot, nil +} + +func isIntelRdtMounted() bool { + _, err := getIntelRdtRoot() + if err != nil { + return false + } + + return true +} + +func parseCpuInfoFile(path string) (bool, error) { + f, err := os.Open(path) + if err != nil { + return false, err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return false, err + } + + text := s.Text() + flags := strings.Split(text, " ") + + // "cat_l3" flag is set if Intel RDT/CAT is supported + for _, flag := range flags { + if flag == "cat_l3" { + return true, nil + } + } + } + return false, nil +} + +func parseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// Gets a single uint64 value from the specified file. +func getIntelRdtParamUint(path, file string) (uint64, error) { + fileName := filepath.Join(path, file) + contents, err := ioutil.ReadFile(fileName) + if err != nil { + return 0, err + } + + res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName) + } + return res, nil +} + +// Gets a string value from the specified file +func getIntelRdtParamString(path, file string) (string, error) { + contents, err := ioutil.ReadFile(filepath.Join(path, file)) + if err != nil { + return "", err + } + + return strings.TrimSpace(string(contents)), nil +} + +func readTasksFile(dir string) ([]int, error) { + f, err := os.Open(filepath.Join(dir, IntelRdtTasks)) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + s = bufio.NewScanner(f) + out = []int{} + ) + + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, nil +} + +func writeFile(dir, file, data string) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", file) + } + if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", data, file, err) + } + return nil +} + +func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + return &intelRdtData{ + root: rootPath, + config: c, + pid: pid, + }, nil +} + +// Get the read-only L3 cache information +func getL3CacheInfo() (*L3CacheInfo, error) { + l3CacheInfo := &L3CacheInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return l3CacheInfo, err + } + + path := filepath.Join(rootPath, "info", "L3") + cbmMask, err := getIntelRdtParamString(path, "cbm_mask") + if err != nil { + return l3CacheInfo, err + } + minCbmBits, err := getIntelRdtParamUint(path, "min_cbm_bits") + if err != nil { + return l3CacheInfo, err + } + numClosids, err := getIntelRdtParamUint(path, "num_closids") + if err != nil { + return l3CacheInfo, err + } + + l3CacheInfo.CbmMask = cbmMask + l3CacheInfo.MinCbmBits = minCbmBits + l3CacheInfo.NumClosids = numClosids + + return l3CacheInfo, nil +} + +// WriteIntelRdtTasks writes the specified pid into the "tasks" file +func WriteIntelRdtTasks(dir string, pid int) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", IntelRdtTasks) + } + + // Dont attach any pid if -1 is specified as a pid + if pid != -1 { + if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err) + } + } + return nil +} + +// Check if Intel RDT is enabled +func IsEnabled() bool { + return isEnabled +} + +// Get the 'container_id' path in Intel RDT "resource control" filesystem +func GetIntelRdtPath(id string) (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, id) + return path, nil +} + +// Applies Intel RDT configuration to the process with the specified pid +func (m *IntelRdtManager) Apply(pid int) (err error) { + // If intelRdt is not specified in config, we do nothing + if m.Config.IntelRdt == nil { + return nil + } + d, err := getIntelRdtData(m.Config, pid) + if err != nil && !IsNotFound(err) { + return err + } + + m.mu.Lock() + defer m.mu.Unlock() + path, err := d.join(m.Id) + if err != nil { + return err + } + + m.Path = path + return nil +} + +// Destroys the Intel RDT 'container_id' group +func (m *IntelRdtManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + if err := os.RemoveAll(m.Path); err != nil { + return err + } + m.Path = "" + return nil +} + +// Returns Intel RDT path to save in a state file and to be able to +// restore the object later +func (m *IntelRdtManager) GetPath() string { + if m.Path == "" { + m.Path, _ = GetIntelRdtPath(m.Id) + } + return m.Path +} + +// Returns statistics for Intel RDT +func (m *IntelRdtManager) GetStats() (*Stats, error) { + // If intelRdt is not specified in config + if m.Config.IntelRdt == nil { + return nil, nil + } + + m.mu.Lock() + defer m.mu.Unlock() + stats := NewStats() + + // The read-only L3 cache information + l3CacheInfo, err := getL3CacheInfo() + if err != nil { + return nil, err + } + stats.L3CacheInfo = l3CacheInfo + + // The read-only L3 cache schema in root + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata") + if err != nil { + return nil, err + } + // L3 cache schema is in the first line + schemaRootStrings := strings.Split(tmpRootStrings, "\n") + stats.L3CacheSchemaRoot = schemaRootStrings[0] + + // The L3 cache schema in 'container_id' group + tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata") + if err != nil { + return nil, err + } + // L3 cache schema is in the first line + schemaStrings := strings.Split(tmpStrings, "\n") + stats.L3CacheSchema = schemaStrings[0] + + return stats, nil +} + +// Set Intel RDT "resource control" filesystem as configured. +func (m *IntelRdtManager) Set(container *configs.Config) error { + path := m.GetPath() + + // About L3 cache schema file: + // The schema has allocation masks/values for L3 cache on each socket, + // which contains L3 cache id and capacity bitmask (CBM). + // Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." + // For example, on a two-socket machine, L3's schema line could be: + // L3:0=ff;1=c0 + // Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + // + // About L3 cache CBM validity: + // The valid L3 cache CBM is a *contiguous bits set* and number of + // bits that can be set is less than the max bit. The max bits in the + // CBM is varied among supported Intel Xeon platforms. In Intel RDT + // "resource control" filesystem layout, the CBM in a group should + // be a subset of the CBM in root. Kernel will check if it is valid + // when writing. + // e.g., 0xfffff in root indicates the max bits of CBM is 20 bits, + // which mapping to entire L3 cache capacity. Some valid CBM values + // to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + if container.IntelRdt != nil { + l3CacheSchema := container.IntelRdt.L3CacheSchema + if l3CacheSchema != "" { + if err := writeFile(path, "schemata", l3CacheSchema); err != nil { + return err + } + } + } + + return nil +} + +func (raw *intelRdtData) join(id string) (string, error) { + path := filepath.Join(raw.root, id) + if err := os.MkdirAll(path, 0755); err != nil { + return "", err + } + + if err := WriteIntelRdtTasks(path, raw.pid); err != nil { + return "", err + } + return path, nil +} + +type NotFoundError struct { + ResourceControl string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl) +} + +func NewNotFoundError(res string) error { + return &NotFoundError{ + ResourceControl: res, + } +} + +func IsNotFound(err error) bool { + if err == nil { + return false + } + _, ok := err.(*NotFoundError) + return ok +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go new file mode 100644 index 000000000..095c0a380 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go @@ -0,0 +1,24 @@ +// +build linux + +package intelrdt + +type L3CacheInfo struct { + CbmMask string `json:"cbm_mask,omitempty"` + MinCbmBits uint64 `json:"min_cbm_bits,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type Stats struct { + // The read-only L3 cache information + L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` + + // The read-only L3 cache schema in root + L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"` + + // The L3 cache schema in 'container_id' group + L3CacheSchema string `json:"l3_cache_schema,omitempty"` +} + +func NewStats() *Stats { + return &Stats{} +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go index 82ffa7a88..ce8b4e6b0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go @@ -29,7 +29,7 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { return err } - res := strings.Split(string(dest), ";") + res := strings.Split(dest, ";") if len(res) < 5 { return fmt.Errorf("Destination buffer for key description is too small") } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go index 8829b71ad..ab453cde9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go @@ -18,6 +18,8 @@ const ( SetgroupAttr uint16 = 27285 OomScoreAdjAttr uint16 = 27286 RootlessAttr uint16 = 27287 + UidmapPathAttr uint16 = 27288 + GidmapPathAttr uint16 = 27289 ) type Int32msg struct { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/mount/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/mount/mount.go new file mode 100644 index 000000000..e8965e081 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/mount/mount.go @@ -0,0 +1,23 @@ +package mount + +// GetMounts retrieves a list of mounts for the current running process. +func GetMounts() ([]*Info, error) { + return parseMountTable() +} + +// Mounted looks at /proc/self/mountinfo to determine of the specified +// mountpoint has been mounted +func Mounted(mountpoint string) (bool, error) { + entries, err := parseMountTable() + if err != nil { + return false, err + } + + // Search the table for the mountpoint + for _, e := range entries { + if e.Mountpoint == mountpoint { + return true, nil + } + } + return false, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/mount/mount_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/mount/mount_linux.go new file mode 100644 index 000000000..1e5191928 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/mount/mount_linux.go @@ -0,0 +1,82 @@ +// +build linux + +package mount + +import ( + "bufio" + "fmt" + "io" + "os" + "strings" +) + +const ( + /* 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + (1) mount ID: unique identifier of the mount (may be reused after umount) + (2) parent ID: ID of parent (or of self for the top of the mount tree) + (3) major:minor: value of st_dev for files on filesystem + (4) root: root of the mount within the filesystem + (5) mount point: mount point relative to the process's root + (6) mount options: per mount options + (7) optional fields: zero or more fields of the form "tag[:value]" + (8) separator: marks the end of the optional fields + (9) filesystem type: name of filesystem of the form "type[.subtype]" + (10) mount source: filesystem specific information or "none" + (11) super options: per super block options*/ + mountinfoFormat = "%d %d %d:%d %s %s %s %s" +) + +// Parse /proc/self/mountinfo because comparing Dev and ino does not work from +// bind mounts +func parseMountTable() ([]*Info, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return nil, err + } + defer f.Close() + + return parseInfoFile(f) +} + +func parseInfoFile(r io.Reader) ([]*Info, error) { + var ( + s = bufio.NewScanner(r) + out = []*Info{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + var ( + p = &Info{} + text = s.Text() + optionalFields string + ) + + if _, err := fmt.Sscanf(text, mountinfoFormat, + &p.ID, &p.Parent, &p.Major, &p.Minor, + &p.Root, &p.Mountpoint, &p.Opts, &optionalFields); err != nil { + return nil, fmt.Errorf("Scanning '%s' failed: %s", text, err) + } + // Safe as mountinfo encodes mountpoints with spaces as \040. + index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + if len(postSeparatorFields) < 3 { + return nil, fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + if optionalFields != "-" { + p.Optional = optionalFields + } + + p.Fstype = postSeparatorFields[0] + p.Source = postSeparatorFields[1] + p.VfsOpts = strings.Join(postSeparatorFields[2:], " ") + out = append(out, p) + } + return out, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/mount/mountinfo.go b/vendor/github.com/opencontainers/runc/libcontainer/mount/mountinfo.go new file mode 100644 index 000000000..e3fc3535e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/mount/mountinfo.go @@ -0,0 +1,40 @@ +package mount + +// Info reveals information about a particular mounted filesystem. This +// struct is populated from the content in the /proc/<pid>/mountinfo file. +type Info struct { + // ID is a unique identifier of the mount (may be reused after umount). + ID int + + // Parent indicates the ID of the mount parent (or of self for the top of the + // mount tree). + Parent int + + // Major indicates one half of the device ID which identifies the device class. + Major int + + // Minor indicates one half of the device ID which identifies a specific + // instance of device. + Minor int + + // Root of the mount within the filesystem. + Root string + + // Mountpoint indicates the mount point relative to the process's root. + Mountpoint string + + // Opts represents mount-specific options. + Opts string + + // Optional represents optional fields. + Optional string + + // Fstype indicates the type of filesystem, such as EXT3. + Fstype string + + // Source indicates filesystem specific information or "none". + Source string + + // VfsOpts represents per super block options. + VfsOpts string +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go index 81587b9b1..47a06783d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go @@ -44,9 +44,9 @@ func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct ch := make(chan struct{}) go func() { defer func() { - close(ch) eventfd.Close() evFile.Close() + close(ch) }() buf := make([]byte, 8) for { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c index 197e6d08e..2c69cee5d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -1,3 +1,4 @@ + #define _GNU_SOURCE #include <endian.h> #include <errno.h> @@ -19,6 +20,7 @@ #include <sys/prctl.h> #include <sys/socket.h> #include <sys/types.h> +#include <sys/wait.h> #include <linux/limits.h> #include <linux/netlink.h> @@ -29,15 +31,15 @@ /* Synchronisation values. */ enum sync_t { - SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ - SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ - SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ - SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ - SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ - SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ + SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ + SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ + SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ + SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ + SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ + SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ /* XXX: This doesn't help with segfaults and other such issues. */ - SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ + SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ }; /* longjmp() arguments. */ @@ -64,7 +66,13 @@ struct clone_t { struct nlconfig_t { char *data; + + /* Process settings. */ uint32_t cloneflags; + char *oom_score_adj; + size_t oom_score_adj_len; + + /* User namespace settings. */ char *uidmap; size_t uidmap_len; char *gidmap; @@ -72,9 +80,13 @@ struct nlconfig_t { char *namespaces; size_t namespaces_len; uint8_t is_setgroup; + + /* Rootless container settings. */ uint8_t is_rootless; - char *oom_score_adj; - size_t oom_score_adj_len; + char *uidmappath; + size_t uidmappath_len; + char *gidmappath; + size_t gidmappath_len; }; /* @@ -89,6 +101,8 @@ struct nlconfig_t { #define SETGROUP_ATTR 27285 #define OOM_SCORE_ADJ_ATTR 27286 #define ROOTLESS_ATTR 27287 +#define UIDMAPPATH_ATTR 27288 +#define GIDMAPPATH_ATTR 27289 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -152,7 +166,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...) goto out; } -out: + out: close(fd); return ret; } @@ -169,16 +183,16 @@ static void update_setgroups(int pid, enum policy_t setgroup) char *policy; switch (setgroup) { - case SETGROUPS_ALLOW: - policy = "allow"; - break; - case SETGROUPS_DENY: - policy = "deny"; - break; - case SETGROUPS_DEFAULT: - default: - /* Nothing to do. */ - return; + case SETGROUPS_ALLOW: + policy = "allow"; + break; + case SETGROUPS_DENY: + policy = "deny"; + break; + case SETGROUPS_DEFAULT: + default: + /* Nothing to do. */ + return; } if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { @@ -191,22 +205,96 @@ static void update_setgroups(int pid, enum policy_t setgroup) } } -static void update_uidmap(int pid, char *map, size_t map_len) +static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) +{ + int child; + + /* + * If @app is NULL, execve will segfault. Just check it here and bail (if + * we're in this path, the caller is already getting desparate and there + * isn't a backup to this failing). This usually would be a configuration + * or programming issue. + */ + if (!app) + bail("mapping tool not present"); + + child = fork(); + if (child < 0) + bail("failed to fork"); + + if (!child) { +#define MAX_ARGV 20 + char *argv[MAX_ARGV]; + char *envp[] = { NULL }; + char pid_fmt[16]; + int argc = 0; + char *next; + + snprintf(pid_fmt, 16, "%d", pid); + + argv[argc++] = (char *)app; + argv[argc++] = pid_fmt; + /* + * Convert the map string into a list of argument that + * newuidmap/newgidmap can understand. + */ + + while (argc < MAX_ARGV) { + if (*map == '\0') { + argv[argc++] = NULL; + break; + } + argv[argc++] = map; + next = strpbrk(map, "\n "); + if (next == NULL) + break; + *next++ = '\0'; + map = next + strspn(next, "\n "); + } + + execve(app, argv, envp); + bail("failed to execv"); + } else { + int status; + + while (true) { + if (waitpid(child, &status, 0) < 0) { + if (errno == EINTR) + continue; + bail("failed to waitpid"); + } + if (WIFEXITED(status) || WIFSIGNALED(status)) + return WEXITSTATUS(status); + } + } + + return -1; +} + +static void update_uidmap(const char *path, int pid, char *map, size_t map_len) { if (map == NULL || map_len <= 0) return; - if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) - bail("failed to update /proc/%d/uid_map", pid); + if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { + if (errno != EPERM) + bail("failed to update /proc/%d/uid_map", pid); + if (try_mapping_tool(path, pid, map, map_len)) + bail("failed to use newuid map on %d", pid); + } } -static void update_gidmap(int pid, char *map, size_t map_len) +static void update_gidmap(const char *path, int pid, char *map, size_t map_len) { if (map == NULL || map_len <= 0) return; - if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) - bail("failed to update /proc/%d/gid_map", pid); + if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { + if (errno != EPERM) + bail("failed to update /proc/%d/gid_map", pid); + if (try_mapping_tool(path, pid, map, map_len)) + bail("failed to use newgid map on %d", pid); + } } static void update_oom_score_adj(char *data, size_t len) @@ -230,7 +318,7 @@ static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); static int clone_parent(jmp_buf *env, int jmpval) { struct clone_t ca = { - .env = env, + .env = env, .jmpval = jmpval, }; @@ -350,6 +438,14 @@ static void nl_parse(int fd, struct nlconfig_t *config) config->gidmap = current; config->gidmap_len = payload_len; break; + case UIDMAPPATH_ATTR: + config->uidmappath = current; + config->uidmappath_len = payload_len; + break; + case GIDMAPPATH_ATTR: + config->gidmappath = current; + config->gidmappath_len = payload_len; + break; case SETGROUP_ATTR: config->is_setgroup = readint8(current); break; @@ -436,7 +532,7 @@ void nsexec(void) int pipenum; jmp_buf env; int sync_child_pipe[2], sync_grandchild_pipe[2]; - struct nlconfig_t config = {0}; + struct nlconfig_t config = { 0 }; /* * If we don't have an init pipe, just return to the go routine. @@ -533,21 +629,21 @@ void nsexec(void) */ switch (setjmp(env)) { - /* - * Stage 0: We're in the parent. Our job is just to create a new child - * (stage 1: JUMP_CHILD) process and write its uid_map and - * gid_map. That process will go on to create a new process, then - * it will send us its PID which we will send to the bootstrap - * process. - */ - case JUMP_PARENT: { + /* + * Stage 0: We're in the parent. Our job is just to create a new child + * (stage 1: JUMP_CHILD) process and write its uid_map and + * gid_map. That process will go on to create a new process, then + * it will send us its PID which we will send to the bootstrap + * process. + */ + case JUMP_PARENT:{ int len; - pid_t child; + pid_t child, first_child = -1; char buf[JSON_MAX]; bool ready = false; /* For debugging. */ - prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0); + prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); /* Start the process of getting a container. */ child = clone_parent(&env, JUMP_CHILD); @@ -596,8 +692,8 @@ void nsexec(void) update_setgroups(child, SETGROUPS_DENY); /* Set up mappings. */ - update_uidmap(child, config.uidmap, config.uidmap_len); - update_gidmap(child, config.gidmap, config.gidmap_len); + update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); + update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); s = SYNC_USERMAP_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { @@ -605,19 +701,19 @@ void nsexec(void) bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); } break; - case SYNC_RECVPID_PLS: { - pid_t old = child; + case SYNC_RECVPID_PLS:{ + first_child = child; /* Get the init_func pid. */ if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(old, SIGKILL); + kill(first_child, SIGKILL); bail("failed to sync with child: read(childpid)"); } /* Send ACK. */ s = SYNC_RECVPID_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(old, SIGKILL); + kill(first_child, SIGKILL); kill(child, SIGKILL); bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); } @@ -665,8 +761,13 @@ void nsexec(void) } } - /* Send the init_func pid back to our parent. */ - len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child); + /* + * Send the init_func pid and the pid of the first child back to our parent. + * + * We need to send both back because we can't reap the first child we created (CLONE_PARENT). + * It becomes the responsibility of our parent to reap the first child. + */ + len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); if (len < 0) { kill(child, SIGKILL); bail("unable to generate JSON for child pid"); @@ -679,16 +780,16 @@ void nsexec(void) exit(0); } - /* - * Stage 1: We're in the first child process. Our job is to join any - * provided namespaces in the netlink payload and unshare all - * of the requested namespaces. If we've been asked to - * CLONE_NEWUSER, we will ask our parent (stage 0) to set up - * our user mappings for us. Then, we create a new child - * (stage 2: JUMP_INIT) for PID namespace. We then send the - * child's PID to our parent (stage 0). - */ - case JUMP_CHILD: { + /* + * Stage 1: We're in the first child process. Our job is to join any + * provided namespaces in the netlink payload and unshare all + * of the requested namespaces. If we've been asked to + * CLONE_NEWUSER, we will ask our parent (stage 0) to set up + * our user mappings for us. Then, we create a new child + * (stage 2: JUMP_INIT) for PID namespace. We then send the + * child's PID to our parent (stage 0). + */ + case JUMP_CHILD:{ pid_t child; enum sync_t s; @@ -697,7 +798,7 @@ void nsexec(void) close(sync_child_pipe[1]); /* For debugging. */ - prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0); + prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); /* * We need to setns first. We cannot do this earlier (in stage 0) @@ -799,13 +900,13 @@ void nsexec(void) exit(0); } - /* - * Stage 2: We're the final child process, and the only process that will - * actually return to the Go runtime. Our job is to just do the - * final cleanup steps and then return to the Go runtime to allow - * init_linux.go to run. - */ - case JUMP_INIT: { + /* + * Stage 2: We're the final child process, and the only process that will + * actually return to the Go runtime. Our job is to just do the + * final cleanup steps and then return to the Go runtime to allow + * init_linux.go to run. + */ + case JUMP_INIT:{ /* * We're inside the child now, having jumped from the * start_child() code after forking in the parent. @@ -819,7 +920,7 @@ void nsexec(void) close(sync_child_pipe[1]); /* For debugging. */ - prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0); + prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process.go b/vendor/github.com/opencontainers/runc/libcontainer/process.go index f1ad08149..86bf7387f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process.go @@ -47,6 +47,10 @@ type Process struct { // ExtraFiles specifies additional open files to be inherited by the container ExtraFiles []*os.File + // Initial sizings for the console + ConsoleWidth uint16 + ConsoleHeight uint16 + // Capabilities specify the capabilities to keep when executing the process inside the container // All capabilities not specified will be dropped from the processes capability mask Capabilities *configs.Capabilities diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go index 171685ccd..58980b059 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -15,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" @@ -49,6 +50,7 @@ type setnsProcess struct { parentPipe *os.File childPipe *os.File cgroupPaths map[string]string + intelRdtPath string config *initConfig fds []string process *Process @@ -83,12 +85,20 @@ func (p *setnsProcess) start() (err error) { if err = p.execSetns(); err != nil { return newSystemErrorWithCause(err, "executing setns process") } - // We can't join cgroups if we're in a rootless container. - if !p.config.Rootless && len(p.cgroupPaths) > 0 { + if len(p.cgroupPaths) > 0 { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } + if p.intelRdtPath != "" { + // if Intel RDT "resource control" filesystem path exists + _, err := os.Stat(p.intelRdtPath) + if err == nil { + if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil { + return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid()) + } + } + } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { @@ -141,6 +151,16 @@ func (p *setnsProcess) execSetns() error { p.cmd.Wait() return newSystemErrorWithCause(err, "reading pid from init pipe") } + + // Clean up the zombie parent process + firstChildProcess, err := os.FindProcess(pid.PidFirstChild) + if err != nil { + return err + } + + // Ignore the error in case the child has already been reaped for any reason + _, _ = firstChildProcess.Wait() + process, err := os.FindProcess(pid.Pid) if err != nil { return err @@ -183,17 +203,17 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { } type initProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - config *initConfig - manager cgroups.Manager - container *linuxContainer - fds []string - process *Process - bootstrapData io.Reader - sharePidns bool - rootDir *os.File + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + config *initConfig + manager cgroups.Manager + intelRdtManager intelrdt.Manager + container *linuxContainer + fds []string + process *Process + bootstrapData io.Reader + sharePidns bool } func (p *initProcess) pid() int { @@ -224,6 +244,16 @@ func (p *initProcess) execSetns() error { p.cmd.Wait() return err } + + // Clean up the zombie parent process + firstChildProcess, err := os.FindProcess(pid.PidFirstChild) + if err != nil { + return err + } + + // Ignore the error in case the child has already been reaped for any reason + _, _ = firstChildProcess.Wait() + process, err := os.FindProcess(pid.Pid) if err != nil { return err @@ -238,17 +268,39 @@ func (p *initProcess) start() error { err := p.cmd.Start() p.process.ops = p p.childPipe.Close() - p.rootDir.Close() if err != nil { p.process.ops = nil return newSystemErrorWithCause(err, "starting init process command") } + // Do this before syncing with child so that no children can escape the + // cgroup. We don't need to worry about not doing this and not being root + // because we'd be using the rootless cgroup manager in that case. + if err := p.manager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying cgroup configuration for process") + } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") + } + } + defer func() { + if err != nil { + // TODO: should not be the responsibility to call here + p.manager.Destroy() + if p.intelRdtManager != nil { + p.intelRdtManager.Destroy() + } + } + }() + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } + if err := p.execSetns(); err != nil { return newSystemErrorWithCause(err, "running exec setns process for init") } + // Save the standard descriptor names before the container process // can potentially move them (e.g., via dup2()). If we don't do this now, // we won't know at checkpoint time which file descriptor to look up. @@ -257,18 +309,6 @@ func (p *initProcess) start() error { return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children can escape the - // cgroup. We don't need to worry about not doing this and not being root - // because we'd be using the rootless cgroup manager in that case. - if err := p.manager.Apply(p.pid()); err != nil { - return newSystemErrorWithCause(err, "applying cgroup configuration for process") - } - defer func() { - if err != nil { - // TODO: should not be the responsibility to call here - p.manager.Destroy() - } - }() if err := p.createNetworkInterfaces(); err != nil { return newSystemErrorWithCause(err, "creating network interfaces") } @@ -294,13 +334,20 @@ func (p *initProcess) start() error { if err := p.manager.Set(p.config.Config); err != nil { return newSystemErrorWithCause(err, "setting cgroup config for ready process") } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting Intel RDT config for ready process") + } + } if p.config.Config.Hooks != nil { + bundle, annotations := utils.Annotations(p.container.config.Labels) s := configs.HookState{ - Version: p.container.config.Version, - ID: p.container.id, - Pid: p.pid(), - Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"), + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Bundle: bundle, + Annotations: annotations, } for i, hook := range p.config.Config.Hooks.Prestart { if err := hook.Run(s); err != nil { @@ -319,12 +366,19 @@ func (p *initProcess) start() error { if err := p.manager.Set(p.config.Config); err != nil { return newSystemErrorWithCause(err, "setting cgroup config for procHooks process") } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process") + } + } if p.config.Config.Hooks != nil { + bundle, annotations := utils.Annotations(p.container.config.Labels) s := configs.HookState{ - Version: p.container.config.Version, - ID: p.container.id, - Pid: p.pid(), - Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"), + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Bundle: bundle, + Annotations: annotations, } for i, hook := range p.config.Config.Hooks.Prestart { if err := hook.Run(s); err != nil { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go index e2e734a85..73ee2bd69 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -13,11 +13,11 @@ import ( "strings" "time" - "github.com/docker/docker/pkg/mount" - "github.com/docker/docker/pkg/symlink" + "github.com/cyphar/filepath-securejoin" "github.com/mrunalp/fileutils" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/mount" "github.com/opencontainers/runc/libcontainer/system" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/selinux/go-selinux/label" @@ -40,7 +40,8 @@ func needsSetupDev(config *configs.Config) bool { // prepareRootfs sets up the devices, mount points, and filesystems for use // inside a new mount namespace. It doesn't set anything as ro. You must call // finalizeRootfs after this function to finish setting up the rootfs. -func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) { +func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { + config := iConfig.Config if err := prepareRoot(config); err != nil { return newSystemErrorWithCause(err, "preparing rootfs") } @@ -80,6 +81,7 @@ func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) { // The hooks are run after the mounts are setup, but before we switch to the new // root, so that the old root is still available in the hooks for any mount // manipulations. + // Note that iConfig.Cwd is not guaranteed to exist here. if err := syncParentHooks(pipe); err != nil { return err } @@ -98,8 +100,10 @@ func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) { if config.NoPivotRoot { err = msMoveRoot(config.Rootfs) - } else { + } else if config.Namespaces.Contains(configs.NEWNS) { err = pivotRoot(config.Rootfs) + } else { + err = chroot(config.Rootfs) } if err != nil { return newSystemErrorWithCause(err, "jailing process inside rootfs") @@ -111,6 +115,14 @@ func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) { } } + if cwd := iConfig.Cwd; cwd != "" { + // Note that spec.Process.Cwd can contain unclean value like "../../../../foo/bar...". + // However, we are safe to call MkDirAll directly because we are in the jail here. + if err := os.MkdirAll(cwd, 0755); err != nil { + return err + } + } + return nil } @@ -230,7 +242,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { // any previous mounts can invalidate the next mount's destination. // this can happen when a user specifies mounts within other mounts to cause breakouts or other // evil stuff to try to escape the container's rootfs. - if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil { + if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { return err } if err := checkMountDestination(rootfs, dest); err != nil { @@ -318,7 +330,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { // this can happen when a user specifies mounts within other mounts to cause breakouts or other // evil stuff to try to escape the container's rootfs. var err error - if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil { + if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { return err } if err := checkMountDestination(rootfs, dest); err != nil { @@ -668,9 +680,12 @@ func pivotRoot(rootfs string) error { return err } - // Make oldroot rprivate to make sure our unmounts don't propagate to the - // host (and thus bork the machine). - if err := unix.Mount("", ".", "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil { + // Make oldroot rslave to make sure our unmounts don't propagate to the + // host (and thus bork the machine). We don't use rprivate because this is + // known to cause issues due to races where we still have a reference to a + // mount while a process in the host namespace are trying to operate on + // something they think has no mounts (devicemapper in particular). + if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { return err } // Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd. @@ -689,6 +704,10 @@ func msMoveRoot(rootfs string) error { if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil { return err } + return chroot(rootfs) +} + +func chroot(rootfs string) error { if err := unix.Chroot("."); err != nil { return err } @@ -733,7 +752,14 @@ func remountReadonly(m *configs.Mount) error { flags = m.Flags ) for i := 0; i < 5; i++ { - if err := unix.Mount("", dest, "", uintptr(flags|unix.MS_REMOUNT|unix.MS_RDONLY), ""); err != nil { + // There is a special case in the kernel for + // MS_REMOUNT | MS_BIND, which allows us to change only the + // flags even as an unprivileged user (i.e. user namespace) + // assuming we don't drop any security related flags (nodev, + // nosuid, etc.). So, let's use that case so that we can do + // this re-mount without failing in a userns. + flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY + if err := unix.Mount("", dest, "", uintptr(flags), ""); err != nil { switch err { case unix.EBUSY: time.Sleep(100 * time.Millisecond) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go index 2523cbf99..d99f3fe64 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -22,6 +22,11 @@ var ( actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) ) +const ( + // Linux system calls can have at most 6 arguments + syscallMaxArguments int = 6 +) + // Filters given syscalls in a container, preventing them from being used // Started in the container init process, and carried over to all child processes // Setns calls, however, require a separate invocation, as they are not children @@ -45,11 +50,11 @@ func InitSeccomp(config *configs.Seccomp) error { for _, arch := range config.Architectures { scmpArch, err := libseccomp.GetArchFromString(arch) if err != nil { - return err + return fmt.Errorf("error validating Seccomp architecture: %s", err) } if err := filter.AddArch(scmpArch); err != nil { - return err + return fmt.Errorf("error adding architecture to seccomp filter: %s", err) } } @@ -170,29 +175,55 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { // Convert the call's action to the libseccomp equivalent callAct, err := getAction(call.Action) if err != nil { - return err + return fmt.Errorf("action in seccomp profile is invalid: %s", err) } // Unconditional match - just add the rule if len(call.Args) == 0 { if err = filter.AddRule(callNum, callAct); err != nil { - return err + return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err) } } else { - // Conditional match - convert the per-arg rules into library format + // If two or more arguments have the same condition, + // Revert to old behavior, adding each condition as a separate rule + argCounts := make([]uint, syscallMaxArguments) conditions := []libseccomp.ScmpCondition{} for _, cond := range call.Args { newCond, err := getCondition(cond) if err != nil { - return err + return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err) } + argCounts[cond.Index] += 1 + conditions = append(conditions, newCond) } - if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil { - return err + hasMultipleArgs := false + for _, count := range argCounts { + if count > 1 { + hasMultipleArgs = true + break + } + } + + if hasMultipleArgs { + // Revert to old behavior + // Add each condition attached to a separate rule + for _, cond := range conditions { + condArr := []libseccomp.ScmpCondition{cond} + + if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err) + } + } + } else { + // No conditions share same argument + // Use new, proper behavior + if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err) + } } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setgroups_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setgroups_linux.go deleted file mode 100644 index c7bdb605a..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/setgroups_linux.go +++ /dev/null @@ -1,11 +0,0 @@ -// +build linux,go1.5 - -package libcontainer - -import "syscall" - -// Set the GidMappingsEnableSetgroups member to true, so the process's -// setgroups proc entry wont be set to 'deny' if GidMappings are set -func enableSetgroups(sys *syscall.SysProcAttr) { - sys.GidMappingsEnableSetgroups = true -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go index 35b84219c..096c601e7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go @@ -47,7 +47,10 @@ func (l *linuxSetnsInit) Init() error { return err } } - if l.config.Config.Seccomp != nil { + // Without NoNewPrivileges seccomp is a privileged operation, so we need to + // do this before dropping capabilities; otherwise do it as late as possible + // just before execve so as few syscalls take place after it as possible. + if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { return err } @@ -61,5 +64,13 @@ func (l *linuxSetnsInit) Init() error { if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { return err } + // Set seccomp as close to execve as possible, so as few syscalls take + // place afterward (reducing the amount of syscalls that users need to + // enable in their seccomp profiles). + if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { + if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "init seccomp") + } + } return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go index 580b3fe45..02ea753ed 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -22,7 +22,7 @@ type linuxStandardInit struct { pipe *os.File consoleSocket *os.File parentPid int - stateDirFD int + fifoFd int config *initConfig } @@ -30,15 +30,15 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { var newperms uint32 if l.config.Config.Namespaces.Contains(configs.NEWUSER) { - // with user ns we need 'other' search permissions + // With user ns we need 'other' search permissions. newperms = 0x8 } else { - // without user ns we need 'UID' search permissions + // Without user ns we need 'UID' search permissions. newperms = 0x80000 } - // create a unique per session container name that we can - // join in setns; however, other containers can also join it + // Create a unique per session container name that we can join in setns; + // However, other containers can also join it. return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms } @@ -46,12 +46,12 @@ func (l *linuxStandardInit) Init() error { if !l.config.Config.NoNewKeyring { ringname, keepperms, newperms := l.getSessionRingParams() - // do not inherit the parent's session keyring + // Do not inherit the parent's session keyring. sessKeyId, err := keys.JoinSessionKeyring(ringname) if err != nil { return err } - // make session keyring searcheable + // Make session keyring searcheable. if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { return err } @@ -65,14 +65,9 @@ func (l *linuxStandardInit) Init() error { } label.Init() - - // prepareRootfs() can be executed only for a new mount namespace. - if l.config.Config.Namespaces.Contains(configs.NEWNS) { - if err := prepareRootfs(l.pipe, l.config.Config); err != nil { - return err - } + if err := prepareRootfs(l.pipe, l.config); err != nil { + return err } - // Set up the console. This has to be done *before* we finalize the rootfs, // but *after* we've given the user the chance to set up all of the mounts // they wanted. @@ -150,37 +145,47 @@ func (l *linuxStandardInit) Init() error { if err := pdeath.Restore(); err != nil { return err } - // compare the parent from the initial start of the init process and make sure that it did not change. - // if the parent changes that means it died and we were reparented to something else so we should - // just kill ourself and not cause problems for someone else. + // Compare the parent from the initial start of the init process and make + // sure that it did not change. if the parent changes that means it died + // and we were reparented to something else so we should just kill ourself + // and not cause problems for someone else. if unix.Getppid() != l.parentPid { return unix.Kill(unix.Getpid(), unix.SIGKILL) } - // check for the arg before waiting to make sure it exists and it is returned - // as a create time error. + // Check for the arg before waiting to make sure it exists and it is + // returned as a create time error. name, err := exec.LookPath(l.config.Args[0]) if err != nil { return err } - // close the pipe to signal that we have completed our init. + // Close the pipe to signal that we have completed our init. l.pipe.Close() - // wait for the fifo to be opened on the other side before - // exec'ing the users process. - fd, err := unix.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|unix.O_CLOEXEC, 0) + // Wait for the FIFO to be opened on the other side before exec-ing the + // user process. We open it through /proc/self/fd/$fd, because the fd that + // was given to us was an O_PATH fd to the fifo itself. Linux allows us to + // re-open an O_PATH fd through /proc. + fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0) if err != nil { - return newSystemErrorWithCause(err, "openat exec fifo") + return newSystemErrorWithCause(err, "open exec fifo") } if _, err := unix.Write(fd, []byte("0")); err != nil { return newSystemErrorWithCause(err, "write 0 exec fifo") } + // Close the O_PATH fifofd fd before exec because the kernel resets + // dumpable in the wrong order. This has been fixed in newer kernels, but + // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels. + // N.B. the core issue itself (passing dirfds to the host filesystem) has + // since been resolved. + // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 + unix.Close(l.fifoFd) + // Set seccomp as close to execve as possible, so as few syscalls take + // place afterward (reducing the amount of syscalls that users need to + // enable in their seccomp profiles). if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { return newSystemErrorWithCause(err, "init seccomp") } } - // close the statedir fd before exec because the kernel resets dumpable in the wrong order - // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 - unix.Close(l.stateDirFD) if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { return newSystemErrorWithCause(err, "exec user process") } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go index 44fa6b43a..b45ce23e4 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go @@ -45,6 +45,11 @@ func destroy(c *linuxContainer) error { } } err := c.cgroupManager.Destroy() + if c.intelRdtManager != nil { + if ierr := c.intelRdtManager.Destroy(); err == nil { + err = ierr + } + } if rerr := os.RemoveAll(c.root); err == nil { err = rerr } @@ -58,10 +63,12 @@ func destroy(c *linuxContainer) error { func runPoststopHooks(c *linuxContainer) error { if c.config.Hooks != nil { + bundle, annotations := utils.Annotations(c.config.Labels) s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Bundle: utils.SearchLabels(c.config.Labels, "bundle"), + Version: c.config.Version, + ID: c.id, + Bundle: bundle, + Annotations: annotations, } for _, hook := range c.config.Hooks.Poststop { if err := hook.Run(s); err != nil { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_freebsd.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_freebsd.go deleted file mode 100644 index f8d1d689c..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/stats_freebsd.go +++ /dev/null @@ -1,5 +0,0 @@ -package libcontainer - -type Stats struct { - Interfaces []*NetworkInterface -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go index c629dc67d..29fd641e9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go @@ -1,8 +1,10 @@ package libcontainer import "github.com/opencontainers/runc/libcontainer/cgroups" +import "github.com/opencontainers/runc/libcontainer/intelrdt" type Stats struct { - Interfaces []*NetworkInterface - CgroupStats *cgroups.Stats + Interfaces []*NetworkInterface + CgroupStats *cgroups.Stats + IntelRdtStats *intelrdt.Stats } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go deleted file mode 100644 index da78c1c2e..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go +++ /dev/null @@ -1,7 +0,0 @@ -package libcontainer - -// Solaris - TODO - -type Stats struct { - Interfaces []*NetworkInterface -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_windows.go deleted file mode 100644 index f8d1d689c..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/stats_windows.go +++ /dev/null @@ -1,5 +0,0 @@ -package libcontainer - -type Stats struct { - Interfaces []*NetworkInterface -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go index 4837085a7..5f124cd8b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -134,3 +134,14 @@ func RunningInUserNS() bool { func SetSubreaper(i int) error { return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) } + +// GetSubreaper returns the subreaper setting for the calling process +func GetSubreaper() (int, error) { + var i uintptr + + if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil { + return -1, err + } + + return int(i), nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_arm.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_32.go index 31ff3deb1..c5ca5d862 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_arm.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_32.go @@ -1,4 +1,5 @@ -// +build linux,arm +// +build linux +// +build 386 arm package system diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_386.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_386.go deleted file mode 100644 index 3f7235ed1..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_386.go +++ /dev/null @@ -1,25 +0,0 @@ -// +build linux,386 - -package system - -import ( - "golang.org/x/sys/unix" -) - -// Setuid sets the uid of the calling thread to the specified uid. -func Setuid(uid int) (err error) { - _, _, e1 := unix.RawSyscall(unix.SYS_SETUID32, uintptr(uid), 0, 0) - if e1 != 0 { - err = e1 - } - return -} - -// Setgid sets the gid of the calling thread to the specified gid. -func Setgid(gid int) (err error) { - _, _, e1 := unix.RawSyscall(unix.SYS_SETGID32, uintptr(gid), 0, 0) - if e1 != 0 { - err = e1 - } - return -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go index d7891a2ff..11c3faafb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go @@ -1,4 +1,5 @@ -// +build linux,arm64 linux,amd64 linux,ppc linux,ppc64 linux,ppc64le linux,s390x +// +build linux +// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le s390x package system diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go b/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go index b3a07cba3..b8434f105 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go @@ -1,4 +1,4 @@ -// +build cgo,linux cgo,freebsd +// +build cgo,linux package system diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unsupported.go deleted file mode 100644 index 4a8d00acb..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unsupported.go +++ /dev/null @@ -1,38 +0,0 @@ -// +build !darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris - -package user - -import ( - "io" - "syscall" -) - -func GetPasswdPath() (string, error) { - return "", ErrUnsupported -} - -func GetPasswd() (io.ReadCloser, error) { - return nil, ErrUnsupported -} - -func GetGroupPath() (string, error) { - return "", ErrUnsupported -} - -func GetGroup() (io.ReadCloser, error) { - return nil, ErrUnsupported -} - -// CurrentUser looks up the current user by their user id in /etc/passwd. If the -// user cannot be found (or there is no /etc/passwd file on the filesystem), -// then CurrentUser returns an error. -func CurrentUser() (User, error) { - return LookupUid(syscall.Getuid()) -} - -// CurrentGroup looks up the current user's group by their primary group id's -// entry in /etc/passwd. If the group cannot be found (or there is no -// /etc/group file on the filesystem), then CurrentGroup returns an error. -func CurrentGroup() (Group, error) { - return LookupGid(syscall.Getgid()) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go index 2cbb6491a..c8a9364d5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go @@ -84,12 +84,10 @@ func RecvFd(socket *os.File) (*os.File, error) { // addition, the file.Name() of the given file will also be sent as // non-auxiliary data in the same payload (allowing to send contextual // information for a file descriptor). -func SendFd(socket, file *os.File) error { - name := []byte(file.Name()) +func SendFd(socket *os.File, name string, fd uintptr) error { if len(name) >= MaxNameLen { - return fmt.Errorf("sendfd: filename too long: %s", file.Name()) + return fmt.Errorf("sendfd: filename too long: %s", name) } - oob := unix.UnixRights(int(file.Fd())) - - return unix.Sendmsg(int(socket.Fd()), name, oob, nil, 0) + oob := unix.UnixRights(int(fd)) + return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0) } |