summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cmd/podman/spec.go46
-rw-r--r--test/e2e/run_privileged_test.go (renamed from test/e2e/privileged_test.go)13
-rw-r--r--vendor.conf4
-rw-r--r--vendor/github.com/containerd/console/LICENSE201
-rw-r--r--vendor/github.com/containerd/console/README.md17
-rw-r--r--vendor/github.com/containerd/console/console.go62
-rw-r--r--vendor/github.com/containerd/console/console_linux.go255
-rw-r--r--vendor/github.com/containerd/console/console_unix.go142
-rw-r--r--vendor/github.com/containerd/console/console_windows.go200
-rw-r--r--vendor/github.com/containerd/console/tc_darwin.go37
-rw-r--r--vendor/github.com/containerd/console/tc_freebsd.go29
-rw-r--r--vendor/github.com/containerd/console/tc_linux.go37
-rw-r--r--vendor/github.com/containerd/console/tc_solaris_cgo.go35
-rw-r--r--vendor/github.com/containerd/console/tc_solaris_nocgo.go31
-rw-r--r--vendor/github.com/containerd/console/tc_unix.go75
-rw-r--r--vendor/github.com/cyphar/filepath-securejoin/LICENSE28
-rw-r--r--vendor/github.com/cyphar/filepath-securejoin/README.md65
-rw-r--r--vendor/github.com/cyphar/filepath-securejoin/join.go135
-rw-r--r--vendor/github.com/cyphar/filepath-securejoin/vendor.conf1
-rw-r--r--vendor/github.com/cyphar/filepath-securejoin/vfs.go41
-rw-r--r--vendor/github.com/opencontainers/runc/README.md2
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go37
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go13
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go13
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go128
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go4
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go16
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go10
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go6
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/configs/config.go4
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go2
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go7
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go68
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go17
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/console.go17
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go13
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/console_linux.go129
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go11
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/console_windows.go30
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/container_linux.go295
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go20
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/container_windows.go20
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go5
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go6
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go (renamed from vendor/github.com/opencontainers/runc/libcontainer/devices/devices_linux.go)22
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/devices/devices_unsupported.go3
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/devices/number.go24
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go91
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/init_linux.go110
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go553
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go24
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go2
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/message_linux.go2
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/mount/mount.go23
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/mount/mount_linux.go82
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/mount/mountinfo.go40
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go2
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c231
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/process.go4
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/process_linux.go122
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go46
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go47
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/setgroups_linux.go11
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go13
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go59
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/state_linux.go13
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/stats_freebsd.go5
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go6
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go7
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/stats_windows.go5
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/system/linux.go11
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_32.go (renamed from vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_arm.go)3
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_386.go25
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go3
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go2
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unsupported.go38
-rw-r--r--vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go10
-rw-r--r--vendor/github.com/opencontainers/runc/vendor.conf10
78 files changed, 3129 insertions, 847 deletions
diff --git a/cmd/podman/spec.go b/cmd/podman/spec.go
index 3a2402d0e..c5ed2c0d4 100644
--- a/cmd/podman/spec.go
+++ b/cmd/podman/spec.go
@@ -2,6 +2,7 @@ package main
import (
"io/ioutil"
+ "os"
"strconv"
"strings"
@@ -10,6 +11,7 @@ import (
"github.com/docker/docker/pkg/mount"
"github.com/docker/docker/profiles/seccomp"
"github.com/docker/go-units"
+ "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
spec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/runtime-tools/generate"
@@ -23,6 +25,9 @@ import (
const cpuPeriod = 100000
+func u32Ptr(i int64) *uint32 { u := uint32(i); return &u }
+func fmPtr(i int64) *os.FileMode { fm := os.FileMode(i); return &fm }
+
func blockAccessToKernelFilesystems(config *createConfig, g *generate.Generator) {
if !config.Privileged {
for _, mp := range []string{
@@ -241,10 +246,19 @@ func createConfigToOCISpec(config *createConfig) (*spec.Spec, error) {
}
// Devices
- for _, device := range config.Devices {
- if err := addDevice(&g, device); err != nil {
+ if config.Privileged {
+ // If privileged, we need to add all the host devices to the
+ // spec. We do not add the user provided ones because we are
+ // already adding them all.
+ if err := config.AddPrivilegedDevices(&g); err != nil {
return nil, err
}
+ } else {
+ for _, device := range config.Devices {
+ if err := addDevice(&g, device); err != nil {
+ return nil, err
+ }
+ }
}
// SECURITY OPTS
@@ -685,3 +699,31 @@ func (c *createConfig) CreatePortBindings() ([]ocicni.PortMapping, error) {
}
return portBindings, nil
}
+
+// AddPrivilegedDevices iterates through host devices and adds all
+// host devices to the spec
+func (c *createConfig) AddPrivilegedDevices(g *generate.Generator) error {
+ hostDevices, err := devices.HostDevices()
+ if err != nil {
+ return err
+ }
+ g.ClearLinuxDevices()
+ for _, d := range hostDevices {
+ g.AddDevice(Device(d))
+ }
+ g.AddLinuxResourcesDevice(true, "", nil, nil, "rwm")
+ return nil
+}
+
+// Device transforms a libcontainer configs.Device to a specs.LinuxDevice object.
+func Device(d *configs.Device) spec.LinuxDevice {
+ return spec.LinuxDevice{
+ Type: string(d.Type),
+ Path: d.Path,
+ Major: d.Major,
+ Minor: d.Minor,
+ FileMode: fmPtr(int64(d.FileMode)),
+ UID: u32Ptr(int64(d.Uid)),
+ GID: u32Ptr(int64(d.Gid)),
+ }
+}
diff --git a/test/e2e/privileged_test.go b/test/e2e/run_privileged_test.go
index b660e1b55..b53be15f0 100644
--- a/test/e2e/privileged_test.go
+++ b/test/e2e/run_privileged_test.go
@@ -73,4 +73,17 @@ var _ = Describe("Podman privileged container tests", func() {
Expect(capAmp[1]).To(Equal(capEff[1]))
})
+ It("podman non-privileged should have very few devices", func() {
+ session := podmanTest.Podman([]string{"run", "busybox", "ls", "-l", "/dev"})
+ session.WaitWithDefaultTimeout()
+ Expect(session.ExitCode()).To(Equal(0))
+ Expect(len(session.OutputToStringArray())).To(Equal(18))
+ })
+
+ It("podman privileged should inherit host devices", func() {
+ session := podmanTest.Podman([]string{"run", "--privileged", ALPINE, "ls", "-l", "/dev"})
+ session.WaitWithDefaultTimeout()
+ Expect(session.ExitCode()).To(Equal(0))
+ Expect(len(session.OutputToStringArray())).To(BeNumerically(">", 20))
+ })
})
diff --git a/vendor.conf b/vendor.conf
index 7df5040ff..559f5c66d 100644
--- a/vendor.conf
+++ b/vendor.conf
@@ -9,7 +9,7 @@ google.golang.org/grpc v1.0.4 https://github.com/grpc/grpc-go
github.com/opencontainers/selinux b29023b86e4a69d1b46b7e7b4e2b6fda03f0b9cd
github.com/opencontainers/go-digest v1.0.0-rc0
github.com/opencontainers/runtime-tools 625e2322645b151a7cbb93a8b42920933e72167f
-github.com/opencontainers/runc 45bde006ca8c90e089894508708bcf0e2cdf9e13
+github.com/opencontainers/runc 6e15bc3f92fd4c58b3285e8f27eaeb6b22d62920
github.com/mrunalp/fileutils master
github.com/vishvananda/netlink master
github.com/vishvananda/netns master
@@ -99,3 +99,5 @@ github.com/containerd/continuity master
github.com/xeipuuv/gojsonschema master
github.com/xeipuuv/gojsonreference master
github.com/xeipuuv/gojsonpointer master
+github.com/cyphar/filepath-securejoin v0.2.1
+github.com/containerd/console 84eeaae905fa414d03e07bcd6c8d3f19e7cf180e
diff --git a/vendor/github.com/containerd/console/LICENSE b/vendor/github.com/containerd/console/LICENSE
new file mode 100644
index 000000000..261eeb9e9
--- /dev/null
+++ b/vendor/github.com/containerd/console/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/vendor/github.com/containerd/console/README.md b/vendor/github.com/containerd/console/README.md
new file mode 100644
index 000000000..4c56d9d13
--- /dev/null
+++ b/vendor/github.com/containerd/console/README.md
@@ -0,0 +1,17 @@
+# console
+
+[![Build Status](https://travis-ci.org/containerd/console.svg?branch=master)](https://travis-ci.org/containerd/console)
+
+Golang package for dealing with consoles. Light on deps and a simple API.
+
+## Modifying the current process
+
+```go
+current := console.Current()
+defer current.Reset()
+
+if err := current.SetRaw(); err != nil {
+}
+ws, err := current.Size()
+current.Resize(ws)
+```
diff --git a/vendor/github.com/containerd/console/console.go b/vendor/github.com/containerd/console/console.go
new file mode 100644
index 000000000..bf2798fda
--- /dev/null
+++ b/vendor/github.com/containerd/console/console.go
@@ -0,0 +1,62 @@
+package console
+
+import (
+ "errors"
+ "io"
+ "os"
+)
+
+var ErrNotAConsole = errors.New("provided file is not a console")
+
+type Console interface {
+ io.Reader
+ io.Writer
+ io.Closer
+
+ // Resize resizes the console to the provided window size
+ Resize(WinSize) error
+ // ResizeFrom resizes the calling console to the size of the
+ // provided console
+ ResizeFrom(Console) error
+ // SetRaw sets the console in raw mode
+ SetRaw() error
+ // DisableEcho disables echo on the console
+ DisableEcho() error
+ // Reset restores the console to its original state
+ Reset() error
+ // Size returns the window size of the console
+ Size() (WinSize, error)
+ // Fd returns the console's file descriptor
+ Fd() uintptr
+ // Name returns the console's file name
+ Name() string
+}
+
+// WinSize specifies the window size of the console
+type WinSize struct {
+ // Height of the console
+ Height uint16
+ // Width of the console
+ Width uint16
+ x uint16
+ y uint16
+}
+
+// Current returns the current process's console
+func Current() Console {
+ c, err := ConsoleFromFile(os.Stdin)
+ if err != nil {
+ // stdin should always be a console for the design
+ // of this function
+ panic(err)
+ }
+ return c
+}
+
+// ConsoleFromFile returns a console using the provided file
+func ConsoleFromFile(f *os.File) (Console, error) {
+ if err := checkConsole(f); err != nil {
+ return nil, err
+ }
+ return newMaster(f)
+}
diff --git a/vendor/github.com/containerd/console/console_linux.go b/vendor/github.com/containerd/console/console_linux.go
new file mode 100644
index 000000000..c96372929
--- /dev/null
+++ b/vendor/github.com/containerd/console/console_linux.go
@@ -0,0 +1,255 @@
+// +build linux
+
+package console
+
+import (
+ "io"
+ "os"
+ "sync"
+
+ "golang.org/x/sys/unix"
+)
+
+const (
+ maxEvents = 128
+)
+
+// Epoller manages multiple epoll consoles using edge-triggered epoll api so we
+// don't have to deal with repeated wake-up of EPOLLERR or EPOLLHUP.
+// For more details, see:
+// - https://github.com/systemd/systemd/pull/4262
+// - https://github.com/moby/moby/issues/27202
+//
+// Example usage of Epoller and EpollConsole is as follows:
+//
+// epoller, _ := NewEpoller()
+// epollConsole, _ := epoller.Add(console)
+// go epoller.Wait()
+// var (
+// b bytes.Buffer
+// wg sync.WaitGroup
+// )
+// wg.Add(1)
+// go func() {
+// io.Copy(&b, epollConsole)
+// wg.Done()
+// }()
+// // perform I/O on the console
+// epollConsole.Shutdown(epoller.CloseConsole)
+// wg.Wait()
+// epollConsole.Close()
+type Epoller struct {
+ efd int
+ mu sync.Mutex
+ fdMapping map[int]*EpollConsole
+}
+
+// NewEpoller returns an instance of epoller with a valid epoll fd.
+func NewEpoller() (*Epoller, error) {
+ efd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC)
+ if err != nil {
+ return nil, err
+ }
+ return &Epoller{
+ efd: efd,
+ fdMapping: make(map[int]*EpollConsole),
+ }, nil
+}
+
+// Add creates an epoll console based on the provided console. The console will
+// be registered with EPOLLET (i.e. using edge-triggered notification) and its
+// file descriptor will be set to non-blocking mode. After this, user should use
+// the returned console to perform I/O.
+func (e *Epoller) Add(console Console) (*EpollConsole, error) {
+ sysfd := int(console.Fd())
+ // Set sysfd to non-blocking mode
+ if err := unix.SetNonblock(sysfd, true); err != nil {
+ return nil, err
+ }
+
+ ev := unix.EpollEvent{
+ Events: unix.EPOLLIN | unix.EPOLLOUT | unix.EPOLLRDHUP | unix.EPOLLET,
+ Fd: int32(sysfd),
+ }
+ if err := unix.EpollCtl(e.efd, unix.EPOLL_CTL_ADD, sysfd, &ev); err != nil {
+ return nil, err
+ }
+ ef := &EpollConsole{
+ Console: console,
+ sysfd: sysfd,
+ readc: sync.NewCond(&sync.Mutex{}),
+ writec: sync.NewCond(&sync.Mutex{}),
+ }
+ e.mu.Lock()
+ e.fdMapping[sysfd] = ef
+ e.mu.Unlock()
+ return ef, nil
+}
+
+// Wait starts the loop to wait for its consoles' notifications and signal
+// appropriate console that it can perform I/O.
+func (e *Epoller) Wait() error {
+ events := make([]unix.EpollEvent, maxEvents)
+ for {
+ n, err := unix.EpollWait(e.efd, events, -1)
+ if err != nil {
+ // EINTR: The call was interrupted by a signal handler before either
+ // any of the requested events occurred or the timeout expired
+ if err == unix.EINTR {
+ continue
+ }
+ return err
+ }
+ for i := 0; i < n; i++ {
+ ev := &events[i]
+ // the console is ready to be read from
+ if ev.Events&(unix.EPOLLIN|unix.EPOLLHUP|unix.EPOLLERR) != 0 {
+ if epfile := e.getConsole(int(ev.Fd)); epfile != nil {
+ epfile.signalRead()
+ }
+ }
+ // the console is ready to be written to
+ if ev.Events&(unix.EPOLLOUT|unix.EPOLLHUP|unix.EPOLLERR) != 0 {
+ if epfile := e.getConsole(int(ev.Fd)); epfile != nil {
+ epfile.signalWrite()
+ }
+ }
+ }
+ }
+}
+
+// CloseConsole unregisters the console's file descriptor from the epoll interface
+func (e *Epoller) CloseConsole(fd int) error {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ delete(e.fdMapping, fd)
+ return unix.EpollCtl(e.efd, unix.EPOLL_CTL_DEL, fd, &unix.EpollEvent{})
+}
+
+func (e *Epoller) getConsole(sysfd int) *EpollConsole {
+ e.mu.Lock()
+ f := e.fdMapping[sysfd]
+ e.mu.Unlock()
+ return f
+}
+
+// Close the epoll fd
+func (e *Epoller) Close() error {
+ return unix.Close(e.efd)
+}
+
+// EpollConsole acts like a console but registers its file descriptor with an
+// epoll fd and uses epoll API to perform I/O.
+type EpollConsole struct {
+ Console
+ readc *sync.Cond
+ writec *sync.Cond
+ sysfd int
+ closed bool
+}
+
+// Read reads up to len(p) bytes into p. It returns the number of bytes read
+// (0 <= n <= len(p)) and any error encountered.
+//
+// If the console's read returns EAGAIN or EIO, we assume that it's a
+// temporary error because the other side went away and wait for the signal
+// generated by epoll event to continue.
+func (ec *EpollConsole) Read(p []byte) (n int, err error) {
+ var read int
+ ec.readc.L.Lock()
+ defer ec.readc.L.Unlock()
+ for {
+ read, err = ec.Console.Read(p[n:])
+ n += read
+ if err != nil {
+ var hangup bool
+ if perr, ok := err.(*os.PathError); ok {
+ hangup = (perr.Err == unix.EAGAIN || perr.Err == unix.EIO)
+ } else {
+ hangup = (err == unix.EAGAIN || err == unix.EIO)
+ }
+ // if the other end disappears, assume this is temporary and wait for the
+ // signal to continue again. Unless we didn't read anything and the
+ // console is already marked as closed, in which case we should exit
+ if hangup && !(n == 0 && len(p) > 0 && ec.closed) {
+ ec.readc.Wait()
+ continue
+ }
+ }
+ break
+ }
+ // if we didn't read anything then return io.EOF to end gracefully
+ if n == 0 && len(p) > 0 && err == nil {
+ err = io.EOF
+ }
+ // signal for others that we finished the read
+ ec.readc.Signal()
+ return n, err
+}
+
+// Write writes len(p) bytes from p to the console. It returns the number of bytes
+// written from p (0 <= n <= len(p)) and any error encountered that caused
+// the write to stop early.
+//
+// If a write to the console returns EAGAIN or EIO, we assume that it's a
+// temporary error because the other side went away and wait for the signal
+// generated by epoll event to continue.
+func (ec *EpollConsole) Write(p []byte) (n int, err error) {
+ var written int
+ ec.writec.L.Lock()
+ defer ec.writec.L.Unlock()
+ for {
+ written, err = ec.Console.Write(p[n:])
+ n += written
+ if err != nil {
+ var hangup bool
+ if perr, ok := err.(*os.PathError); ok {
+ hangup = (perr.Err == unix.EAGAIN || perr.Err == unix.EIO)
+ } else {
+ hangup = (err == unix.EAGAIN || err == unix.EIO)
+ }
+ // if the other end disappears, assume this is temporary and wait for the
+ // signal to continue again.
+ if hangup {
+ ec.writec.Wait()
+ continue
+ }
+ }
+ // unrecoverable error, break the loop and return the error
+ break
+ }
+ if n < len(p) && err == nil {
+ err = io.ErrShortWrite
+ }
+ // signal for others that we finished the write
+ ec.writec.Signal()
+ return n, err
+}
+
+// Shutdown signals all waiters for this fd and marks the console as closed.
+// It accepts a callback which will be called with the console's fd. The
+// callback typically will be used to do further cleanup such as unregistering the
+// console's fd from the epoll interface.
+// Users should call Shutdown and wait for all I/O operations to finish
+// before closing the console.
+func (ec *EpollConsole) Shutdown(close func(int) error) error {
+ ec.readc.L.Lock()
+ defer ec.readc.L.Unlock()
+ ec.writec.L.Lock()
+ defer ec.writec.L.Unlock()
+
+ ec.readc.Broadcast()
+ ec.writec.Broadcast()
+ ec.closed = true
+ return close(ec.sysfd)
+}
+
+// signalRead signals that the console is readable.
+func (ec *EpollConsole) signalRead() {
+ ec.readc.Signal()
+}
+
+// signalWrite signals that the console is writable.
+func (ec *EpollConsole) signalWrite() {
+ ec.writec.Signal()
+}
diff --git a/vendor/github.com/containerd/console/console_unix.go b/vendor/github.com/containerd/console/console_unix.go
new file mode 100644
index 000000000..118c8c3ab
--- /dev/null
+++ b/vendor/github.com/containerd/console/console_unix.go
@@ -0,0 +1,142 @@
+// +build darwin freebsd linux solaris
+
+package console
+
+import (
+ "os"
+
+ "golang.org/x/sys/unix"
+)
+
+// NewPty creates a new pty pair.
+// The master is returned as the first console and a string
+// with the path to the pty slave is returned as the second.
+// If any step after opening /dev/ptmx fails, the master file is closed
+// before the error is returned so that the descriptor is not leaked.
+func NewPty() (Console, string, error) {
+	f, err := os.OpenFile("/dev/ptmx", unix.O_RDWR|unix.O_NOCTTY|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return nil, "", err
+	}
+	slave, err := ptsname(f)
+	if err != nil {
+		f.Close()
+		return nil, "", err
+	}
+	if err := unlockpt(f); err != nil {
+		f.Close()
+		return nil, "", err
+	}
+	m, err := newMaster(f)
+	if err != nil {
+		f.Close()
+		return nil, "", err
+	}
+	return m, slave, nil
+}
+
+// master is the unix Console implementation backed by an open file.
+type master struct {
+ // f is the underlying file that all I/O and ioctls are issued against.
+ f *os.File
+ // original holds the termios settings restored by Reset; nil means
+ // there is nothing to restore.
+ original *unix.Termios
+}
+
+// Read reads from the underlying file.
+func (m *master) Read(b []byte) (int, error) {
+ return m.f.Read(b)
+}
+
+// Write writes to the underlying file.
+func (m *master) Write(b []byte) (int, error) {
+ return m.f.Write(b)
+}
+
+// Close closes the underlying file.
+func (m *master) Close() error {
+ return m.f.Close()
+}
+
+// Resize applies the window size ws to the underlying file descriptor.
+func (m *master) Resize(ws WinSize) error {
+ return tcswinsz(m.f.Fd(), ws)
+}
+
+// ResizeFrom reads the window size of c and applies it to this console.
+func (m *master) ResizeFrom(c Console) error {
+ ws, err := c.Size()
+ if err != nil {
+ return err
+ }
+ return m.Resize(ws)
+}
+
+// Reset restores the saved termios settings. It is a no-op when no settings
+// were saved.
+func (m *master) Reset() error {
+ if m.original == nil {
+ return nil
+ }
+ return tcset(m.f.Fd(), m.original)
+}
+
+// getCurrent returns the termios settings currently in effect on the file
+// descriptor, or a zero Termios and the error when they cannot be read.
+func (m *master) getCurrent() (unix.Termios, error) {
+ var termios unix.Termios
+ if err := tcget(m.f.Fd(), &termios); err != nil {
+ return unix.Termios{}, err
+ }
+ return termios, nil
+}
+
+// SetRaw switches the terminal to the settings produced by cfmakeraw, with
+// output post-processing (OPOST) turned back on.
+func (m *master) SetRaw() error {
+ rawState, err := m.getCurrent()
+ if err != nil {
+ return err
+ }
+ rawState = cfmakeraw(rawState)
+ // cfmakeraw clears OPOST; re-enable it before applying the settings.
+ rawState.Oflag = rawState.Oflag | unix.OPOST
+ return tcset(m.f.Fd(), &rawState)
+}
+
+// DisableEcho clears the ECHO local flag on the current termios settings.
+func (m *master) DisableEcho() error {
+ rawState, err := m.getCurrent()
+ if err != nil {
+ return err
+ }
+ rawState.Lflag = rawState.Lflag &^ unix.ECHO
+ return tcset(m.f.Fd(), &rawState)
+}
+
+// Size returns the window size reported for the underlying file descriptor.
+func (m *master) Size() (WinSize, error) {
+ return tcgwinsz(m.f.Fd())
+}
+
+// Fd returns the descriptor of the underlying file.
+func (m *master) Fd() uintptr {
+ return m.f.Fd()
+}
+
+// Name returns the name of the underlying file.
+func (m *master) Name() string {
+ return m.f.Name()
+}
+
+// checkConsole checks if the provided file is a console by attempting to read
+// its termios settings. Any failure is reported as ErrNotAConsole; the
+// underlying error is discarded.
+func checkConsole(f *os.File) error {
+ var termios unix.Termios
+ if tcget(f.Fd(), &termios) != nil {
+ return ErrNotAConsole
+ }
+ return nil
+}
+
+// newMaster wraps f in a master and records its current termios settings so
+// that Reset can restore them later. It fails if the settings cannot be read.
+func newMaster(f *os.File) (Console, error) {
+ m := &master{
+ f: f,
+ }
+ t, err := m.getCurrent()
+ if err != nil {
+ return nil, err
+ }
+ m.original = &t
+ return m, nil
+}
+
+// ClearONLCR sets the necessary tty_ioctl(4)s to ensure that a pty pair
+// created by us acts normally. In particular, a not-very-well-known default of
+// Linux unix98 ptys is that they have +onlcr by default. While this isn't a
+// problem for terminal emulators, because we relay data from the terminal we
+// also relay that funky line discipline.
+func ClearONLCR(fd uintptr) error {
+ return setONLCR(fd, false)
+}
+
+// SetONLCR sets the necessary tty_ioctl(4)s to ensure that a pty pair
+// created by us acts as intended for a terminal emulator.
+func SetONLCR(fd uintptr) error {
+ return setONLCR(fd, true)
+}
diff --git a/vendor/github.com/containerd/console/console_windows.go b/vendor/github.com/containerd/console/console_windows.go
new file mode 100644
index 000000000..d78a0b841
--- /dev/null
+++ b/vendor/github.com/containerd/console/console_windows.go
@@ -0,0 +1,200 @@
+package console
+
+import (
+ "fmt"
+ "os"
+
+ "github.com/pkg/errors"
+ "golang.org/x/sys/windows"
+)
+
+var (
+ vtInputSupported bool
+ ErrNotImplemented = errors.New("not implemented")
+)
+
+// initStdios records the handles of os.Stdin, os.Stdout and os.Stderr together
+// with their current console modes. It also probes whether virtual terminal
+// input is accepted on stdin and tries to enable virtual terminal processing
+// on stdout and stderr. A failure to read a console mode is printed to stdout
+// and otherwise ignored.
+func (m *master) initStdios() {
+ m.in = windows.Handle(os.Stdin.Fd())
+ if err := windows.GetConsoleMode(m.in, &m.inMode); err == nil {
+ // Validate that windows.ENABLE_VIRTUAL_TERMINAL_INPUT is supported, but do not set it.
+ if err = windows.SetConsoleMode(m.in, m.inMode|windows.ENABLE_VIRTUAL_TERMINAL_INPUT); err == nil {
+ vtInputSupported = true
+ }
+ // Unconditionally set the console mode back even on failure because SetConsoleMode
+ // remembers invalid bits on input handles.
+ windows.SetConsoleMode(m.in, m.inMode)
+ } else {
+ fmt.Printf("failed to get console mode for stdin: %v\n", err)
+ }
+
+ m.out = windows.Handle(os.Stdout.Fd())
+ if err := windows.GetConsoleMode(m.out, &m.outMode); err == nil {
+ // Try to enable VT processing; on success remember it in outMode so
+ // later mode changes keep it, otherwise put the previous mode back.
+ if err := windows.SetConsoleMode(m.out, m.outMode|windows.ENABLE_VIRTUAL_TERMINAL_PROCESSING); err == nil {
+ m.outMode |= windows.ENABLE_VIRTUAL_TERMINAL_PROCESSING
+ } else {
+ windows.SetConsoleMode(m.out, m.outMode)
+ }
+ } else {
+ fmt.Printf("failed to get console mode for stdout: %v\n", err)
+ }
+
+ m.err = windows.Handle(os.Stderr.Fd())
+ if err := windows.GetConsoleMode(m.err, &m.errMode); err == nil {
+ // Same handling as stdout, applied to stderr.
+ if err := windows.SetConsoleMode(m.err, m.errMode|windows.ENABLE_VIRTUAL_TERMINAL_PROCESSING); err == nil {
+ m.errMode |= windows.ENABLE_VIRTUAL_TERMINAL_PROCESSING
+ } else {
+ windows.SetConsoleMode(m.err, m.errMode)
+ }
+ } else {
+ fmt.Printf("failed to get console mode for stderr: %v\n", err)
+ }
+}
+
+// master is the Windows Console implementation. It holds the three standard
+// handles and the console mode recorded for each by initStdios.
+type master struct {
+ // in is the stdin handle and inMode its recorded console mode.
+ in windows.Handle
+ inMode uint32
+
+ // out is the stdout handle and outMode its recorded console mode.
+ out windows.Handle
+ outMode uint32
+
+ // err is the stderr handle and errMode its recorded console mode.
+ err windows.Handle
+ errMode uint32
+}
+
+// SetRaw puts stdin into raw mode via makeInputRaw and requests
+// DISABLE_NEWLINE_AUTO_RETURN on stdout and stderr. Only a failure on stdin
+// is returned.
+func (m *master) SetRaw() error {
+ if err := makeInputRaw(m.in, m.inMode); err != nil {
+ return err
+ }
+
+ // Set StdOut and StdErr to raw mode, we ignore failures since
+ // windows.DISABLE_NEWLINE_AUTO_RETURN might not be supported on this version of
+ // Windows.
+
+ windows.SetConsoleMode(m.out, m.outMode|windows.DISABLE_NEWLINE_AUTO_RETURN)
+
+ windows.SetConsoleMode(m.err, m.errMode|windows.DISABLE_NEWLINE_AUTO_RETURN)
+
+ return nil
+}
+
+// Reset re-applies the recorded console modes to stdin, stdout and stderr, in
+// that order, and returns a wrapped error at the first handle that fails.
+func (m *master) Reset() error {
+ for _, s := range []struct {
+ fd windows.Handle
+ mode uint32
+ }{
+ {m.in, m.inMode},
+ {m.out, m.outMode},
+ {m.err, m.errMode},
+ } {
+ if err := windows.SetConsoleMode(s.fd, s.mode); err != nil {
+ return errors.Wrap(err, "unable to restore console mode")
+ }
+ }
+
+ return nil
+}
+
+// Size returns the dimensions of the window rectangle reported in the stdout
+// screen buffer info.
+func (m *master) Size() (WinSize, error) {
+ var info windows.ConsoleScreenBufferInfo
+ err := windows.GetConsoleScreenBufferInfo(m.out, &info)
+ if err != nil {
+ return WinSize{}, errors.Wrap(err, "unable to get console info")
+ }
+
+ // The window coordinates are inclusive, hence the +1.
+ winsize := WinSize{
+ Width: uint16(info.Window.Right - info.Window.Left + 1),
+ Height: uint16(info.Window.Bottom - info.Window.Top + 1),
+ }
+
+ return winsize, nil
+}
+
+// Resize is not implemented on Windows and always returns ErrNotImplemented.
+func (m *master) Resize(ws WinSize) error {
+ return ErrNotImplemented
+}
+
+// ResizeFrom is not implemented on Windows and always returns
+// ErrNotImplemented.
+func (m *master) ResizeFrom(c Console) error {
+ return ErrNotImplemented
+}
+
+// DisableEcho sets the stdin console mode to the recorded mode with
+// ENABLE_ECHO_INPUT cleared and ENABLE_PROCESSED_INPUT and ENABLE_LINE_INPUT
+// set.
+func (m *master) DisableEcho() error {
+ mode := m.inMode &^ windows.ENABLE_ECHO_INPUT
+ mode |= windows.ENABLE_PROCESSED_INPUT
+ mode |= windows.ENABLE_LINE_INPUT
+
+ if err := windows.SetConsoleMode(m.in, mode); err != nil {
+ return errors.Wrap(err, "unable to set console to disable echo")
+ }
+
+ return nil
+}
+
+// Close is a no-op on Windows and always returns nil.
+func (m *master) Close() error {
+ return nil
+}
+
+// Read is not implemented on Windows; calling it panics.
+func (m *master) Read(b []byte) (int, error) {
+ panic("not implemented on windows")
+}
+
+// Write is not implemented on Windows; calling it panics.
+func (m *master) Write(b []byte) (int, error) {
+ panic("not implemented on windows")
+}
+
+// Fd returns the stdin handle as a uintptr.
+func (m *master) Fd() uintptr {
+ return uintptr(m.in)
+}
+
+// Name returns the fixed string "console". On Windows a console can only be
+// made from os.Std{in,out,err}, so there is no single file name to report;
+// a dummy value is sufficient in this case.
+func (m *master) Name() string {
+ return "console"
+}
+
+// makeInputRaw puts the terminal (Windows Console) connected to the given
+// file descriptor into raw mode
+func makeInputRaw(fd windows.Handle, mode uint32) error {
+ // See
+ // -- https://msdn.microsoft.com/en-us/library/windows/desktop/ms686033(v=vs.85).aspx
+ // -- https://msdn.microsoft.com/en-us/library/windows/desktop/ms683462(v=vs.85).aspx
+
+ // Disable these modes
+ mode &^= windows.ENABLE_ECHO_INPUT
+ mode &^= windows.ENABLE_LINE_INPUT
+ mode &^= windows.ENABLE_MOUSE_INPUT
+ mode &^= windows.ENABLE_WINDOW_INPUT
+ mode &^= windows.ENABLE_PROCESSED_INPUT
+
+ // Enable these modes
+ mode |= windows.ENABLE_EXTENDED_FLAGS
+ mode |= windows.ENABLE_INSERT_MODE
+ mode |= windows.ENABLE_QUICK_EDIT_MODE
+
+ if vtInputSupported {
+ mode |= windows.ENABLE_VIRTUAL_TERMINAL_INPUT
+ }
+
+ if err := windows.SetConsoleMode(fd, mode); err != nil {
+ return errors.Wrap(err, "unable to set console to raw mode")
+ }
+
+ return nil
+}
+
+// checkConsole checks if the provided file is a console by querying its
+// console mode. The error from GetConsoleMode is returned unchanged.
+func checkConsole(f *os.File) error {
+ var mode uint32
+ if err := windows.GetConsoleMode(windows.Handle(f.Fd()), &mode); err != nil {
+ return err
+ }
+ return nil
+}
+
+// newMaster returns a Console for the process's standard handles. f must be
+// os.Stdin, os.Stdout or os.Stderr; any other file is rejected. The returned
+// master is initialised from all three standard handles regardless of which
+// one was passed.
+func newMaster(f *os.File) (Console, error) {
+ if f != os.Stdin && f != os.Stdout && f != os.Stderr {
+ return nil, errors.New("creating a console from a file is not supported on windows")
+ }
+ m := &master{}
+ m.initStdios()
+ return m, nil
+}
diff --git a/vendor/github.com/containerd/console/tc_darwin.go b/vendor/github.com/containerd/console/tc_darwin.go
new file mode 100644
index 000000000..b102bad74
--- /dev/null
+++ b/vendor/github.com/containerd/console/tc_darwin.go
@@ -0,0 +1,37 @@
+package console
+
+import (
+ "fmt"
+ "os"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+)
+
+const (
+ cmdTcGet = unix.TIOCGETA
+ cmdTcSet = unix.TIOCSETA
+)
+
+// ioctl issues the ioctl system call with the given request (flag) and
+// argument (data) on fd. A non-zero errno is returned as the error; otherwise
+// nil is returned.
+func ioctl(fd, flag, data uintptr) error {
+ if _, _, err := unix.Syscall(unix.SYS_IOCTL, fd, flag, data); err != 0 {
+ return err
+ }
+ return nil
+}
+
+// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
+// unlockpt should be called before opening the slave side of a pty.
+func unlockpt(f *os.File) error {
+ var u int32
+ return ioctl(f.Fd(), unix.TIOCPTYUNLK, uintptr(unsafe.Pointer(&u)))
+}
+
+// ptsname retrieves the name of the first available pts for the given master.
+func ptsname(f *os.File) (string, error) {
+ n, err := unix.IoctlGetInt(int(f.Fd()), unix.TIOCPTYGNAME)
+ if err != nil {
+ return "", err
+ }
+ return fmt.Sprintf("/dev/pts/%d", n), nil
+}
diff --git a/vendor/github.com/containerd/console/tc_freebsd.go b/vendor/github.com/containerd/console/tc_freebsd.go
new file mode 100644
index 000000000..e2a10e441
--- /dev/null
+++ b/vendor/github.com/containerd/console/tc_freebsd.go
@@ -0,0 +1,29 @@
+package console
+
+import (
+ "fmt"
+ "os"
+
+ "golang.org/x/sys/unix"
+)
+
+const (
+ cmdTcGet = unix.TIOCGETA
+ cmdTcSet = unix.TIOCSETA
+)
+
+// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
+// unlockpt should be called before opening the slave side of a pty.
+// This does not exist on FreeBSD, it does not allocate controlling terminals on open
+func unlockpt(f *os.File) error {
+ return nil
+}
+
+// ptsname retrieves the name of the first available pts for the given master.
+func ptsname(f *os.File) (string, error) {
+ n, err := unix.IoctlGetInt(int(f.Fd()), unix.TIOCGPTN)
+ if err != nil {
+ return "", err
+ }
+ return fmt.Sprintf("/dev/pts/%d", n), nil
+}
diff --git a/vendor/github.com/containerd/console/tc_linux.go b/vendor/github.com/containerd/console/tc_linux.go
new file mode 100644
index 000000000..80ef2f6fb
--- /dev/null
+++ b/vendor/github.com/containerd/console/tc_linux.go
@@ -0,0 +1,37 @@
+package console
+
+import (
+ "fmt"
+ "os"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+)
+
+const (
+ cmdTcGet = unix.TCGETS
+ cmdTcSet = unix.TCSETS
+)
+
+// ioctl issues the ioctl system call with the given request (flag) and
+// argument (data) on fd. A non-zero errno is returned as the error; otherwise
+// nil is returned.
+func ioctl(fd, flag, data uintptr) error {
+ if _, _, err := unix.Syscall(unix.SYS_IOCTL, fd, flag, data); err != 0 {
+ return err
+ }
+ return nil
+}
+
+// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
+// unlockpt should be called before opening the slave side of a pty.
+func unlockpt(f *os.File) error {
+ // u is left at its zero value; TIOCSPTLCK is documented to remove the
+ // lock when the pointed-to integer is zero.
+ var u int32
+ return ioctl(f.Fd(), unix.TIOCSPTLCK, uintptr(unsafe.Pointer(&u)))
+}
+
+// ptsname retrieves the name of the first available pts for the given master.
+func ptsname(f *os.File) (string, error) {
+ n, err := unix.IoctlGetInt(int(f.Fd()), unix.TIOCGPTN)
+ if err != nil {
+ return "", err
+ }
+ return fmt.Sprintf("/dev/pts/%d", n), nil
+}
diff --git a/vendor/github.com/containerd/console/tc_solaris_cgo.go b/vendor/github.com/containerd/console/tc_solaris_cgo.go
new file mode 100644
index 000000000..f8066d8e3
--- /dev/null
+++ b/vendor/github.com/containerd/console/tc_solaris_cgo.go
@@ -0,0 +1,35 @@
+// +build solaris,cgo
+
+package console
+
+import (
+ "os"
+
+ "golang.org/x/sys/unix"
+)
+
+//#include <stdlib.h>
+import "C"
+
+const (
+ cmdTcGet = unix.TCGETS
+ cmdTcSet = unix.TCSETS
+)
+
+// ptsname retrieves the name of the first available pts for the given master.
+func ptsname(f *os.File) (string, error) {
+ ptspath, err := C.ptsname(C.int(f.Fd()))
+ if err != nil {
+ return "", err
+ }
+ return C.GoString(ptspath), nil
+}
+
+// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
+// unlockpt should be called before opening the slave side of a pty.
+// NOTE(review): the body only calls C.grantpt; the C unlockpt function is
+// never invoked here. Confirm that grantpt alone is sufficient on Solaris.
+func unlockpt(f *os.File) error {
+ if _, err := C.grantpt(C.int(f.Fd())); err != nil {
+ return err
+ }
+ return nil
+}
diff --git a/vendor/github.com/containerd/console/tc_solaris_nocgo.go b/vendor/github.com/containerd/console/tc_solaris_nocgo.go
new file mode 100644
index 000000000..0aefa0d2b
--- /dev/null
+++ b/vendor/github.com/containerd/console/tc_solaris_nocgo.go
@@ -0,0 +1,31 @@
+// +build solaris,!cgo
+
+//
+// Implementing the functions below requires cgo support. Non-cgo stubs
+// versions are defined below to enable cross-compilation of source code
+// that depends on these functions, but the resultant cross-compiled
+// binaries cannot actually be used. If the stub function(s) below are
+// actually invoked they will display an error message and cause the
+// calling process to exit.
+//
+
+package console
+
+import (
+ "os"
+
+ "golang.org/x/sys/unix"
+)
+
+const (
+ cmdTcGet = unix.TCGETS
+ cmdTcSet = unix.TCSETS
+)
+
+// ptsname is a stub for Solaris builds without cgo; it always panics.
+func ptsname(f *os.File) (string, error) {
+ panic("ptsname() support requires cgo.")
+}
+
+// unlockpt is a stub for Solaris builds without cgo; it always panics.
+func unlockpt(f *os.File) error {
+ panic("unlockpt() support requires cgo.")
+}
diff --git a/vendor/github.com/containerd/console/tc_unix.go b/vendor/github.com/containerd/console/tc_unix.go
new file mode 100644
index 000000000..df7dcb933
--- /dev/null
+++ b/vendor/github.com/containerd/console/tc_unix.go
@@ -0,0 +1,75 @@
+// +build darwin freebsd linux solaris
+
+package console
+
+import (
+ "golang.org/x/sys/unix"
+)
+
+// tcget reads the termios settings of fd using the platform's cmdTcGet
+// request and copies them into *p. On error *p is left untouched.
+func tcget(fd uintptr, p *unix.Termios) error {
+ termios, err := unix.IoctlGetTermios(int(fd), cmdTcGet)
+ if err != nil {
+ return err
+ }
+ *p = *termios
+ return nil
+}
+
+// tcset applies the termios settings in p to fd using the platform's cmdTcSet
+// request.
+func tcset(fd uintptr, p *unix.Termios) error {
+ return unix.IoctlSetTermios(int(fd), cmdTcSet, p)
+}
+
+// tcgwinsz queries the window size of fd with TIOCGWINSZ and returns it as a
+// WinSize. On error the zero WinSize is returned alongside the error.
+func tcgwinsz(fd uintptr) (WinSize, error) {
+ var ws WinSize
+
+ uws, err := unix.IoctlGetWinsize(int(fd), unix.TIOCGWINSZ)
+ if err != nil {
+ return ws, err
+ }
+
+ // Translate from unix.Winsize to console.WinSize
+ ws.Height = uws.Row
+ ws.Width = uws.Col
+ ws.x = uws.Xpixel
+ ws.y = uws.Ypixel
+ return ws, nil
+}
+
+// tcswinsz sets the window size of fd to ws with TIOCSWINSZ.
+func tcswinsz(fd uintptr, ws WinSize) error {
+ // Translate from console.WinSize to unix.Winsize
+
+ var uws unix.Winsize
+ uws.Row = ws.Height
+ uws.Col = ws.Width
+ uws.Xpixel = ws.x
+ uws.Ypixel = ws.y
+
+ return unix.IoctlSetWinsize(int(fd), unix.TIOCSWINSZ, &uws)
+}
+
+// setONLCR reads the termios settings of fd, sets or clears the ONLCR output
+// flag according to enable, and writes the settings back.
+func setONLCR(fd uintptr, enable bool) error {
+ var termios unix.Termios
+ if err := tcget(fd, &termios); err != nil {
+ return err
+ }
+ if enable {
+ // Set +onlcr so we can act like a real terminal
+ termios.Oflag |= unix.ONLCR
+ } else {
+ // Set -onlcr so we don't have to deal with \r.
+ termios.Oflag &^= unix.ONLCR
+ }
+ return tcset(fd, &termios)
+}
+
+// cfmakeraw returns a copy of t adjusted for raw mode, following the flag
+// changes documented for cfmakeraw(3): input translation, output
+// post-processing, echo, canonical mode, signal generation and parity are
+// switched off, the character size is set to 8 bits, and VMIN/VTIME are set
+// to 1 and 0.
+func cfmakeraw(t unix.Termios) unix.Termios {
+	t.Iflag &^= (unix.IGNBRK | unix.BRKINT | unix.PARMRK | unix.ISTRIP | unix.INLCR | unix.IGNCR | unix.ICRNL | unix.IXON)
+	t.Oflag &^= unix.OPOST
+	t.Lflag &^= (unix.ECHO | unix.ECHONL | unix.ICANON | unix.ISIG | unix.IEXTEN)
+	// Clear the character-size mask and parity, then select 8-bit characters.
+	// CS8 has to be set here: clearing it again after CSIZE has been cleared
+	// is a no-op and leaves no 8-bit size selected.
+	t.Cflag &^= (unix.CSIZE | unix.PARENB)
+	t.Cflag |= unix.CS8
+	t.Cc[unix.VMIN] = 1
+	t.Cc[unix.VTIME] = 0
+
+	return t
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/LICENSE b/vendor/github.com/cyphar/filepath-securejoin/LICENSE
new file mode 100644
index 000000000..bec842f29
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/LICENSE
@@ -0,0 +1,28 @@
+Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
+Copyright (C) 2017 SUSE LLC. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/cyphar/filepath-securejoin/README.md b/vendor/github.com/cyphar/filepath-securejoin/README.md
new file mode 100644
index 000000000..49b2baa9f
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/README.md
@@ -0,0 +1,65 @@
+## `filepath-securejoin` ##
+
+[![Build Status](https://travis-ci.org/cyphar/filepath-securejoin.svg?branch=master)](https://travis-ci.org/cyphar/filepath-securejoin)
+
+An implementation of `SecureJoin`, a [candidate for inclusion in the Go
+standard library][go#20126]. The purpose of this function is to be a "secure"
+alternative to `filepath.Join`, and in particular it provides certain
+guarantees that are not provided by `filepath.Join`.
+
+This is the function prototype:
+
+```go
+func SecureJoin(root, unsafePath string) (string, error)
+```
+
+This library **guarantees** the following:
+
+* If no error is set, the resulting string **must** be a child path of
+ `root` and will not contain any symlink path components (they will all
+ be expanded).
+
+* When expanding symlinks, all symlink path components **must** be resolved
+ relative to the provided root. In particular, this can be considered a
+ userspace implementation of how `chroot(2)` operates on file paths. Note that
+ these symlinks will **not** be expanded lexically (`filepath.Clean` is not
+ called on the input before processing).
+
+* Non-existent path components are unaffected by `SecureJoin` (similar to
+ `filepath.EvalSymlinks`'s semantics).
+
+* The returned path will always be `filepath.Clean`ed and thus not contain any
+ `..` components.
+
+A (trivial) implementation of this function on GNU/Linux systems could be done
+with the following (note that this requires root privileges and is far more
+opaque than the implementation in this library, and also requires that
+`readlink` is inside the `root` path):
+
+```go
+package securejoin
+
+import (
+ "os/exec"
+ "path/filepath"
+)
+
+func SecureJoin(root, unsafePath string) (string, error) {
+ unsafePath = string(filepath.Separator) + unsafePath
+ cmd := exec.Command("chroot", root,
+ "readlink", "--canonicalize-missing", "--no-newline", unsafePath)
+ output, err := cmd.CombinedOutput()
+ if err != nil {
+ return "", err
+ }
+ expanded := string(output)
+ return filepath.Join(root, expanded), nil
+}
+```
+
+[go#20126]: https://github.com/golang/go/issues/20126
+
+### License ###
+
+The license of this project is the same as Go, which is a BSD 3-clause license
+available in the `LICENSE` file.
diff --git a/vendor/github.com/cyphar/filepath-securejoin/join.go b/vendor/github.com/cyphar/filepath-securejoin/join.go
new file mode 100644
index 000000000..f20985479
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/join.go
@@ -0,0 +1,135 @@
+// Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
+// Copyright (C) 2017 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package securejoin is an implementation of the hopefully-soon-to-be-included
+// SecureJoin helper that is meant to be part of the "path/filepath" package.
+// The purpose of this project is to provide a PoC implementation to make the
+// SecureJoin proposal (https://github.com/golang/go/issues/20126) more
+// tangible.
+package securejoin
+
+import (
+ "bytes"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+ "syscall"
+
+ "github.com/pkg/errors"
+)
+
+// ErrSymlinkLoop is returned by SecureJoinVFS when too many symlinks have been
+// evaluated in attempting to securely join the two given paths.
+var ErrSymlinkLoop = fmt.Errorf("SecureJoin: too many links")
+
+// IsNotExist tells you if err is an error that implies that either the path
+// accessed does not exist (or path components don't exist). This is
+// effectively a more broad version of os.IsNotExist: in addition to what
+// os.IsNotExist accepts, an ENOTDIR or ENOENT carried inside an
+// *os.PathError, *os.LinkError or *os.SyscallError also counts.
+func IsNotExist(err error) bool {
+ // If it's a bona fide ENOENT just bail.
+ if os.IsNotExist(errors.Cause(err)) {
+ return true
+ }
+
+ // Check that it's not actually an ENOTDIR, which in some cases is a more
+ // convoluted case of ENOENT (usually involving weird paths).
+ var errno error
+ // Unwrap the known os error wrappers; for any other type errno stays nil
+ // and the comparison below is false.
+ switch err := errors.Cause(err).(type) {
+ case *os.PathError:
+ errno = err.Err
+ case *os.LinkError:
+ errno = err.Err
+ case *os.SyscallError:
+ errno = err.Err
+ }
+ return errno == syscall.ENOTDIR || errno == syscall.ENOENT
+}
+
+// SecureJoinVFS joins the two given path components (similar to Join) except
+// that the returned path is guaranteed to be scoped inside the provided root
+// path (when evaluated). Any symbolic links in the path are evaluated with the
+// given root treated as the root of the filesystem, similar to a chroot. The
+// filesystem state is evaluated through the given VFS interface (if nil, the
+// standard os.* family of functions are used).
+//
+// Note that the guarantees provided by this function only apply if the path
+// components in the returned string are not modified (in other words are not
+// replaced with symlinks on the filesystem) after this function has returned.
+// Such a symlink race is necessarily out-of-scope of SecureJoin.
+func SecureJoinVFS(root, unsafePath string, vfs VFS) (string, error) {
+ // Use the os.* VFS implementation if none was specified.
+ if vfs == nil {
+ vfs = osVFS{}
+ }
+
+ var path bytes.Buffer
+ n := 0
+ for unsafePath != "" {
+ if n > 255 {
+ return "", ErrSymlinkLoop
+ }
+
+ // Next path component, p.
+ i := strings.IndexRune(unsafePath, filepath.Separator)
+ var p string
+ if i == -1 {
+ p, unsafePath = unsafePath, ""
+ } else {
+ p, unsafePath = unsafePath[:i], unsafePath[i+1:]
+ }
+
+ // Create a cleaned path, using the lexical semantics of /../a, to
+ // create a "scoped" path component which can safely be joined to fullP
+ // for evaluation. At this point, path.String() doesn't contain any
+ // symlink components.
+ cleanP := filepath.Clean(string(filepath.Separator) + path.String() + p)
+ if cleanP == string(filepath.Separator) {
+ path.Reset()
+ continue
+ }
+ fullP := filepath.Clean(root + cleanP)
+
+ // Figure out whether the path is a symlink.
+ fi, err := vfs.Lstat(fullP)
+ if err != nil && !IsNotExist(err) {
+ return "", err
+ }
+ // Treat non-existent path components the same as non-symlinks (we
+ // can't do any better here).
+ if IsNotExist(err) || fi.Mode()&os.ModeSymlink == 0 {
+ path.WriteString(p)
+ path.WriteRune(filepath.Separator)
+ continue
+ }
+
+ // Only increment when we actually dereference a link.
+ n++
+
+ // It's a symlink, expand it by prepending it to the yet-unparsed path.
+ dest, err := vfs.Readlink(fullP)
+ if err != nil {
+ return "", err
+ }
+ // Absolute symlinks reset any work we've already done.
+ if filepath.IsAbs(dest) {
+ path.Reset()
+ }
+ unsafePath = dest + string(filepath.Separator) + unsafePath
+ }
+
+ // We have to clean path.String() here because it may contain '..'
+ // components that are entirely lexical, but would be misleading otherwise.
+ // And finally do a final clean to ensure that root is also lexically
+ // clean.
+ fullP := filepath.Clean(string(filepath.Separator) + path.String())
+ return filepath.Clean(root + fullP), nil
+}
+
+// SecureJoin is a wrapper around SecureJoinVFS that just uses the os.* library
+// of functions as the VFS. If in doubt, use this function over SecureJoinVFS.
+func SecureJoin(root, unsafePath string) (string, error) {
+ return SecureJoinVFS(root, unsafePath, nil)
+}
diff --git a/vendor/github.com/cyphar/filepath-securejoin/vendor.conf b/vendor/github.com/cyphar/filepath-securejoin/vendor.conf
new file mode 100644
index 000000000..66bb574b9
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/vendor.conf
@@ -0,0 +1 @@
+github.com/pkg/errors v0.8.0
diff --git a/vendor/github.com/cyphar/filepath-securejoin/vfs.go b/vendor/github.com/cyphar/filepath-securejoin/vfs.go
new file mode 100644
index 000000000..a82a5eae1
--- /dev/null
+++ b/vendor/github.com/cyphar/filepath-securejoin/vfs.go
@@ -0,0 +1,41 @@
+// Copyright (C) 2017 SUSE LLC. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package securejoin
+
+import "os"
+
+// In future this should be moved into a separate package, because now there
+// are several projects (umoci and go-mtree) that are using this sort of
+// interface.
+
+// VFS is the minimal interface necessary to use SecureJoinVFS. A nil VFS is
+// equivalent to using the standard os.* family of functions. This is mainly
+// used for the purposes of mock testing, but also can be used to otherwise use
+// SecureJoin with VFS-like system.
+type VFS interface {
+ // Lstat returns a FileInfo describing the named file. If the file is a
+ // symbolic link, the returned FileInfo describes the symbolic link. Lstat
+ // makes no attempt to follow the link. These semantics are identical to
+ // os.Lstat.
+ Lstat(name string) (os.FileInfo, error)
+
+ // Readlink returns the destination of the named symbolic link. These
+ // semantics are identical to os.Readlink.
+ Readlink(name string) (string, error)
+}
+
+// osVFS is the "nil" VFS, in that it just passes everything through to the os
+// module.
+type osVFS struct{}
+
+// Lstat returns a FileInfo describing the named file. If the file is a
+// symbolic link, the returned FileInfo describes the symbolic link. Lstat
+// makes no attempt to follow the link. These semantics are identical to
+// os.Lstat.
+func (o osVFS) Lstat(name string) (os.FileInfo, error) { return os.Lstat(name) }
+
+// Readlink returns the destination of the named symbolic link. These
+// semantics are identical to os.Readlink.
+func (o osVFS) Readlink(name string) (string, error) { return os.Readlink(name) }
diff --git a/vendor/github.com/opencontainers/runc/README.md b/vendor/github.com/opencontainers/runc/README.md
index eabfb982b..3ca7a1a22 100644
--- a/vendor/github.com/opencontainers/runc/README.md
+++ b/vendor/github.com/opencontainers/runc/README.md
@@ -56,7 +56,7 @@ make BUILDTAGS='seccomp apparmor'
|-----------|------------------------------------|-------------|
| seccomp | Syscall filtering | libseccomp |
| selinux | selinux process and mount labeling | <none> |
-| apparmor | apparmor profile support | libapparmor |
+| apparmor | apparmor profile support | <none> |
| ambient | ambient capability support | kernel 4.3 |
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go
index 82ed1a68a..7fff0627f 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor.go
@@ -2,15 +2,10 @@
package apparmor
-// #cgo LDFLAGS: -lapparmor
-// #include <sys/apparmor.h>
-// #include <stdlib.h>
-import "C"
import (
"fmt"
"io/ioutil"
"os"
- "unsafe"
)
// IsEnabled returns true if apparmor is enabled for the host.
@@ -24,16 +19,36 @@ func IsEnabled() bool {
return false
}
+// setprocattr writes value to /proc/self/attr/<attr>. It returns the error
+// from opening the file or from the write; the error from closing the file
+// is not checked.
+func setprocattr(attr, value string) error {
+ // Under AppArmor you can only change your own attr, so use /proc/self/
+ // instead of /proc/<tid>/ like libapparmor does
+ path := fmt.Sprintf("/proc/self/attr/%s", attr)
+
+ f, err := os.OpenFile(path, os.O_WRONLY, 0)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ _, err = fmt.Fprintf(f, "%s", value)
+ return err
+}
+
+// changeOnExec reimplements aa_change_onexec from libapparmor in Go
+func changeOnExec(name string) error {
+ value := "exec " + name
+ if err := setprocattr("exec", value); err != nil {
+ return fmt.Errorf("apparmor failed to apply profile: %s", err)
+ }
+ return nil
+}
+
// ApplyProfile will apply the profile with the specified name to the process after
// the next exec.
func ApplyProfile(name string) error {
if name == "" {
return nil
}
- cName := C.CString(name)
- defer C.free(unsafe.Pointer(cName))
- if _, err := C.aa_change_onexec(cName); err != nil {
- return fmt.Errorf("apparmor failed to apply profile: %s", err)
- }
- return nil
+
+ return changeOnExec(name)
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go
index 22d82acb4..43bdccf3e 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go
@@ -145,8 +145,17 @@ func (m *Manager) Apply(pid int) (err error) {
m.Paths[sys.Name()] = p
if err := sys.Apply(d); err != nil {
+ if os.IsPermission(err) && m.Cgroups.Path == "" {
+ // If we didn't set a cgroup path, then let's defer the error here
+ // until we know whether we have set limits or not.
+ // If we hadn't set limits, then it's ok that we couldn't join this cgroup, because
+ // it will have the same limits as its parent.
+ delete(m.Paths, sys.Name())
+ continue
+ }
return err
}
+
}
return nil
}
@@ -198,6 +207,10 @@ func (m *Manager) Set(container *configs.Config) error {
for _, sys := range subsystems {
path := paths[sys.Name()]
if err := sys.Set(path, container.Cgroups); err != nil {
+ if path == "" {
+ // cgroup never applied
+ return fmt.Errorf("cannot set limits on the %s cgroup, as the container has not joined it", sys.Name())
+ }
return err
}
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go
index e70dfe3b9..4b19f8a97 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go
@@ -29,11 +29,15 @@ func (s *FreezerGroup) Apply(d *cgroupData) error {
func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Resources.Freezer {
case configs.Frozen, configs.Thawed:
- if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
- return err
- }
-
for {
+ // In case this loop does not exit because it doesn't get the expected
+ // state, let's write again this state, hoping it's going to be properly
+ // set this time. Otherwise, this loop could run infinitely, waiting for
+ // a state change that would never happen.
+ if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
+ return err
+ }
+
state, err := readFile(path, "freezer.state")
if err != nil {
return err
@@ -41,6 +45,7 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
break
}
+
time.Sleep(1 * time.Millisecond)
}
case configs.Undefined:
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go
deleted file mode 100644
index b1efbfd99..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/rootless/rootless.go
+++ /dev/null
@@ -1,128 +0,0 @@
-// +build linux
-
-package rootless
-
-import (
- "fmt"
-
- "github.com/opencontainers/runc/libcontainer/cgroups"
- "github.com/opencontainers/runc/libcontainer/cgroups/fs"
- "github.com/opencontainers/runc/libcontainer/configs"
- "github.com/opencontainers/runc/libcontainer/configs/validate"
-)
-
-// TODO: This is copied from libcontainer/cgroups/fs, which duplicates this code
-// needlessly. We should probably export this list.
-
-var subsystems = []subsystem{
- &fs.CpusetGroup{},
- &fs.DevicesGroup{},
- &fs.MemoryGroup{},
- &fs.CpuGroup{},
- &fs.CpuacctGroup{},
- &fs.PidsGroup{},
- &fs.BlkioGroup{},
- &fs.HugetlbGroup{},
- &fs.NetClsGroup{},
- &fs.NetPrioGroup{},
- &fs.PerfEventGroup{},
- &fs.FreezerGroup{},
- &fs.NameGroup{GroupName: "name=systemd"},
-}
-
-type subsystem interface {
- // Name returns the name of the subsystem.
- Name() string
-
- // Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
- GetStats(path string, stats *cgroups.Stats) error
-}
-
-// The noop cgroup manager is used for rootless containers, because we currently
-// cannot manage cgroups if we are in a rootless setup. This manager is chosen
-// by factory if we are in rootless mode. We error out if any cgroup options are
-// set in the config -- this may change in the future with upcoming kernel features
-// like the cgroup namespace.
-
-type Manager struct {
- Cgroups *configs.Cgroup
- Paths map[string]string
-}
-
-func (m *Manager) Apply(pid int) error {
- // If there are no cgroup settings, there's nothing to do.
- if m.Cgroups == nil {
- return nil
- }
-
- // We can't set paths.
- // TODO(cyphar): Implement the case where the runner of a rootless container
- // owns their own cgroup, which would allow us to set up a
- // cgroup for each path.
- if m.Cgroups.Paths != nil {
- return fmt.Errorf("cannot change cgroup path in rootless container")
- }
-
- // We load the paths into the manager.
- paths := make(map[string]string)
- for _, sys := range subsystems {
- name := sys.Name()
-
- path, err := cgroups.GetOwnCgroupPath(name)
- if err != nil {
- // Ignore paths we couldn't resolve.
- continue
- }
-
- paths[name] = path
- }
-
- m.Paths = paths
- return nil
-}
-
-func (m *Manager) GetPaths() map[string]string {
- return m.Paths
-}
-
-func (m *Manager) Set(container *configs.Config) error {
- // We have to re-do the validation here, since someone might decide to
- // update a rootless container.
- return validate.New().Validate(container)
-}
-
-func (m *Manager) GetPids() ([]int, error) {
- dir, err := cgroups.GetOwnCgroupPath("devices")
- if err != nil {
- return nil, err
- }
- return cgroups.GetPids(dir)
-}
-
-func (m *Manager) GetAllPids() ([]int, error) {
- dir, err := cgroups.GetOwnCgroupPath("devices")
- if err != nil {
- return nil, err
- }
- return cgroups.GetAllPids(dir)
-}
-
-func (m *Manager) GetStats() (*cgroups.Stats, error) {
- // TODO(cyphar): We can make this work if we figure out a way to allow usage
- // of cgroups with a rootless container. While this doesn't
- // actually require write access to a cgroup directory, the
- // statistics are not useful if they can be affected by
- // non-container processes.
- return nil, fmt.Errorf("cannot get cgroup stats in rootless container")
-}
-
-func (m *Manager) Freeze(state configs.FreezerState) error {
- // TODO(cyphar): We can make this work if we figure out a way to allow usage
- // of cgroups with a rootless container.
- return fmt.Errorf("cannot use freezer cgroup in rootless container")
-}
-
-func (m *Manager) Destroy() error {
- // We don't have to do anything here because we didn't do any setup.
- return nil
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go
index 7de9ae605..a65d8e443 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go
@@ -1,4 +1,4 @@
-// +build !linux
+// +build !linux static_build
package systemd
@@ -43,7 +43,7 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
}
func (m *Manager) Set(container *configs.Config) error {
- return nil, fmt.Errorf("Systemd not supported")
+ return fmt.Errorf("Systemd not supported")
}
func (m *Manager) Freeze(state configs.FreezerState) error {
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go
index b010b4b32..45bd3acce 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go
@@ -1,4 +1,4 @@
-// +build linux
+// +build linux,!static_build
package systemd
@@ -260,7 +260,7 @@ func (m *Manager) Apply(pid int) error {
if c.Resources.Memory != 0 {
properties = append(properties,
- newProp("MemoryLimit", c.Resources.Memory))
+ newProp("MemoryLimit", uint64(c.Resources.Memory)))
}
if c.Resources.CpuShares != 0 {
@@ -271,6 +271,13 @@ func (m *Manager) Apply(pid int) error {
// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
+ // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
+ // (integer percentage of CPU) internally. This means that if a fractional percent of
+ // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
+ // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
+ if cpuQuotaPerSecUSec%10000 != 0 {
+ cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
+ }
properties = append(properties,
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
}
@@ -288,10 +295,13 @@ func (m *Manager) Apply(pid int) error {
}
}
- if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil && !isUnitExists(err) {
+ // Buffered so a completion notification can be delivered even if nobody
+ // is receiving at that moment.
+ statusChan := make(chan string, 1)
+ if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
+ // Only wait for completion when a job was actually queued. On error
+ // (including the tolerated "unit already exists" case) nothing will be
+ // sent on statusChan, so an unconditional receive would block forever.
+ <-statusChan
+ } else if !isUnitExists(err) {
return err
}
+
if err := joinCgroups(c, pid); err != nil {
return err
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go
deleted file mode 100644
index c7bdf1f60..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/compat_1.5_linux.go
+++ /dev/null
@@ -1,10 +0,0 @@
-// +build linux,!go1.5
-
-package libcontainer
-
-import "syscall"
-
-// GidMappingsEnableSetgroups was added in Go 1.5, so do nothing when building
-// with earlier versions
-func enableSetgroups(sys *syscall.SysProcAttr) {
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go
deleted file mode 100644
index 95e2830a4..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go
+++ /dev/null
@@ -1,6 +0,0 @@
-// +build !windows,!linux,!freebsd
-
-package configs
-
-type Cgroup struct {
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
index 269fffff3..3cae4fd8d 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
@@ -187,6 +187,10 @@ type Config struct {
// Rootless specifies whether the container is a rootless container.
Rootless bool `json:"rootless"`
+
+ // IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into
+ // to limit the resources (e.g., L3 cache) the container has available
+ IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
}
type Hooks struct {
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go
index 4d348d217..e4f423c52 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go
@@ -1,4 +1,4 @@
-// +build linux freebsd
+// +build linux
package configs
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go
new file mode 100644
index 000000000..36bd5f96a
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go
@@ -0,0 +1,7 @@
+package configs
+
+type IntelRdt struct {
+ // The schema for L3 cache id and capacity bitmask (CBM)
+ // Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+ L3CacheSchema string `json:"l3_cache_schema,omitempty"`
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go
index 0cebfaf80..7a9f33b71 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go
@@ -21,13 +21,6 @@ func (v *ConfigValidator) rootless(config *configs.Config) error {
if err := rootlessMount(config); err != nil {
return err
}
- // Currently, cgroups cannot effectively be used in rootless containers.
- // The new cgroup namespace doesn't really help us either because it doesn't
- // have nice interactions with the user namespace (we're working with upstream
- // to fix this).
- if err := rootlessCgroup(config); err != nil {
- return err
- }
// XXX: We currently can't verify the user config at all, because
// configs.Config doesn't store the user-related configs. So this
@@ -36,37 +29,27 @@ func (v *ConfigValidator) rootless(config *configs.Config) error {
return nil
}
-func rootlessMappings(config *configs.Config) error {
- rootuid, err := config.HostRootUID()
- if err != nil {
- return fmt.Errorf("failed to get root uid from uidMappings: %v", err)
+// hasIDMapping reports whether id lies inside the container-side range
+// [ContainerID, ContainerID+Size) of at least one entry in mappings.
+// It returns false when mappings is empty.
+func hasIDMapping(id int, mappings []configs.IDMap) bool {
+ for _, m := range mappings {
+ if id >= m.ContainerID && id < m.ContainerID+m.Size {
+ return true
+ }
}
+ return false
+}
+
+func rootlessMappings(config *configs.Config) error {
if euid := geteuid(); euid != 0 {
if !config.Namespaces.Contains(configs.NEWUSER) {
return fmt.Errorf("rootless containers require user namespaces")
}
- if rootuid != euid {
- return fmt.Errorf("rootless containers cannot map container root to a different host user")
- }
- }
-
- rootgid, err := config.HostRootGID()
- if err != nil {
- return fmt.Errorf("failed to get root gid from gidMappings: %v", err)
}
- // Similar to the above test, we need to make sure that we aren't trying to
- // map to a group ID that we don't have the right to be.
- if rootgid != getegid() {
- return fmt.Errorf("rootless containers cannot map container root to a different host group")
+ if len(config.UidMappings) == 0 {
+ return fmt.Errorf("rootless containers requires at least one UID mapping")
}
-
- // We can only map one user and group inside a container (our own).
- if len(config.UidMappings) != 1 || config.UidMappings[0].Size != 1 {
- return fmt.Errorf("rootless containers cannot map more than one user")
- }
- if len(config.GidMappings) != 1 || config.GidMappings[0].Size != 1 {
- return fmt.Errorf("rootless containers cannot map more than one group")
+ // This check is about GID mappings; report it as such rather than
+ // repeating the UID message.
+ if len(config.GidMappings) == 0 {
+ return fmt.Errorf("rootless containers requires at least one GID mapping")
}
return nil
@@ -104,11 +87,28 @@ func rootlessMount(config *configs.Config) error {
// Check that the options list doesn't contain any uid= or gid= entries
// that don't resolve to root.
for _, opt := range strings.Split(mount.Data, ",") {
- if strings.HasPrefix(opt, "uid=") && opt != "uid=0" {
- return fmt.Errorf("cannot specify uid= mount options in rootless containers where argument isn't 0")
+ if strings.HasPrefix(opt, "uid=") {
+ var uid int
+ n, err := fmt.Sscanf(opt, "uid=%d", &uid)
+ if n != 1 || err != nil {
+ // Ignore unknown mount options.
+ continue
+ }
+ if !hasIDMapping(uid, config.UidMappings) {
+ return fmt.Errorf("cannot specify uid= mount options for unmapped uid in rootless containers")
+ }
}
- if strings.HasPrefix(opt, "gid=") && opt != "gid=0" {
- return fmt.Errorf("cannot specify gid= mount options in rootless containers where argument isn't 0")
+
+ if strings.HasPrefix(opt, "gid=") {
+ var gid int
+ n, err := fmt.Sscanf(opt, "gid=%d", &gid)
+ if n != 1 || err != nil {
+ // Ignore unknown mount options.
+ continue
+ }
+ if !hasIDMapping(gid, config.GidMappings) {
+ return fmt.Errorf("cannot specify gid= mount options for unmapped gid in rootless containers")
+ }
}
}
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go
index 828434544..cbbba9a03 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go
@@ -7,6 +7,7 @@ import (
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
selinux "github.com/opencontainers/selinux/go-selinux"
)
@@ -40,6 +41,9 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.sysctl(config); err != nil {
return err
}
+ if err := v.intelrdt(config); err != nil {
+ return err
+ }
if config.Rootless {
if err := v.rootless(config); err != nil {
return err
@@ -153,6 +157,19 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
return nil
}
+// intelrdt validates the Intel RDT section of config. A nil config.IntelRdt
+// is accepted as-is. When it is set, intelrdt.IsEnabled() must report true
+// and IntelRdt.L3CacheSchema must be non-empty; otherwise an error is
+// returned.
+func (v *ConfigValidator) intelrdt(config *configs.Config) error {
+ if config.IntelRdt != nil {
+ if !intelrdt.IsEnabled() {
+ return fmt.Errorf("intelRdt is specified in config, but Intel RDT feature is not supported or enabled")
+ }
+ if config.IntelRdt.L3CacheSchema == "" {
+ return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty")
+ }
+ }
+
+ return nil
+}
+
func isSymbolicLink(path string) (bool, error) {
fi, err := os.Lstat(path)
if err != nil {
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console.go b/vendor/github.com/opencontainers/runc/libcontainer/console.go
deleted file mode 100644
index 917acc702..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/console.go
+++ /dev/null
@@ -1,17 +0,0 @@
-package libcontainer
-
-import (
- "io"
- "os"
-)
-
-// Console represents a pseudo TTY.
-type Console interface {
- io.ReadWriteCloser
-
- // Path returns the filesystem path to the slave side of the pty.
- Path() string
-
- // Fd returns the fd for the master of the pty.
- File() *os.File
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go b/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go
deleted file mode 100644
index b7166a31f..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/console_freebsd.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// +build freebsd
-
-package libcontainer
-
-import (
- "errors"
-)
-
-// newConsole returns an initialized console that can be used within a container by copying bytes
-// from the master side to the slave that is attached as the tty for the container's init process.
-func newConsole() (Console, error) {
- return nil, errors.New("libcontainer console is not supported on FreeBSD")
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go
index f70de3848..9997e93ed 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/console_linux.go
@@ -1,71 +1,14 @@
package libcontainer
import (
- "fmt"
"os"
- "unsafe"
"golang.org/x/sys/unix"
)
-func ConsoleFromFile(f *os.File) Console {
- return &linuxConsole{
- master: f,
- }
-}
-
-// newConsole returns an initialized console that can be used within a container by copying bytes
-// from the master side to the slave that is attached as the tty for the container's init process.
-func newConsole() (Console, error) {
- master, err := os.OpenFile("/dev/ptmx", unix.O_RDWR|unix.O_NOCTTY|unix.O_CLOEXEC, 0)
- if err != nil {
- return nil, err
- }
- console, err := ptsname(master)
- if err != nil {
- return nil, err
- }
- if err := unlockpt(master); err != nil {
- return nil, err
- }
- return &linuxConsole{
- slavePath: console,
- master: master,
- }, nil
-}
-
-// linuxConsole is a linux pseudo TTY for use within a container.
-type linuxConsole struct {
- master *os.File
- slavePath string
-}
-
-func (c *linuxConsole) File() *os.File {
- return c.master
-}
-
-func (c *linuxConsole) Path() string {
- return c.slavePath
-}
-
-func (c *linuxConsole) Read(b []byte) (int, error) {
- return c.master.Read(b)
-}
-
-func (c *linuxConsole) Write(b []byte) (int, error) {
- return c.master.Write(b)
-}
-
-func (c *linuxConsole) Close() error {
- if m := c.master; m != nil {
- return m.Close()
- }
- return nil
-}
-
// mount initializes the console inside the rootfs mounting with the specified mount label
// and applying the correct ownership of the console.
-func (c *linuxConsole) mount() error {
+func mountConsole(slavePath string) error {
oldMask := unix.Umask(0000)
defer unix.Umask(oldMask)
f, err := os.Create("/dev/console")
@@ -75,17 +18,20 @@ func (c *linuxConsole) mount() error {
if f != nil {
f.Close()
}
- return unix.Mount(c.slavePath, "/dev/console", "bind", unix.MS_BIND, "")
+ return unix.Mount(slavePath, "/dev/console", "bind", unix.MS_BIND, "")
}
// dupStdio opens the slavePath for the console and dups the fds to the current
// processes stdio, fd 0,1,2.
-func (c *linuxConsole) dupStdio() error {
- slave, err := c.open(unix.O_RDWR)
+func dupStdio(slavePath string) error {
+ fd, err := unix.Open(slavePath, unix.O_RDWR, 0)
if err != nil {
- return err
+ return &os.PathError{
+ Op: "open",
+ Path: slavePath,
+ Err: err,
+ }
}
- fd := int(slave.Fd())
for _, i := range []int{0, 1, 2} {
if err := unix.Dup3(fd, i, 0); err != nil {
return err
@@ -93,60 +39,3 @@ func (c *linuxConsole) dupStdio() error {
}
return nil
}
-
-// open is a clone of os.OpenFile without the O_CLOEXEC used to open the pty slave.
-func (c *linuxConsole) open(flag int) (*os.File, error) {
- r, e := unix.Open(c.slavePath, flag, 0)
- if e != nil {
- return nil, &os.PathError{
- Op: "open",
- Path: c.slavePath,
- Err: e,
- }
- }
- return os.NewFile(uintptr(r), c.slavePath), nil
-}
-
-func ioctl(fd uintptr, flag, data uintptr) error {
- if _, _, err := unix.Syscall(unix.SYS_IOCTL, fd, flag, data); err != 0 {
- return err
- }
- return nil
-}
-
-// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
-// unlockpt should be called before opening the slave side of a pty.
-func unlockpt(f *os.File) error {
- var u int32
- return ioctl(f.Fd(), unix.TIOCSPTLCK, uintptr(unsafe.Pointer(&u)))
-}
-
-// ptsname retrieves the name of the first available pts for the given master.
-func ptsname(f *os.File) (string, error) {
- n, err := unix.IoctlGetInt(int(f.Fd()), unix.TIOCGPTN)
- if err != nil {
- return "", err
- }
- return fmt.Sprintf("/dev/pts/%d", n), nil
-}
-
-// SaneTerminal sets the necessary tty_ioctl(4)s to ensure that a pty pair
-// created by us acts normally. In particular, a not-very-well-known default of
-// Linux unix98 ptys is that they have +onlcr by default. While this isn't a
-// problem for terminal emulators, because we relay data from the terminal we
-// also relay that funky line discipline.
-func SaneTerminal(terminal *os.File) error {
- termios, err := unix.IoctlGetTermios(int(terminal.Fd()), unix.TCGETS)
- if err != nil {
- return fmt.Errorf("ioctl(tty, tcgets): %s", err.Error())
- }
-
- // Set -onlcr so we don't have to deal with \r.
- termios.Oflag &^= unix.ONLCR
-
- if err := unix.IoctlSetTermios(int(terminal.Fd()), unix.TCSETS, termios); err != nil {
- return fmt.Errorf("ioctl(tty, tcsets): %s", err.Error())
- }
-
- return nil
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go
deleted file mode 100644
index e5ca54599..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/console_solaris.go
+++ /dev/null
@@ -1,11 +0,0 @@
-package libcontainer
-
-import (
- "errors"
-)
-
-// newConsole returns an initialized console that can be used within a container by copying bytes
-// from the master side to the slave that is attached as the tty for the container's init process.
-func newConsole() (Console, error) {
- return nil, errors.New("libcontainer console is not supported on Solaris")
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go
deleted file mode 100644
index c61e866a5..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/console_windows.go
+++ /dev/null
@@ -1,30 +0,0 @@
-package libcontainer
-
-// newConsole returns an initialized console that can be used within a container
-func newConsole() (Console, error) {
- return &windowsConsole{}, nil
-}
-
-// windowsConsole is a Windows pseudo TTY for use within a container.
-type windowsConsole struct {
-}
-
-func (c *windowsConsole) Fd() uintptr {
- return 0
-}
-
-func (c *windowsConsole) Path() string {
- return ""
-}
-
-func (c *windowsConsole) Read(b []byte) (int, error) {
- return 0, nil
-}
-
-func (c *windowsConsole) Write(b []byte) (int, error) {
- return 0, nil
-}
-
-func (c *windowsConsole) Close() error {
- return nil
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go
index d7e7516e5..cfb05b43a 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go
@@ -5,6 +5,7 @@ package libcontainer
import (
"bytes"
"encoding/json"
+ "errors"
"fmt"
"io"
"io/ioutil"
@@ -21,6 +22,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/criurpc"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
@@ -38,10 +40,14 @@ type linuxContainer struct {
root string
config *configs.Config
cgroupManager cgroups.Manager
+ intelRdtManager intelrdt.Manager
+ initPath string
initArgs []string
initProcess parentProcess
initProcessStartTime uint64
criuPath string
+ newuidmapPath string
+ newgidmapPath string
m sync.Mutex
criuVersion int
state containerState
@@ -67,6 +73,9 @@ type State struct {
// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
ExternalDescriptors []string `json:"external_descriptors,omitempty"`
+
+ // Intel RDT "resource control" filesystem path
+ IntelRdtPath string `json:"intel_rdt_path"`
}
// Container is a libcontainer container object.
@@ -163,6 +172,11 @@ func (c *linuxContainer) Stats() (*Stats, error) {
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
}
+ if c.intelRdtManager != nil {
+ if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
+ return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats")
+ }
+ }
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
@@ -186,8 +200,26 @@ func (c *linuxContainer) Set(config configs.Config) error {
if status == Stopped {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
+ if err := c.cgroupManager.Set(&config); err != nil {
+ // Set configs back
+ if err2 := c.cgroupManager.Set(c.config); err2 != nil {
+ logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
+ }
+ return err
+ }
+ if c.intelRdtManager != nil {
+ if err := c.intelRdtManager.Set(&config); err != nil {
+ // Set configs back
+ if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
+ logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
+ }
+ return err
+ }
+ }
+ // After config setting succeeds, update config and states
c.config = &config
- return c.cgroupManager.Set(c.config)
+ _, err = c.updateState(nil)
+ return err
}
func (c *linuxContainer) Start(process *Process) error {
@@ -236,20 +268,71 @@ func (c *linuxContainer) Exec() error {
func (c *linuxContainer) exec() error {
path := filepath.Join(c.root, execFifoFilename)
- f, err := os.OpenFile(path, os.O_RDONLY, 0)
- if err != nil {
- return newSystemErrorWithCause(err, "open exec fifo for reading")
+
+ fifoOpen := make(chan struct{})
+ select {
+ case <-awaitProcessExit(c.initProcess.pid(), fifoOpen):
+ return errors.New("container process is already dead")
+ case result := <-awaitFifoOpen(path):
+ close(fifoOpen)
+ if result.err != nil {
+ return result.err
+ }
+ f := result.file
+ defer f.Close()
+ if err := readFromExecFifo(f); err != nil {
+ return err
+ }
+ return os.Remove(path)
}
- defer f.Close()
- data, err := ioutil.ReadAll(f)
+}
+
+// readFromExecFifo reads execFifo until EOF. It returns the read error if
+// one occurs, an error if no bytes at all were read, and nil otherwise.
+func readFromExecFifo(execFifo io.Reader) error {
+ data, err := ioutil.ReadAll(execFifo)
if err != nil {
return err
}
- if len(data) > 0 {
- os.Remove(path)
- return nil
+ if len(data) <= 0 {
+ return fmt.Errorf("cannot start an already running container")
}
- return fmt.Errorf("cannot start an already running container")
+ return nil
+}
+
+// awaitProcessExit starts a goroutine that checks system.Stat(pid) every
+// 100ms. The returned channel is closed as soon as the stat call fails or
+// reports the Zombie state. If exit becomes readable (for example because
+// the caller closed it) the goroutine stops and the returned channel is
+// left open.
+func awaitProcessExit(pid int, exit <-chan struct{}) <-chan struct{} {
+ isDead := make(chan struct{})
+ go func() {
+ for {
+ select {
+ case <-exit:
+ // Caller no longer needs the result.
+ return
+ case <-time.After(time.Millisecond * 100):
+ stat, err := system.Stat(pid)
+ // A stat failure is treated the same as a zombie: the process is gone.
+ if err != nil || stat.State == system.Zombie {
+ close(isDead)
+ return
+ }
+ }
+ }
+ }()
+ return isDead
+}
+
+// awaitFifoOpen opens the exec FIFO at path for reading in a background
+// goroutine and returns a channel that receives exactly one openResult:
+// the opened file on success, or a wrapped error on failure.
+func awaitFifoOpen(path string) <-chan openResult {
+ // Buffer of one: the caller may stop listening (for example when it
+ // observes the init process dying first), and the goroutine must still
+ // be able to hand over its result and exit instead of blocking on the
+ // send forever.
+ fifoOpened := make(chan openResult, 1)
+ go func() {
+ f, err := os.OpenFile(path, os.O_RDONLY, 0)
+ if err != nil {
+ fifoOpened <- openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")}
+ return
+ }
+ fifoOpened <- openResult{file: f}
+ }()
+ return fifoOpened
+}
+
+// openResult carries the outcome of opening the exec FIFO: file is set on
+// success, err on failure.
+type openResult struct {
+ file *os.File
+ err error
}
func (c *linuxContainer) start(process *Process, isInit bool) error {
@@ -259,7 +342,7 @@ func (c *linuxContainer) start(process *Process, isInit bool) error {
}
if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped.
- if err := parent.terminate(); err != nil {
+ if err := ignoreTerminateErrors(parent.terminate()); err != nil {
logrus.Warn(err)
}
return newSystemErrorWithCause(err, "starting container process")
@@ -277,15 +360,17 @@ func (c *linuxContainer) start(process *Process, isInit bool) error {
c.initProcessStartTime = state.InitProcessStartTime
if c.config.Hooks != nil {
+ bundle, annotations := utils.Annotations(c.config.Labels)
s := configs.HookState{
- Version: c.config.Version,
- ID: c.id,
- Pid: parent.pid(),
- Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
+ Version: c.config.Version,
+ ID: c.id,
+ Pid: parent.pid(),
+ Bundle: bundle,
+ Annotations: annotations,
}
for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
- if err := parent.terminate(); err != nil {
+ if err := ignoreTerminateErrors(parent.terminate()); err != nil {
logrus.Warn(err)
}
return newSystemErrorWithCausef(err, "running poststart hook %d", i)
@@ -341,6 +426,23 @@ func (c *linuxContainer) deleteExecFifo() {
os.Remove(fifoName)
}
+// includeExecFifo opens the container's execfifo as a pathfd, so that the
+// container cannot access the statedir (and the FIFO itself remains
+// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
+// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
+func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
+ fifoName := filepath.Join(c.root, execFifoFilename)
+ // O_PATH yields a descriptor that refers to the FIFO without opening it
+ // for reading or writing.
+ fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return err
+ }
+
+ cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName))
+ // The FIFO was appended last, so its index is len(ExtraFiles)-1; adding
+ // stdioFdCount gives the fd number in the child.
+ // NOTE(review): assumes stdioFdCount is 3 (stdin/stdout/stderr) — confirm.
+ cmd.Env = append(cmd.Env,
+ fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
+ return nil
+}
+
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := utils.NewSockPair("init")
if err != nil {
@@ -354,22 +456,20 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
}
- // We only set up rootDir if we're not doing a `runc exec`. The reason for
- // this is to avoid cases where a racing, unprivileged process inside the
- // container can get access to the statedir file descriptor (which would
- // allow for container rootfs escape).
- rootDir, err := os.Open(c.root)
- if err != nil {
- return nil, err
+ // We only set up fifoFd if we're not doing a `runc exec`. The historic
+ // reason for this is that previously we would pass a dirfd that allowed
+ // for container rootfs escape (and not doing it in `runc exec` avoided
+ // that problem), but we no longer do that. However, there's no need to do
+ // this for `runc exec` so we just keep it this way to be safe.
+ if err := c.includeExecFifo(cmd); err != nil {
+ return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
}
- cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
- cmd.Env = append(cmd.Env,
- fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
- return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
+ return c.newInitProcess(p, cmd, parentPipe, childPipe)
}
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
- cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
+ cmd := exec.Command(c.initPath, c.initArgs[1:]...)
+ cmd.Args[0] = c.initArgs[0]
cmd.Stdin = p.Stdin
cmd.Stdout = p.Stdout
cmd.Stderr = p.Stderr
@@ -397,7 +497,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
return cmd, nil
}
-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
+func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
@@ -411,16 +511,16 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
return nil, err
}
return &initProcess{
- cmd: cmd,
- childPipe: childPipe,
- parentPipe: parentPipe,
- manager: c.cgroupManager,
- config: c.newInitConfig(p),
- container: c,
- process: p,
- bootstrapData: data,
- sharePidns: sharePidns,
- rootDir: rootDir,
+ cmd: cmd,
+ childPipe: childPipe,
+ parentPipe: parentPipe,
+ manager: c.cgroupManager,
+ intelRdtManager: c.intelRdtManager,
+ config: c.newInitConfig(p),
+ container: c,
+ process: p,
+ bootstrapData: data,
+ sharePidns: sharePidns,
}, nil
}
@@ -439,6 +539,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
return &setnsProcess{
cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(),
+ intelRdtPath: state.IntelRdtPath,
childPipe: childPipe,
parentPipe: parentPipe,
config: c.newInitConfig(p),
@@ -477,6 +578,8 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
cfg.Rlimits = process.Rlimits
}
cfg.CreateConsole = process.ConsoleSocket != nil
+ cfg.ConsoleWidth = process.ConsoleWidth
+ cfg.ConsoleHeight = process.ConsoleHeight
return cfg
}
@@ -578,9 +681,24 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.
logrus.Debugf("Feature check says: %s", criuFeatures)
missingFeatures := false
- if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
- missingFeatures = true
- logrus.Debugf("CRIU does not support MemTrack")
+ // The outer if checks if the fields actually exist
+ if (criuFeat.MemTrack != nil) &&
+ (criuFeatures.MemTrack != nil) {
+ // The inner if checks if they are set to true
+ if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
+ missingFeatures = true
+ logrus.Debugf("CRIU does not support MemTrack")
+ }
+ }
+
+ // This needs to be repeated for every new feature check.
+ // Is there a way to put this in a function? Reflection?
+ if (criuFeat.LazyPages != nil) &&
+ (criuFeatures.LazyPages != nil) {
+ if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
+ missingFeatures = true
+ logrus.Debugf("CRIU does not support LazyPages")
+ }
}
if missingFeatures {
@@ -610,9 +728,9 @@ func parseCriuVersion(path string) (int, error) {
return 0, fmt.Errorf("Unable to parse the CRIU version: %s", path)
}
- n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
+ n, err := fmt.Sscanf(version, "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
if err != nil {
- n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6
+ n, err = fmt.Sscanf(version, "GitID: v%d.%d", &x, &y) // 1.6
y++
} else {
z++
@@ -736,6 +854,25 @@ func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
+ return nil
+}
+
+func waitForCriuLazyServer(r *os.File, status string) error {
+
+ data := make([]byte, 1)
+ _, err := r.Read(data)
+ if err != nil {
+ return err
+ }
+ fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend)
+ if err != nil {
+ return err
+ }
+ _, err = fd.Write(data)
+ if err != nil {
+ return err
+ }
+ fd.Close()
return nil
}
@@ -802,6 +939,8 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
OrphanPtsMaster: proto.Bool(true),
+ AutoDedup: proto.Bool(criuOpts.AutoDedup),
+ LazyPages: proto.Bool(criuOpts.LazyPages),
}
fcg := c.cgroupManager.GetPaths()["freezer"]
@@ -852,6 +991,24 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
Opts: &rpcOpts,
}
+ if criuOpts.LazyPages {
+ // lazy migration requested; check if criu supports it
+ feat := criurpc.CriuFeatures{
+ LazyPages: proto.Bool(true),
+ }
+
+ if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
+ return err
+ }
+
+ statusRead, statusWrite, err := os.Pipe()
+ if err != nil {
+ return err
+ }
+ rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd()))
+ go waitForCriuLazyServer(statusRead, criuOpts.StatusFd)
+ }
+
//no need to dump these information in pre-dump
if !criuOpts.PreDump {
for _, m := range c.config.Mounts {
@@ -1003,6 +1160,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
OrphanPtsMaster: proto.Bool(true),
+ AutoDedup: proto.Bool(criuOpts.AutoDedup),
+ LazyPages: proto.Bool(criuOpts.LazyPages),
},
}
@@ -1331,11 +1490,13 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
}
case notify.GetScript() == "setup-namespaces":
if c.config.Hooks != nil {
+ bundle, annotations := utils.Annotations(c.config.Labels)
s := configs.HookState{
- Version: c.config.Version,
- ID: c.id,
- Pid: int(notify.GetPid()),
- Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
+ Version: c.config.Version,
+ ID: c.id,
+ Pid: int(notify.GetPid()),
+ Bundle: bundle,
+ Annotations: annotations,
}
for i, hook := range c.config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
@@ -1380,7 +1541,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
defer master.Close()
// While we can access console.master, using the API is a good idea.
- if err := utils.SendFd(process.ConsoleSocket, master); err != nil {
+ if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
return err
}
}
@@ -1388,7 +1549,9 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
}
func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
- c.initProcess = process
+ if process != nil {
+ c.initProcess = process
+ }
state, err := c.currentState()
if err != nil {
return nil, err
@@ -1493,6 +1656,10 @@ func (c *linuxContainer) currentState() (*State, error) {
startTime, _ = c.initProcess.startTime()
externalDescriptors = c.initProcess.externalDescriptors()
}
+ intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID())
+ if err != nil {
+ intelRdtPath = ""
+ }
state := &State{
BaseState: BaseState{
ID: c.ID(),
@@ -1503,6 +1670,7 @@ func (c *linuxContainer) currentState() (*State, error) {
},
Rootless: c.config.Rootless,
CgroupPaths: c.cgroupManager.GetPaths(),
+ IntelRdtPath: intelRdtPath,
NamespacePaths: make(map[configs.NamespaceType]string),
ExternalDescriptors: externalDescriptors,
}
@@ -1601,6 +1769,12 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
if !joinExistingUser {
// write uid mappings
if len(c.config.UidMappings) > 0 {
+ if c.config.Rootless && c.newuidmapPath != "" {
+ r.AddData(&Bytemsg{
+ Type: UidmapPathAttr,
+ Value: []byte(c.newuidmapPath),
+ })
+ }
b, err := encodeIDMapping(c.config.UidMappings)
if err != nil {
return nil, err
@@ -1621,6 +1795,12 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Type: GidmapAttr,
Value: b,
})
+ if c.config.Rootless && c.newgidmapPath != "" {
+ r.AddData(&Bytemsg{
+ Type: GidmapPathAttr,
+ Value: []byte(c.newgidmapPath),
+ })
+ }
// The following only applies if we are root.
if !c.config.Rootless {
// check if we have CAP_SETGID to setgroup properly
@@ -1652,3 +1832,18 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
return bytes.NewReader(r.Serialize()), nil
}
+
+// ignoreTerminateErrors returns nil if the given err matches an error known
+// to indicate that the terminate occurred successfully or err was nil, otherwise
+// err is returned unaltered.
+func ignoreTerminateErrors(err error) error {
+ if err == nil {
+ return nil
+ }
+ s := err.Error()
+ switch {
+ case strings.Contains(s, "process already finished"), strings.Contains(s, "Wait was already called"):
+ return nil
+ }
+ return err
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go
deleted file mode 100644
index bb84ff740..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/container_solaris.go
+++ /dev/null
@@ -1,20 +0,0 @@
-package libcontainer
-
-// State represents a running container's state
-type State struct {
- BaseState
-
- // Platform specific fields below here
-}
-
-// A libcontainer container object.
-//
-// Each container is thread-safe within the same process. Since a container can
-// be destroyed by a separate process, any function may return that the container
-// was not found.
-type Container interface {
- BaseContainer
-
- // Methods below here are platform specific
-
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/container_windows.go
deleted file mode 100644
index bb84ff740..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/container_windows.go
+++ /dev/null
@@ -1,20 +0,0 @@
-package libcontainer
-
-// State represents a running container's state
-type State struct {
- BaseState
-
- // Platform specific fields below here
-}
-
-// A libcontainer container object.
-//
-// Each container is thread-safe within the same process. Since a container can
-// be destroyed by a separate process, any function may return that the container
-// was not found.
-type Container interface {
- BaseContainer
-
- // Methods below here are platform specific
-
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go
index 9423d2464..a2e344fc4 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_linux.go
@@ -23,7 +23,7 @@ type VethPairName struct {
type CriuOpts struct {
ImagesDirectory string // directory for storing image files
WorkDirectory string // directory to cd and write logs/pidfiles/stats to
- ParentImage string // direcotry for storing parent image files in pre-dump and dump
+ ParentImage string // directory for storing parent image files in pre-dump and dump
LeaveRunning bool // leave container in running state after checkpoint
TcpEstablished bool // checkpoint/restore established TCP connections
ExternalUnixConnections bool // allow external unix connections
@@ -34,4 +34,7 @@ type CriuOpts struct {
VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cgMode // dump or restore cgroup mode
EmptyNs uint32 // don't c/r properties for namespace from this mask
+ AutoDedup bool // auto deduplication for incremental dumps
+ LazyPages bool // restore memory pages lazily using userfaultfd
+ StatusFd string // fd for feedback when lazy server is ready
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go
deleted file mode 100644
index bc9207703..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/criu_opts_windows.go
+++ /dev/null
@@ -1,6 +0,0 @@
-package libcontainer
-
-// TODO Windows: This can ultimately be entirely factored out as criu is
-// a Unix concept not relevant on Windows.
-type CriuOpts struct {
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go
index 461dc097c..361925890 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go
@@ -28,6 +28,15 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
if err != nil {
return nil, err
}
+
+ var (
+ devNumber = stat.Rdev
+ major = unix.Major(devNumber)
+ )
+ if major == 0 {
+ return nil, ErrNotADevice
+ }
+
var (
devType rune
mode = stat.Mode
@@ -37,21 +46,16 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
devType = 'b'
case mode&unix.S_IFCHR == unix.S_IFCHR:
devType = 'c'
- default:
- return nil, ErrNotADevice
}
- devNumber := int(stat.Rdev)
- uid := stat.Uid
- gid := stat.Gid
return &configs.Device{
Type: devType,
Path: path,
- Major: Major(devNumber),
- Minor: Minor(devNumber),
+ Major: int64(major),
+ Minor: int64(unix.Minor(devNumber)),
Permissions: permissions,
FileMode: os.FileMode(mode),
- Uid: uid,
- Gid: gid,
+ Uid: stat.Uid,
+ Gid: stat.Gid,
}, nil
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/devices/devices_unsupported.go
deleted file mode 100644
index 6649b9f2d..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices_unsupported.go
+++ /dev/null
@@ -1,3 +0,0 @@
-// +build !linux
-
-package devices
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/devices/number.go b/vendor/github.com/opencontainers/runc/libcontainer/devices/number.go
deleted file mode 100644
index 885b6e5dd..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/devices/number.go
+++ /dev/null
@@ -1,24 +0,0 @@
-// +build linux freebsd
-
-package devices
-
-/*
-
-This code provides support for manipulating linux device numbers. It should be replaced by normal syscall functions once http://code.google.com/p/go/issues/detail?id=8106 is solved.
-
-You can read what they are here:
-
- - http://www.makelinux.net/ldd3/chp-3-sect-2
- - http://www.linux-tutorial.info/modules.php?name=MContent&pageid=94
-
-Note! These are NOT the same as the MAJOR(dev_t device);, MINOR(dev_t device); and MKDEV(int major, int minor); functions as defined in <linux/kdev_t.h> as the representation of device numbers used by go is different than the one used internally to the kernel! - https://github.com/torvalds/linux/blob/master/include/linux/kdev_t.h#L9
-
-*/
-
-func Major(devNumber int) int64 {
- return int64((devNumber >> 8) & 0xfff)
-}
-
-func Minor(devNumber int) int64 {
- return int64((devNumber & 0xff) | ((devNumber >> 12) & 0xfff00))
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go
index 42b6f5a05..7d53d5e04 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go
@@ -11,13 +11,13 @@ import (
"runtime/debug"
"strconv"
- "github.com/docker/docker/pkg/mount"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
- "github.com/opencontainers/runc/libcontainer/cgroups/rootless"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
+ "github.com/opencontainers/runc/libcontainer/mount"
"github.com/opencontainers/runc/libcontainer/utils"
"golang.org/x/sys/unix"
@@ -72,15 +72,15 @@ func Cgroupfs(l *LinuxFactory) error {
return nil
}
-// RootlessCgroups is an options func to configure a LinuxFactory to
-// return containers that use the "rootless" cgroup manager, which will
-// fail to do any operations not possible to do with an unprivileged user.
-// It should only be used in conjunction with rootless containers.
-func RootlessCgroups(l *LinuxFactory) error {
- l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
- return &rootless.Manager{
- Cgroups: config,
- Paths: paths,
+// IntelRdtFs is an options func to configure a LinuxFactory to return
+// containers that use the Intel RDT "resource control" filesystem to
+// create and manage Intel Xeon platform shared resources (e.g., L3 cache).
+func IntelRdtFs(l *LinuxFactory) error {
+ l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
+ return &intelrdt.IntelRdtManager{
+ Config: config,
+ Id: id,
+ Path: path,
}
}
return nil
@@ -119,12 +119,16 @@ func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
}
l := &LinuxFactory{
Root: root,
- InitArgs: []string{"/proc/self/exe", "init"},
+ InitPath: "/proc/self/exe",
+ InitArgs: []string{os.Args[0], "init"},
Validator: validate.New(),
CriuPath: "criu",
}
Cgroupfs(l)
for _, opt := range options {
+ if opt == nil {
+ continue
+ }
if err := opt(l); err != nil {
return nil, err
}
@@ -137,6 +141,10 @@ type LinuxFactory struct {
// Root directory for the factory to store state.
Root string
+ // InitPath is the path for calling the init responsibilities for spawning
+ // a container.
+ InitPath string
+
// InitArgs are arguments for calling the init responsibilities for spawning
// a container.
InitArgs []string
@@ -145,11 +153,19 @@ type LinuxFactory struct {
// containers.
CriuPath string
+ // New{u,g}idmapPath are the paths to the binaries used for mapping with
+ // rootless containers.
+ NewuidmapPath string
+ NewgidmapPath string
+
// Validator provides validation to container configurations.
Validator validate.Validator
// NewCgroupsManager returns an initialized cgroups manager for a single container.
NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
+
+ // NewIntelRdtManager returns an initialized Intel RDT manager for a single container.
+ NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager
}
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
@@ -174,17 +190,20 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
return nil, newGenericError(err, SystemError)
}
- if config.Rootless {
- RootlessCgroups(l)
- }
c := &linuxContainer{
id: id,
root: containerRoot,
config: config,
+ initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
+ newuidmapPath: l.NewuidmapPath,
+ newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
}
+ if intelrdt.IsEnabled() {
+ c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
+ }
c.state = &stoppedState{c: c}
return c, nil
}
@@ -203,17 +222,16 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors,
}
- // We have to use the RootlessManager.
- if state.Rootless {
- RootlessCgroups(l)
- }
c := &linuxContainer{
initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
id: id,
config: &state.Config,
+ initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
+ newuidmapPath: l.NewuidmapPath,
+ newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot,
created: state.Created,
@@ -222,6 +240,9 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
if err := c.refreshState(); err != nil {
return nil, err
}
+ if intelrdt.IsEnabled() {
+ c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
+ }
return c, nil
}
@@ -233,10 +254,10 @@ func (l *LinuxFactory) Type() string {
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
var (
- pipefd, rootfd int
+ pipefd, fifofd int
consoleSocket *os.File
envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
- envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR")
+ envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD")
envConsole = os.Getenv("_LIBCONTAINER_CONSOLE")
)
@@ -252,11 +273,11 @@ func (l *LinuxFactory) StartInitialization() (err error) {
)
defer pipe.Close()
- // Only init processes have STATEDIR.
- rootfd = -1
+ // Only init processes have FIFOFD.
+ fifofd = -1
if it == initStandard {
- if rootfd, err = strconv.Atoi(envStateDir); err != nil {
- return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
+ if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
+ return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
}
}
@@ -291,7 +312,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
}
}()
- i, err := newContainerInit(it, pipe, consoleSocket, rootfd)
+ i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
if err != nil {
return err
}
@@ -323,3 +344,21 @@ func (l *LinuxFactory) validateID(id string) error {
return nil
}
+
+// NewuidmapPath returns an option func to configure a LinuxFactory with the
+// provided path to the newuidmap binary.
+func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error {
+ return func(l *LinuxFactory) error {
+ l.NewuidmapPath = newuidmapPath
+ return nil
+ }
+}
+
+// NewgidmapPath returns an option func to configure a LinuxFactory with the
+// provided path to the newgidmap binary.
+func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error {
+ return func(l *LinuxFactory) error {
+ l.NewgidmapPath = newgidmapPath
+ return nil
+ }
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go
index 63afd28eb..2770be307 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go
@@ -12,15 +12,16 @@ import (
"syscall" // only for Errno
"unsafe"
+ "golang.org/x/sys/unix"
+
+ "github.com/containerd/console"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runc/libcontainer/utils"
-
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
- "golang.org/x/sys/unix"
)
type initType string
@@ -31,7 +32,8 @@ const (
)
type pid struct {
- Pid int `json:"pid"`
+ Pid int `json:"pid"`
+ PidFirstChild int `json:"pid_first"`
}
// network is an internal struct used to setup container networks.
@@ -60,6 +62,8 @@ type initConfig struct {
ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"`
CreateConsole bool `json:"create_console"`
+ ConsoleWidth uint16 `json:"console_width"`
+ ConsoleHeight uint16 `json:"console_height"`
Rootless bool `json:"rootless"`
}
@@ -67,7 +71,7 @@ type initer interface {
Init() error
}
-func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD int) (initer, error) {
+func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
@@ -88,7 +92,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDi
consoleSocket: consoleSocket,
parentPid: unix.Getppid(),
config: config,
- stateDirFD: stateDirFD,
+ fifoFd: fifoFd,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)
@@ -169,29 +173,38 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error {
// however, that setupUser (specifically fixStdioPermissions) *will* change
// the UID owner of the console to be the user the process will run as (so
// they can actually control their console).
- console, err := newConsole()
+
+ pty, slavePath, err := console.NewPty()
if err != nil {
return err
}
- // After we return from here, we don't need the console anymore.
- defer console.Close()
- linuxConsole, ok := console.(*linuxConsole)
- if !ok {
- return fmt.Errorf("failed to cast console to *linuxConsole")
+ if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
+ err = pty.Resize(console.WinSize{
+ Height: config.ConsoleHeight,
+ Width: config.ConsoleWidth,
+ })
+
+ if err != nil {
+ return err
+ }
}
+
+ // After we return from here, we don't need the console anymore.
+ defer pty.Close()
+
// Mount the console inside our rootfs.
if mount {
- if err := linuxConsole.mount(); err != nil {
+ if err := mountConsole(slavePath); err != nil {
return err
}
}
// While we can access console.master, using the API is a good idea.
- if err := utils.SendFd(socket, linuxConsole.File()); err != nil {
+ if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
return err
}
// Now, dup over all the things.
- return linuxConsole.dupStdio()
+ return dupStdio(slavePath)
}
// syncParentReady sends to the given pipe a JSON payload which indicates that
@@ -260,25 +273,27 @@ func setupUser(config *initConfig) error {
}
}
- if config.Rootless {
- if execUser.Uid != 0 {
- return fmt.Errorf("cannot run as a non-root user in a rootless container")
- }
-
- if execUser.Gid != 0 {
- return fmt.Errorf("cannot run as a non-root group in a rootless container")
- }
+ // Rather than just erroring out later in setuid(2) and setgid(2), check
+ // that the user is mapped here.
+ if _, err := config.Config.HostUID(execUser.Uid); err != nil {
+ return fmt.Errorf("cannot set uid to unmapped user in user namespace")
+ }
+ if _, err := config.Config.HostGID(execUser.Gid); err != nil {
+ return fmt.Errorf("cannot set gid to unmapped user in user namespace")
+ }
- // We cannot set any additional groups in a rootless container and thus we
- // bail if the user asked us to do so. TODO: We currently can't do this
- // earlier, but if libcontainer.Process.User was typesafe this might work.
+ if config.Rootless {
+ // We cannot set any additional groups in a rootless container and thus
+ // we bail if the user asked us to do so. TODO: We currently can't do
+ // this check earlier, but if libcontainer.Process.User was typesafe
+ // this might work.
if len(addGroups) > 0 {
return fmt.Errorf("cannot set any additional groups in a rootless container")
}
}
- // before we change to the container's user make sure that the processes STDIO
- // is correctly owned by the user that we are switching to.
+ // Before we change to the container's user make sure that the processes
+ // STDIO is correctly owned by the user that we are switching to.
if err := fixStdioPermissions(config, execUser); err != nil {
return err
}
@@ -297,7 +312,6 @@ func setupUser(config *initConfig) error {
if err := system.Setgid(execUser.Gid); err != nil {
return err
}
-
if err := system.Setuid(execUser.Uid); err != nil {
return err
}
@@ -334,14 +348,6 @@ func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
continue
}
- // Skip chown if s.Gid is actually an unmapped gid in the host. While
- // this is a bit dodgy if it just so happens that the console _is_
- // owned by overflow_gid, there's no way for us to disambiguate this as
- // a userspace program.
- if _, err := config.Config.HostGID(int(s.Gid)); err != nil {
- continue
- }
-
// We only change the uid owner (as it is possible for the mount to
// prefer a different gid, and there's no reason for us to change it).
// The reason why we don't just leave the default uid=X mount setup is
@@ -349,6 +355,15 @@ func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
// this code, you couldn't effectively run as a non-root user inside a
// container and also have a console set up.
if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
+ // If we've hit an EINVAL then s.Gid isn't mapped in the user
+ // namespace. If we've hit an EPERM then the inode's current owner
+ // is not mapped in our user namespace (in particular,
+ // privileged_wrt_inode_uidgid() has failed). In either case, we
+ // are in a configuration where it's better for us to just not
+ // touch the stdio rather than bail at this point.
+ if err == unix.EINVAL || err == unix.EPERM {
+ continue
+ }
return err
}
}
@@ -479,6 +494,16 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
logrus.Warn(err)
}
+ subreaper, err := system.GetSubreaper()
+ if err != nil {
+ // The error here means that PR_GET_CHILD_SUBREAPER is not
+ // supported because this code might run on a kernel older
+ // than 3.4. We don't want to throw an error in that case,
+ // and we simplify things, considering there is no subreaper
+ // set.
+ subreaper = 0
+ }
+
for _, p := range procs {
if s != unix.SIGKILL {
if ok, err := isWaitable(p.Pid); err != nil {
@@ -492,9 +517,16 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
}
}
- if _, err := p.Wait(); err != nil {
- if !isNoChildren(err) {
- logrus.Warn("wait: ", err)
+ // In case a subreaper has been setup, this code must not
+ // wait for the process. Otherwise, we cannot be sure the
+ // current process will be reaped by the subreaper, while
+ // the subreaper might be waiting for this process in order
+ // to retrieve its exit code.
+ if subreaper == 0 {
+ if _, err := p.Wait(); err != nil {
+ if !isNoChildren(err) {
+ logrus.Warn("wait: ", err)
+ }
}
}
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go
new file mode 100644
index 000000000..487c630af
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go
@@ -0,0 +1,553 @@
+// +build linux
+
+package intelrdt
+
+import (
+ "bufio"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "sync"
+
+ "github.com/opencontainers/runc/libcontainer/configs"
+)
+
+/*
+ * About Intel RDT/CAT feature:
+ * Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
+ * Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3
+ * Cache is the only resource that is supported in RDT.
+ *
+ * This feature provides a way for the software to restrict cache allocation to a
+ * defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
+ * The different subsets are identified by class of service (CLOS) and each CLOS
+ * has a capacity bitmask (CBM).
+ *
+ * More information about Intel RDT/CAT can be found in section 17.17
+ * of the Intel Software Developer Manual.
+ *
+ * About Intel RDT/CAT kernel interface:
+ * In Linux 4.10 kernel or newer, the interface is defined and exposed via
+ * "resource control" filesystem, which is a "cgroup-like" interface.
+ *
+ * Comparing with cgroups, it has similar process management lifecycle and
+ * interfaces in a container. But unlike cgroups' hierarchy, it has single level
+ * filesystem layout.
+ *
+ * Intel RDT "resource control" filesystem hierarchy:
+ * mount -t resctrl resctrl /sys/fs/resctrl
+ * tree /sys/fs/resctrl
+ * /sys/fs/resctrl/
+ * |-- info
+ * | |-- L3
+ * | |-- cbm_mask
+ * | |-- min_cbm_bits
+ * | |-- num_closids
+ * |-- cpus
+ * |-- schemata
+ * |-- tasks
+ * |-- <container_id>
+ * |-- cpus
+ * |-- schemata
+ * |-- tasks
+ *
+ * For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
+ * resource constraints.
+ *
+ * The file `tasks` has a list of tasks that belong to this group (e.g., the
+ * "<container_id>" group). Tasks can be added to a group by writing the task ID
+ * to the "tasks" file (which will automatically remove them from the previous
+ * group to which they belonged). New tasks created by fork(2) and clone(2) are
+ * added to the same group as their parent. If a pid is not in any sub group, it is
+ * in root group.
+ *
+ * The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
+ * which contains L3 cache id and capacity bitmask (CBM).
+ * Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+ * For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
+ * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
+ *
+ * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
+ * be set is less than the max bit. The max bits in the CBM is varied among
+ * supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
+ * layout, the CBM in a group should be a subset of the CBM in root. Kernel will
+ * check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
+ * of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
+ * values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
+ *
+ * For more information about Intel RDT/CAT kernel interface:
+ * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
+ *
+ * An example for runc:
+ * Consider a two-socket machine with two L3 caches where the default CBM is
+ * 0xfffff and the max CBM length is 20 bits. With this configuration, tasks
+ * inside the container only have access to the "upper" 80% of L3 cache id 0 and
+ * the "lower" 50% L3 cache id 1:
+ *
+ * "linux": {
+ * "intelRdt": {
+ * "l3CacheSchema": "L3:0=ffff0;1=3ff"
+ * }
+ * }
+ */
+
+// Manager is the set of operations performed on a container's group in the
+// Intel RDT "resource control" filesystem. IntelRdtManager implements it.
+type Manager interface {
+ // Applies Intel RDT configuration to the process with the specified pid
+ Apply(pid int) error
+
+ // Returns statistics for Intel RDT
+ GetStats() (*Stats, error)
+
+ // Destroys the Intel RDT 'container_id' group
+ Destroy() error
+
+ // Returns Intel RDT path to save in a state file and to be able to
+ // restore the object later
+ GetPath() string
+
+ // Set Intel RDT "resource control" filesystem as configured.
+ Set(container *configs.Config) error
+}
+
+// This implements interface Manager
+type IntelRdtManager struct {
+ // mu is held by Apply, Destroy and GetStats while they run.
+ mu sync.Mutex
+ // Config is the container configuration; Apply and GetStats do nothing
+ // when its IntelRdt section is nil.
+ Config *configs.Config
+ // Id is the name of the container's group directory under the resctrl root.
+ Id string
+ // Path is the group directory: set by Apply, lazily derived from Id by
+ // GetPath, and cleared by Destroy.
+ Path string
+}
+
+const (
+ // IntelRdtTasks is the name of the file inside a group directory that
+ // holds the IDs of the tasks belonging to that group.
+ IntelRdtTasks = "tasks"
+)
+
+var (
+ // The absolute root path of the Intel RDT "resource control" filesystem
+ // (empty until getIntelRdtRoot has located the mount point)
+ intelRdtRoot string
+ // intelRdtRootLock is held by getIntelRdtRoot while it reads or fills
+ // intelRdtRoot.
+ intelRdtRootLock sync.Mutex
+
+ // The flag to indicate if Intel RDT is supported (assigned in init)
+ isEnabled bool
+)
+
+// intelRdtData bundles what join needs to place one process into a
+// container's group.
+type intelRdtData struct {
+ // root is the resctrl mount point, as returned by getIntelRdtRoot.
+ root string
+ // config is the container configuration handed to getIntelRdtData.
+ config *configs.Config
+ // pid is the process that join writes into the group's tasks file.
+ pid int
+}
+
+// init probes once, at package load, whether Intel RDT can be used and
+// records the answer in isEnabled.
+func init() {
+	// Step 1: hardware and kernel must advertise the "cat_l3" CPU flag.
+	hasFlag, err := parseCpuInfoFile("/proc/cpuinfo")
+	if err == nil && hasFlag {
+		// Step 2: the "resource control" filesystem must already be
+		// mounted; mounting it is the user's responsibility.
+		isEnabled = isIntelRdtMounted()
+		return
+	}
+
+	isEnabled = false
+}
+
+// Return the mount point path of Intel RDT "resource control" filesystem.
+//
+// /proc/self/mountinfo is scanned line by line; the first entry whose
+// filesystem type (the first field after the " - " separator) is "resctrl"
+// wins, and its fifth space-separated field (the mount point) is returned.
+// A NotFoundError is returned when no such entry exists; a malformed line
+// yields a descriptive error instead of a panic.
+func findIntelRdtMountpointDir() (string, error) {
+	f, err := os.Open("/proc/self/mountinfo")
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		text := s.Text()
+		// Safe as mountinfo encodes mountpoints with spaces as \040.
+		index := strings.Index(text, " - ")
+		// Without this guard index is -1 and text[index+3:] would slice
+		// from offset 2 (or panic on a line shorter than that).
+		if index < 0 {
+			return "", fmt.Errorf("Found no '-' separator in %q", text)
+		}
+		postSeparatorFields := strings.Fields(text[index+3:])
+		numPostFields := len(postSeparatorFields)
+
+		// This is an error as we can't detect if the mount is for "Intel RDT"
+		if numPostFields == 0 {
+			return "", fmt.Errorf("Found no fields post '-' in %q", text)
+		}
+
+		if postSeparatorFields[0] == "resctrl" {
+			// Check that the mount is properly formatted.
+			if numPostFields < 3 {
+				return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+			}
+
+			// The mount point is the fifth field before the separator;
+			// make sure it exists before indexing.
+			fields := strings.Split(text[:index], " ")
+			if len(fields) < 5 {
+				return "", fmt.Errorf("Error found less than 5 fields before '-' in %q", text)
+			}
+
+			return fields[4], nil
+		}
+	}
+	if err := s.Err(); err != nil {
+		return "", err
+	}
+
+	return "", NewNotFoundError("Intel RDT")
+}
+
+// getIntelRdtRoot returns the mount point of the Intel RDT "resource
+// control" filesystem. The first successful lookup is cached in
+// intelRdtRoot; failures are not cached, so a later call retries.
+func getIntelRdtRoot() (string, error) {
+	intelRdtRootLock.Lock()
+	defer intelRdtRootLock.Unlock()
+
+	if intelRdtRoot == "" {
+		mountpoint, err := findIntelRdtMountpointDir()
+		if err != nil {
+			return "", err
+		}
+
+		// Only cache a mount point that can actually be stat'ed.
+		if _, statErr := os.Stat(mountpoint); statErr != nil {
+			return "", statErr
+		}
+
+		intelRdtRoot = mountpoint
+	}
+
+	return intelRdtRoot, nil
+}
+
+// isIntelRdtMounted reports whether getIntelRdtRoot can locate the
+// "resource control" filesystem.
+func isIntelRdtMounted() bool {
+	_, err := getIntelRdtRoot()
+	return err == nil
+}
+
+// parseCpuInfoFile reports whether the cpuinfo-style file at path contains
+// the "cat_l3" flag, i.e. whether Intel RDT/CAT is supported.
+//
+// Each line is split on single spaces and every token is compared with
+// "cat_l3". An error is returned when the file cannot be opened or when
+// scanning stops early (read failure or a line too long for the scanner).
+func parseCpuInfoFile(path string) (bool, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return false, err
+	}
+	defer f.Close()
+
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		// "cat_l3" flag is set if Intel RDT/CAT is supported
+		for _, flag := range strings.Split(s.Text(), " ") {
+			if flag == "cat_l3" {
+				return true, nil
+			}
+		}
+	}
+	// Scan signals failure only by returning false, so the error has to be
+	// inspected after the loop; inside the loop it is always nil.
+	if err := s.Err(); err != nil {
+		return false, err
+	}
+	return false, nil
+}
+
+// parseUint parses s as an unsigned integer, clamping negative input to 0.
+//
+// When strconv.ParseUint rejects s, the string is re-parsed as a signed
+// integer: a negative value (whether representable or below the minimum)
+// yields (0, nil). Any other failure returns ParseUint's own result and
+// error.
+func parseUint(s string, base, bitSize int) (uint64, error) {
+	value, err := strconv.ParseUint(s, base, bitSize)
+	if err == nil {
+		return value, nil
+	}
+
+	intValue, intErr := strconv.ParseInt(s, base, bitSize)
+	switch {
+	case intErr == nil && intValue < 0:
+		// Negative value that fits in the signed range.
+		return 0, nil
+	case intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0:
+		// Negative value below the signed minimum.
+		return 0, nil
+	}
+
+	return value, err
+}
+
+// getIntelRdtParamUint reads path/file and parses its whitespace-trimmed
+// content as a base-10 uint64 via parseUint. A read failure is returned
+// as-is; a parse failure is reported with the raw content and file name.
+func getIntelRdtParamUint(path, file string) (uint64, error) {
+	fileName := filepath.Join(path, file)
+	raw, err := ioutil.ReadFile(fileName)
+	if err != nil {
+		return 0, err
+	}
+
+	trimmed := strings.TrimSpace(string(raw))
+	res, parseErr := parseUint(trimmed, 10, 64)
+	if parseErr == nil {
+		return res, nil
+	}
+	return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(raw), fileName)
+}
+
+// getIntelRdtParamString returns the content of path/file with leading and
+// trailing whitespace removed, or the read error.
+func getIntelRdtParamString(path, file string) (string, error) {
+	raw, err := ioutil.ReadFile(filepath.Join(path, file))
+	if err == nil {
+		return strings.TrimSpace(string(raw)), nil
+	}
+
+	return "", err
+}
+
+// readTasksFile returns the task IDs listed, one per line, in the tasks
+// file of the group directory dir. Empty lines are skipped.
+//
+// An error is returned when the file cannot be opened, when a line is not
+// an integer, or when scanning stops early, so a partially read list is
+// never reported as complete.
+func readTasksFile(dir string) ([]int, error) {
+	f, err := os.Open(filepath.Join(dir, IntelRdtTasks))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	var (
+		s   = bufio.NewScanner(f)
+		out = []int{}
+	)
+
+	for s.Scan() {
+		if t := s.Text(); t != "" {
+			pid, err := strconv.Atoi(t)
+			if err != nil {
+				return nil, err
+			}
+			out = append(out, pid)
+		}
+	}
+	// Scan reports failure only by returning false; check why it stopped.
+	if err := s.Err(); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+// writeFile writes data followed by a newline to dir/file (mode 0700 if the
+// file has to be created). An empty dir is rejected up front.
+func writeFile(dir, file, data string) error {
+	if dir == "" {
+		return fmt.Errorf("no such directory for %s", file)
+	}
+
+	target := filepath.Join(dir, file)
+	payload := []byte(data + "\n")
+	err := ioutil.WriteFile(target, payload, 0700)
+	if err == nil {
+		return nil
+	}
+	return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
+}
+
+// getIntelRdtData builds an intelRdtData for pid from config c and the
+// resctrl root. It returns nil together with the error when the root
+// cannot be determined.
+func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) {
+	rootPath, err := getIntelRdtRoot()
+	if err != nil {
+		return nil, err
+	}
+
+	data := new(intelRdtData)
+	data.root = rootPath
+	data.config = c
+	data.pid = pid
+	return data, nil
+}
+
+// getL3CacheInfo reads cbm_mask, min_cbm_bits and num_closids from the
+// info/L3 directory below the resctrl root. On any failure it returns the
+// error together with a zero-valued (non-nil) L3CacheInfo.
+func getL3CacheInfo() (*L3CacheInfo, error) {
+	info := &L3CacheInfo{}
+
+	rootPath, err := getIntelRdtRoot()
+	if err != nil {
+		return info, err
+	}
+	infoDir := filepath.Join(rootPath, "info", "L3")
+
+	mask, err := getIntelRdtParamString(infoDir, "cbm_mask")
+	if err != nil {
+		return info, err
+	}
+	minBits, err := getIntelRdtParamUint(infoDir, "min_cbm_bits")
+	if err != nil {
+		return info, err
+	}
+	closids, err := getIntelRdtParamUint(infoDir, "num_closids")
+	if err != nil {
+		return info, err
+	}
+
+	// Populate only once every read has succeeded.
+	*info = L3CacheInfo{
+		CbmMask:    mask,
+		MinCbmBits: minBits,
+		NumClosids: closids,
+	}
+	return info, nil
+}
+
+// WriteIntelRdtTasks writes pid, in decimal, into the "tasks" file of the
+// group directory dir. A pid of -1 means "attach nothing" and succeeds
+// without touching the file; an empty dir is an error.
+func WriteIntelRdtTasks(dir string, pid int) error {
+	if dir == "" {
+		return fmt.Errorf("no such directory for %s", IntelRdtTasks)
+	}
+
+	// Dont attach any pid if -1 is specified as a pid
+	if pid == -1 {
+		return nil
+	}
+
+	tasksFile := filepath.Join(dir, IntelRdtTasks)
+	if err := ioutil.WriteFile(tasksFile, []byte(strconv.Itoa(pid)), 0700); err != nil {
+		return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err)
+	}
+	return nil
+}
+
+// IsEnabled reports whether Intel RDT is enabled. The value is computed
+// once by this package's init function and is not re-evaluated here.
+func IsEnabled() bool {
+ return isEnabled
+}
+
+// GetIntelRdtPath returns the group directory for container id, i.e. id
+// joined onto the resctrl root, or an error when the root is unknown.
+func GetIntelRdtPath(id string) (string, error) {
+	rootPath, err := getIntelRdtRoot()
+	if err == nil {
+		return filepath.Join(rootPath, id), nil
+	}
+
+	return "", err
+}
+
+// Applies Intel RDT configuration to the process with the specified pid.
+//
+// Nothing is done when the config has no IntelRdt section. Otherwise the
+// container's group directory is created, pid is written to its tasks file
+// and the directory is remembered in m.Path.
+//
+// Any failure to resolve the resctrl root is returned, including a missing
+// mount point: in that case there is no group to join, and getIntelRdtData
+// returns a nil *intelRdtData that must not be dereferenced.
+func (m *IntelRdtManager) Apply(pid int) (err error) {
+	// If intelRdt is not specified in config, we do nothing
+	if m.Config.IntelRdt == nil {
+		return nil
+	}
+	d, err := getIntelRdtData(m.Config, pid)
+	if err != nil {
+		return err
+	}
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	path, err := d.join(m.Id)
+	if err != nil {
+		return err
+	}
+
+	m.Path = path
+	return nil
+}
+
+// Destroy removes the container's group directory (m.Path) and, on
+// success, resets m.Path to the empty string. On failure m.Path is left
+// unchanged and the removal error is returned.
+func (m *IntelRdtManager) Destroy() error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	err := os.RemoveAll(m.Path)
+	if err == nil {
+		m.Path = ""
+	}
+	return err
+}
+
+// GetPath returns the Intel RDT path to save in a state file so the object
+// can be restored later. When no path has been recorded yet it is derived
+// from m.Id; an error from that lookup is discarded.
+func (m *IntelRdtManager) GetPath() string {
+	if m.Path != "" {
+		return m.Path
+	}
+
+	m.Path, _ = GetIntelRdtPath(m.Id)
+	return m.Path
+}
+
+// GetStats collects Intel RDT statistics: the read-only L3 cache
+// information, the first line of the root "schemata" file and the first
+// line of the container group's "schemata" file. It returns (nil, nil)
+// when the config has no IntelRdt section.
+func (m *IntelRdtManager) GetStats() (*Stats, error) {
+	// If intelRdt is not specified in config
+	if m.Config.IntelRdt == nil {
+		return nil, nil
+	}
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	stats := NewStats()
+
+	// The read-only L3 cache information
+	info, err := getL3CacheInfo()
+	if err != nil {
+		return nil, err
+	}
+	stats.L3CacheInfo = info
+
+	// The read-only L3 cache schema in root
+	rootPath, err := getIntelRdtRoot()
+	if err != nil {
+		return nil, err
+	}
+	rootSchemata, err := getIntelRdtParamString(rootPath, "schemata")
+	if err != nil {
+		return nil, err
+	}
+	// L3 cache schema is in the first line
+	stats.L3CacheSchemaRoot = strings.SplitN(rootSchemata, "\n", 2)[0]
+
+	// The L3 cache schema in 'container_id' group
+	groupSchemata, err := getIntelRdtParamString(m.GetPath(), "schemata")
+	if err != nil {
+		return nil, err
+	}
+	// L3 cache schema is in the first line
+	stats.L3CacheSchema = strings.SplitN(groupSchemata, "\n", 2)[0]
+
+	return stats, nil
+}
+
+// Set writes the configured L3 cache schema into the "schemata" file of the
+// container's group. Nothing is written when the config has no IntelRdt
+// section or its L3CacheSchema is empty.
+func (m *IntelRdtManager) Set(container *configs.Config) error {
+	path := m.GetPath()
+
+	// About L3 cache schema file:
+	// The schema has allocation masks/values for L3 cache on each socket,
+	// which contains L3 cache id and capacity bitmask (CBM).
+	//     Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+	// For example, on a two-socket machine, L3's schema line could be:
+	//     L3:0=ff;1=c0
+	// Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
+	//
+	// About L3 cache CBM validity:
+	// The valid L3 cache CBM is a *contiguous bits set* and number of
+	// bits that can be set is less than the max bit. The max bits in the
+	// CBM is varied among supported Intel Xeon platforms. In Intel RDT
+	// "resource control" filesystem layout, the CBM in a group should
+	// be a subset of the CBM in root. Kernel will check if it is valid
+	// when writing.
+	// e.g., 0xfffff in root indicates the max bits of CBM is 20 bits,
+	// which mapping to entire L3 cache capacity. Some valid CBM values
+	// to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
+	if container.IntelRdt == nil {
+		return nil
+	}
+
+	schema := container.IntelRdt.L3CacheSchema
+	if schema == "" {
+		return nil
+	}
+
+	return writeFile(path, "schemata", schema)
+}
+
+// join creates the group directory for id under raw.root (mode 0755) and
+// writes raw.pid into its tasks file. It returns the directory path, or an
+// empty string and the error of whichever step failed.
+func (raw *intelRdtData) join(id string) (string, error) {
+	groupDir := filepath.Join(raw.root, id)
+
+	err := os.MkdirAll(groupDir, 0755)
+	if err == nil {
+		err = WriteIntelRdtTasks(groupDir, raw.pid)
+	}
+	if err != nil {
+		return "", err
+	}
+	return groupDir, nil
+}
+
+// NotFoundError is the error returned when the mount point of a resource
+// control filesystem cannot be found.
+type NotFoundError struct {
+ // ResourceControl names the facility whose mount point is missing; it is
+ // interpolated into the error message.
+ ResourceControl string
+}
+
+// Error implements the error interface.
+func (e *NotFoundError) Error() string {
+ return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl)
+}
+
+// NewNotFoundError returns a *NotFoundError, as an error, whose
+// ResourceControl field is res.
+func NewNotFoundError(res string) error {
+ return &NotFoundError{
+ ResourceControl: res,
+ }
+}
+
+// IsNotFound reports whether err is a *NotFoundError. A nil error is never
+// "not found".
+func IsNotFound(err error) bool {
+	if err != nil {
+		_, isNotFound := err.(*NotFoundError)
+		return isNotFound
+	}
+	return false
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go
new file mode 100644
index 000000000..095c0a380
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go
@@ -0,0 +1,24 @@
+// +build linux
+
+package intelrdt
+
+// L3CacheInfo carries the values of the cbm_mask, min_cbm_bits and
+// num_closids entries named in its JSON tags.
+type L3CacheInfo struct {
+ CbmMask string `json:"cbm_mask,omitempty"`
+ MinCbmBits uint64 `json:"min_cbm_bits,omitempty"`
+ NumClosids uint64 `json:"num_closids,omitempty"`
+}
+
+// Stats is the Intel RDT statistics record; every field is omitted from
+// the JSON encoding when empty.
+type Stats struct {
+ // The read-only L3 cache information
+ L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"`
+
+ // The read-only L3 cache schema in root
+ L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"`
+
+ // The L3 cache schema in 'container_id' group
+ L3CacheSchema string `json:"l3_cache_schema,omitempty"`
+}
+
+// NewStats returns a pointer to a zero-valued Stats.
+func NewStats() *Stats {
+ return &Stats{}
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go
index 82ffa7a88..ce8b4e6b0 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go
@@ -29,7 +29,7 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
return err
}
- res := strings.Split(string(dest), ";")
+ res := strings.Split(dest, ";")
if len(res) < 5 {
return fmt.Errorf("Destination buffer for key description is too small")
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go
index 8829b71ad..ab453cde9 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go
@@ -18,6 +18,8 @@ const (
SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286
RootlessAttr uint16 = 27287
+ UidmapPathAttr uint16 = 27288
+ GidmapPathAttr uint16 = 27289
)
type Int32msg struct {
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/mount/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/mount/mount.go
new file mode 100644
index 000000000..e8965e081
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/mount/mount.go
@@ -0,0 +1,23 @@
+package mount
+
+// GetMounts retrieves a list of mounts for the current running process.
+// It returns whatever parseMountTable returns, unchanged.
+func GetMounts() ([]*Info, error) {
+ return parseMountTable()
+}
+
+// Mounted reports whether mountpoint appears as a mount point in the
+// current process's mount table. The comparison is an exact string match
+// against each entry's Mountpoint.
+func Mounted(mountpoint string) (bool, error) {
+	entries, err := parseMountTable()
+	if err != nil {
+		return false, err
+	}
+
+	// Walk the table until the first matching entry.
+	found := false
+	for i := 0; i < len(entries) && !found; i++ {
+		found = entries[i].Mountpoint == mountpoint
+	}
+	return found, nil
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/mount/mount_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/mount/mount_linux.go
new file mode 100644
index 000000000..1e5191928
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/mount/mount_linux.go
@@ -0,0 +1,82 @@
+// +build linux
+
+package mount
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "strings"
+)
+
+const (
+ /* 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+ (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)
+
+ (1) mount ID: unique identifier of the mount (may be reused after umount)
+ (2) parent ID: ID of parent (or of self for the top of the mount tree)
+ (3) major:minor: value of st_dev for files on filesystem
+ (4) root: root of the mount within the filesystem
+ (5) mount point: mount point relative to the process's root
+ (6) mount options: per mount options
+ (7) optional fields: zero or more fields of the form "tag[:value]"
+ (8) separator: marks the end of the optional fields
+ (9) filesystem type: name of filesystem of the form "type[.subtype]"
+ (10) mount source: filesystem specific information or "none"
+ (11) super options: per super block options*/
+ // mountinfoFormat is the Sscanf pattern parseInfoFile applies to each
+ // line. It covers fields (1)-(6), with major:minor taking two verbs; the
+ // final %s receives the token after the mount options, which is either
+ // the first optional field or the "-" separator.
+ mountinfoFormat = "%d %d %d:%d %s %s %s %s"
+)
+
+// Parse /proc/self/mountinfo because comparing Dev and ino does not work from
+// bind mounts
+//
+// parseMountTable opens the file, hands it to parseInfoFile and closes it
+// again on return; an open failure is returned with a nil slice.
+func parseMountTable() ([]*Info, error) {
+ f, err := os.Open("/proc/self/mountinfo")
+ if err != nil {
+ return nil, err
+ }
+ defer f.Close()
+
+ return parseInfoFile(f)
+}
+
+// parseInfoFile decodes mountinfo-formatted text from r, producing one Info
+// per input line.
+//
+// The fields before the " - " separator are scanned with mountinfoFormat;
+// the fields after it supply Fstype, Source and VfsOpts. Parsing stops at
+// the first malformed line (scan failure, missing separator, or fewer than
+// three fields after the separator) and at any read error, returning a nil
+// slice and the error. Empty input yields an empty, non-nil slice.
+func parseInfoFile(r io.Reader) ([]*Info, error) {
+	var (
+		s   = bufio.NewScanner(r)
+		out = []*Info{}
+	)
+
+	for s.Scan() {
+		var (
+			p              = &Info{}
+			text           = s.Text()
+			optionalFields string
+		)
+
+		if _, err := fmt.Sscanf(text, mountinfoFormat,
+			&p.ID, &p.Parent, &p.Major, &p.Minor,
+			&p.Root, &p.Mountpoint, &p.Opts, &optionalFields); err != nil {
+			return nil, fmt.Errorf("Scanning '%s' failed: %s", text, err)
+		}
+		// Safe as mountinfo encodes mountpoints with spaces as \040.
+		index := strings.Index(text, " - ")
+		// Without this guard index is -1 and the slice below would start
+		// at offset 2, yielding fields from the wrong part of the line.
+		if index < 0 {
+			return nil, fmt.Errorf("Error found no '-' separator in %q", text)
+		}
+		postSeparatorFields := strings.Fields(text[index+3:])
+		if len(postSeparatorFields) < 3 {
+			return nil, fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+		}
+
+		// A "-" here means the line has no optional fields and the scan
+		// picked up the separator itself.
+		if optionalFields != "-" {
+			p.Optional = optionalFields
+		}
+
+		p.Fstype = postSeparatorFields[0]
+		p.Source = postSeparatorFields[1]
+		p.VfsOpts = strings.Join(postSeparatorFields[2:], " ")
+		out = append(out, p)
+	}
+	// Scan reports failure only by returning false, so the error must be
+	// checked once the loop has ended; inside the loop it is always nil.
+	if err := s.Err(); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/mount/mountinfo.go b/vendor/github.com/opencontainers/runc/libcontainer/mount/mountinfo.go
new file mode 100644
index 000000000..e3fc3535e
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/mount/mountinfo.go
@@ -0,0 +1,40 @@
+package mount
+
+// Info reveals information about a particular mounted filesystem. This
+// struct is populated from the content in the /proc/<pid>/mountinfo file.
+type Info struct {
+ // ID is a unique identifier of the mount (may be reused after umount).
+ ID int
+
+ // Parent indicates the ID of the mount parent (or of self for the top of the
+ // mount tree).
+ Parent int
+
+ // Major indicates one half of the device ID which identifies the device class.
+ Major int
+
+ // Minor indicates one half of the device ID which identifies a specific
+ // instance of device.
+ Minor int
+
+ // Root of the mount within the filesystem.
+ Root string
+
+ // Mountpoint indicates the mount point relative to the process's root.
+ Mountpoint string
+
+ // Opts represents mount-specific options.
+ Opts string
+
+ // Optional represents optional fields. It is left empty when the line
+ // has none.
+ Optional string
+
+ // Fstype indicates the type of filesystem, such as EXT3.
+ Fstype string
+
+ // Source indicates filesystem specific information or "none".
+ Source string
+
+ // VfsOpts represents per super block options.
+ VfsOpts string
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go
index 81587b9b1..47a06783d 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/notify_linux.go
@@ -44,9 +44,9 @@ func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct
ch := make(chan struct{})
go func() {
defer func() {
- close(ch)
eventfd.Close()
evFile.Close()
+ close(ch)
}()
buf := make([]byte, 8)
for {
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
index 197e6d08e..2c69cee5d 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
@@ -1,3 +1,4 @@
+
#define _GNU_SOURCE
#include <endian.h>
#include <errno.h>
@@ -19,6 +20,7 @@
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/types.h>
+#include <sys/wait.h>
#include <linux/limits.h>
#include <linux/netlink.h>
@@ -29,15 +31,15 @@
/* Synchronisation values. */
enum sync_t {
- SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
- SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
- SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
- SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
- SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
- SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
+ SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
+ SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
+ SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
+ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
+ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
+ SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
/* XXX: This doesn't help with segfaults and other such issues. */
- SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
+ SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};
/* longjmp() arguments. */
@@ -64,7 +66,13 @@ struct clone_t {
struct nlconfig_t {
char *data;
+
+ /* Process settings. */
uint32_t cloneflags;
+ char *oom_score_adj;
+ size_t oom_score_adj_len;
+
+ /* User namespace settings. */
char *uidmap;
size_t uidmap_len;
char *gidmap;
@@ -72,9 +80,13 @@ struct nlconfig_t {
char *namespaces;
size_t namespaces_len;
uint8_t is_setgroup;
+
+ /* Rootless container settings. */
uint8_t is_rootless;
- char *oom_score_adj;
- size_t oom_score_adj_len;
+ char *uidmappath;
+ size_t uidmappath_len;
+ char *gidmappath;
+ size_t gidmappath_len;
};
/*
@@ -89,6 +101,8 @@ struct nlconfig_t {
#define SETGROUP_ATTR 27285
#define OOM_SCORE_ADJ_ATTR 27286
#define ROOTLESS_ATTR 27287
+#define UIDMAPPATH_ATTR 27288
+#define GIDMAPPATH_ATTR 27289
/*
* Use the raw syscall for versions of glibc which don't include a function for
@@ -152,7 +166,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
goto out;
}
-out:
+ out:
close(fd);
return ret;
}
@@ -169,16 +183,16 @@ static void update_setgroups(int pid, enum policy_t setgroup)
char *policy;
switch (setgroup) {
- case SETGROUPS_ALLOW:
- policy = "allow";
- break;
- case SETGROUPS_DENY:
- policy = "deny";
- break;
- case SETGROUPS_DEFAULT:
- default:
- /* Nothing to do. */
- return;
+ case SETGROUPS_ALLOW:
+ policy = "allow";
+ break;
+ case SETGROUPS_DENY:
+ policy = "deny";
+ break;
+ case SETGROUPS_DEFAULT:
+ default:
+ /* Nothing to do. */
+ return;
}
if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
@@ -191,22 +205,96 @@ static void update_setgroups(int pid, enum policy_t setgroup)
}
}
-static void update_uidmap(int pid, char *map, size_t map_len)
+/*
+ * Run the external mapping helper @app (newuidmap/newgidmap style) as
+ * "@app <pid> <map tokens...>" in a forked child and wait for it.
+ *
+ * @map is split in place on spaces and newlines; @map_len is not consulted.
+ * Returns the helper's exit status (0 on success), or -1 if the helper was
+ * killed by a signal. Bails on setup failures.
+ */
+static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
+{
+	int child;
+
+	/*
+	 * If @app is NULL, execve will segfault. Just check it here and bail (if
+	 * we're in this path, the caller is already getting desperate and there
+	 * isn't a backup to this failing). This usually would be a configuration
+	 * or programming issue.
+	 */
+	if (!app)
+		bail("mapping tool not present");
+
+	child = fork();
+	if (child < 0)
+		bail("failed to fork");
+
+	if (!child) {
+#define MAX_ARGV 20
+		char *argv[MAX_ARGV];
+		char *envp[] = { NULL };
+		char pid_fmt[16];
+		int argc = 0;
+		char *next;
+
+		snprintf(pid_fmt, 16, "%d", pid);
+
+		argv[argc++] = (char *)app;
+		argv[argc++] = pid_fmt;
+		/*
+		 * Convert the map string into a list of argument that
+		 * newuidmap/newgidmap can understand. One slot is always kept
+		 * free for the terminating NULL that execve() requires.
+		 */
+		while (*map != '\0') {
+			if (argc >= MAX_ARGV - 1)
+				bail("too many mapping entries for %s", app);
+			argv[argc++] = map;
+			next = strpbrk(map, "\n ");
+			if (next == NULL)
+				break;
+			*next++ = '\0';
+			map = next + strspn(next, "\n ");
+		}
+		/* Terminate on every exit path, including a map with no trailing delimiter. */
+		argv[argc] = NULL;
+
+		execve(app, argv, envp);
+		bail("failed to execv");
+	} else {
+		int status;
+
+		while (true) {
+			if (waitpid(child, &status, 0) < 0) {
+				if (errno == EINTR)
+					continue;
+				bail("failed to waitpid");
+			}
+			if (WIFEXITED(status))
+				return WEXITSTATUS(status);
+			/*
+			 * WEXITSTATUS is only meaningful after a normal exit; a
+			 * helper killed by a signal must not be read as success.
+			 */
+			if (WIFSIGNALED(status))
+				return -1;
+		}
+	}
+
+	return -1;
+}
+
+static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
{
if (map == NULL || map_len <= 0)
return;
- if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0)
- bail("failed to update /proc/%d/uid_map", pid);
+ if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
+ if (errno != EPERM)
+ bail("failed to update /proc/%d/uid_map", pid);
+ if (try_mapping_tool(path, pid, map, map_len))
+ bail("failed to use newuid map on %d", pid);
+ }
}
-static void update_gidmap(int pid, char *map, size_t map_len)
+static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
{
if (map == NULL || map_len <= 0)
return;
- if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0)
- bail("failed to update /proc/%d/gid_map", pid);
+ if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
+ if (errno != EPERM)
+ bail("failed to update /proc/%d/gid_map", pid);
+ if (try_mapping_tool(path, pid, map, map_len))
+ bail("failed to use newgid map on %d", pid);
+ }
}
static void update_oom_score_adj(char *data, size_t len)
@@ -230,7 +318,7 @@ static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
struct clone_t ca = {
- .env = env,
+ .env = env,
.jmpval = jmpval,
};
@@ -350,6 +438,14 @@ static void nl_parse(int fd, struct nlconfig_t *config)
config->gidmap = current;
config->gidmap_len = payload_len;
break;
+ case UIDMAPPATH_ATTR:
+ config->uidmappath = current;
+ config->uidmappath_len = payload_len;
+ break;
+ case GIDMAPPATH_ATTR:
+ config->gidmappath = current;
+ config->gidmappath_len = payload_len;
+ break;
case SETGROUP_ATTR:
config->is_setgroup = readint8(current);
break;
@@ -436,7 +532,7 @@ void nsexec(void)
int pipenum;
jmp_buf env;
int sync_child_pipe[2], sync_grandchild_pipe[2];
- struct nlconfig_t config = {0};
+ struct nlconfig_t config = { 0 };
/*
* If we don't have an init pipe, just return to the go routine.
@@ -533,21 +629,21 @@ void nsexec(void)
*/
switch (setjmp(env)) {
- /*
- * Stage 0: We're in the parent. Our job is just to create a new child
- * (stage 1: JUMP_CHILD) process and write its uid_map and
- * gid_map. That process will go on to create a new process, then
- * it will send us its PID which we will send to the bootstrap
- * process.
- */
- case JUMP_PARENT: {
+ /*
+ * Stage 0: We're in the parent. Our job is just to create a new child
+ * (stage 1: JUMP_CHILD) process and write its uid_map and
+ * gid_map. That process will go on to create a new process, then
+ * it will send us its PID which we will send to the bootstrap
+ * process.
+ */
+ case JUMP_PARENT:{
int len;
- pid_t child;
+ pid_t child, first_child = -1;
char buf[JSON_MAX];
bool ready = false;
/* For debugging. */
- prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
+ prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
/* Start the process of getting a container. */
child = clone_parent(&env, JUMP_CHILD);
@@ -596,8 +692,8 @@ void nsexec(void)
update_setgroups(child, SETGROUPS_DENY);
/* Set up mappings. */
- update_uidmap(child, config.uidmap, config.uidmap_len);
- update_gidmap(child, config.gidmap, config.gidmap_len);
+ update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
+ update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
s = SYNC_USERMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
@@ -605,19 +701,19 @@ void nsexec(void)
bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
}
break;
- case SYNC_RECVPID_PLS: {
- pid_t old = child;
+ case SYNC_RECVPID_PLS:{
+ first_child = child;
/* Get the init_func pid. */
if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
- kill(old, SIGKILL);
+ kill(first_child, SIGKILL);
bail("failed to sync with child: read(childpid)");
}
/* Send ACK. */
s = SYNC_RECVPID_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
- kill(old, SIGKILL);
+ kill(first_child, SIGKILL);
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
}
@@ -665,8 +761,13 @@ void nsexec(void)
}
}
- /* Send the init_func pid back to our parent. */
- len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
+ /*
+ * Send the init_func pid and the pid of the first child back to our parent.
+ *
+ * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
+ * It becomes the responsibility of our parent to reap the first child.
+ */
+ len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
@@ -679,16 +780,16 @@ void nsexec(void)
exit(0);
}
- /*
- * Stage 1: We're in the first child process. Our job is to join any
- * provided namespaces in the netlink payload and unshare all
- * of the requested namespaces. If we've been asked to
- * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
- * our user mappings for us. Then, we create a new child
- * (stage 2: JUMP_INIT) for PID namespace. We then send the
- * child's PID to our parent (stage 0).
- */
- case JUMP_CHILD: {
+ /*
+ * Stage 1: We're in the first child process. Our job is to join any
+ * provided namespaces in the netlink payload and unshare all
+ * of the requested namespaces. If we've been asked to
+ * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
+ * our user mappings for us. Then, we create a new child
+ * (stage 2: JUMP_INIT) for PID namespace. We then send the
+ * child's PID to our parent (stage 0).
+ */
+ case JUMP_CHILD:{
pid_t child;
enum sync_t s;
@@ -697,7 +798,7 @@ void nsexec(void)
close(sync_child_pipe[1]);
/* For debugging. */
- prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
+ prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
/*
* We need to setns first. We cannot do this earlier (in stage 0)
@@ -799,13 +900,13 @@ void nsexec(void)
exit(0);
}
- /*
- * Stage 2: We're the final child process, and the only process that will
- * actually return to the Go runtime. Our job is to just do the
- * final cleanup steps and then return to the Go runtime to allow
- * init_linux.go to run.
- */
- case JUMP_INIT: {
+ /*
+ * Stage 2: We're the final child process, and the only process that will
+ * actually return to the Go runtime. Our job is to just do the
+ * final cleanup steps and then return to the Go runtime to allow
+ * init_linux.go to run.
+ */
+ case JUMP_INIT:{
/*
* We're inside the child now, having jumped from the
* start_child() code after forking in the parent.
@@ -819,7 +920,7 @@ void nsexec(void)
close(sync_child_pipe[1]);
/* For debugging. */
- prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);
+ prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process.go b/vendor/github.com/opencontainers/runc/libcontainer/process.go
index f1ad08149..86bf7387f 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/process.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/process.go
@@ -47,6 +47,10 @@ type Process struct {
// ExtraFiles specifies additional open files to be inherited by the container
ExtraFiles []*os.File
+ // Initial sizings for the console
+ ConsoleWidth uint16
+ ConsoleHeight uint16
+
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
Capabilities *configs.Capabilities
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go
index 171685ccd..58980b059 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go
@@ -15,6 +15,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
@@ -49,6 +50,7 @@ type setnsProcess struct {
parentPipe *os.File
childPipe *os.File
cgroupPaths map[string]string
+ intelRdtPath string
config *initConfig
fds []string
process *Process
@@ -83,12 +85,20 @@ func (p *setnsProcess) start() (err error) {
if err = p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "executing setns process")
}
- // We can't join cgroups if we're in a rootless container.
- if !p.config.Rootless && len(p.cgroupPaths) > 0 {
+ if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
}
+ if p.intelRdtPath != "" {
+ // if Intel RDT "resource control" filesystem path exists
+ _, err := os.Stat(p.intelRdtPath)
+ if err == nil {
+ if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
+ return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid())
+ }
+ }
+ }
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
@@ -141,6 +151,16 @@ func (p *setnsProcess) execSetns() error {
p.cmd.Wait()
return newSystemErrorWithCause(err, "reading pid from init pipe")
}
+
+ // Clean up the zombie parent process
+ firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
+ if err != nil {
+ return err
+ }
+
+ // Ignore the error in case the child has already been reaped for any reason
+ _, _ = firstChildProcess.Wait()
+
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
@@ -183,17 +203,17 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) {
}
type initProcess struct {
- cmd *exec.Cmd
- parentPipe *os.File
- childPipe *os.File
- config *initConfig
- manager cgroups.Manager
- container *linuxContainer
- fds []string
- process *Process
- bootstrapData io.Reader
- sharePidns bool
- rootDir *os.File
+ cmd *exec.Cmd
+ parentPipe *os.File
+ childPipe *os.File
+ config *initConfig
+ manager cgroups.Manager
+ intelRdtManager intelrdt.Manager
+ container *linuxContainer
+ fds []string
+ process *Process
+ bootstrapData io.Reader
+ sharePidns bool
}
func (p *initProcess) pid() int {
@@ -224,6 +244,16 @@ func (p *initProcess) execSetns() error {
p.cmd.Wait()
return err
}
+
+ // Clean up the zombie parent process
+ firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
+ if err != nil {
+ return err
+ }
+
+ // Ignore the error in case the child has already been reaped for any reason
+ _, _ = firstChildProcess.Wait()
+
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
@@ -238,17 +268,39 @@ func (p *initProcess) start() error {
err := p.cmd.Start()
p.process.ops = p
p.childPipe.Close()
- p.rootDir.Close()
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
}
+ // Do this before syncing with child so that no children can escape the
+ // cgroup. We don't need to worry about not doing this and not being root
+ // because we'd be using the rootless cgroup manager in that case.
+ if err := p.manager.Apply(p.pid()); err != nil {
+ return newSystemErrorWithCause(err, "applying cgroup configuration for process")
+ }
+ if p.intelRdtManager != nil {
+ if err := p.intelRdtManager.Apply(p.pid()); err != nil {
+ return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
+ }
+ }
+ defer func() {
+ if err != nil {
+ // TODO: should not be the responsibility to call here
+ p.manager.Destroy()
+ if p.intelRdtManager != nil {
+ p.intelRdtManager.Destroy()
+ }
+ }
+ }()
+
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
+
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
}
+
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
@@ -257,18 +309,6 @@ func (p *initProcess) start() error {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
}
p.setExternalDescriptors(fds)
- // Do this before syncing with child so that no children can escape the
- // cgroup. We don't need to worry about not doing this and not being root
- // because we'd be using the rootless cgroup manager in that case.
- if err := p.manager.Apply(p.pid()); err != nil {
- return newSystemErrorWithCause(err, "applying cgroup configuration for process")
- }
- defer func() {
- if err != nil {
- // TODO: should not be the responsibility to call here
- p.manager.Destroy()
- }
- }()
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
}
@@ -294,13 +334,20 @@ func (p *initProcess) start() error {
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
+ if p.intelRdtManager != nil {
+ if err := p.intelRdtManager.Set(p.config.Config); err != nil {
+ return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
+ }
+ }
if p.config.Config.Hooks != nil {
+ bundle, annotations := utils.Annotations(p.container.config.Labels)
s := configs.HookState{
- Version: p.container.config.Version,
- ID: p.container.id,
- Pid: p.pid(),
- Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"),
+ Version: p.container.config.Version,
+ ID: p.container.id,
+ Pid: p.pid(),
+ Bundle: bundle,
+ Annotations: annotations,
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
@@ -319,12 +366,19 @@ func (p *initProcess) start() error {
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
}
+ if p.intelRdtManager != nil {
+ if err := p.intelRdtManager.Set(p.config.Config); err != nil {
+ return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
+ }
+ }
if p.config.Config.Hooks != nil {
+ bundle, annotations := utils.Annotations(p.container.config.Labels)
s := configs.HookState{
- Version: p.container.config.Version,
- ID: p.container.id,
- Pid: p.pid(),
- Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"),
+ Version: p.container.config.Version,
+ ID: p.container.id,
+ Pid: p.pid(),
+ Bundle: bundle,
+ Annotations: annotations,
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go
index e2e734a85..73ee2bd69 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go
@@ -13,11 +13,11 @@ import (
"strings"
"time"
- "github.com/docker/docker/pkg/mount"
- "github.com/docker/docker/pkg/symlink"
+ "github.com/cyphar/filepath-securejoin"
"github.com/mrunalp/fileutils"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/mount"
"github.com/opencontainers/runc/libcontainer/system"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/selinux/go-selinux/label"
@@ -40,7 +40,8 @@ func needsSetupDev(config *configs.Config) bool {
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
-func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) {
+func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
+ config := iConfig.Config
if err := prepareRoot(config); err != nil {
return newSystemErrorWithCause(err, "preparing rootfs")
}
@@ -80,6 +81,7 @@ func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) {
// The hooks are run after the mounts are setup, but before we switch to the new
// root, so that the old root is still available in the hooks for any mount
// manipulations.
+ // Note that iConfig.Cwd is not guaranteed to exist here.
if err := syncParentHooks(pipe); err != nil {
return err
}
@@ -98,8 +100,10 @@ func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) {
if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs)
- } else {
+ } else if config.Namespaces.Contains(configs.NEWNS) {
err = pivotRoot(config.Rootfs)
+ } else {
+ err = chroot(config.Rootfs)
}
if err != nil {
return newSystemErrorWithCause(err, "jailing process inside rootfs")
@@ -111,6 +115,14 @@ func prepareRootfs(pipe io.ReadWriter, config *configs.Config) (err error) {
}
}
+ if cwd := iConfig.Cwd; cwd != "" {
+ // Note that spec.Process.Cwd can contain an unclean value like "../../../../foo/bar...".
+ // However, it is safe to call MkdirAll directly because we are already inside the jail here.
+ if err := os.MkdirAll(cwd, 0755); err != nil {
+ return err
+ }
+ }
+
return nil
}
@@ -230,7 +242,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
// any previous mounts can invalidate the next mount's destination.
// this can happen when a user specifies mounts within other mounts to cause breakouts or other
// evil stuff to try to escape the container's rootfs.
- if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil {
+ if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
return err
}
if err := checkMountDestination(rootfs, dest); err != nil {
@@ -318,7 +330,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
// this can happen when a user specifies mounts within other mounts to cause breakouts or other
// evil stuff to try to escape the container's rootfs.
var err error
- if dest, err = symlink.FollowSymlinkInScope(dest, rootfs); err != nil {
+ if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
return err
}
if err := checkMountDestination(rootfs, dest); err != nil {
@@ -668,9 +680,12 @@ func pivotRoot(rootfs string) error {
return err
}
- // Make oldroot rprivate to make sure our unmounts don't propagate to the
- // host (and thus bork the machine).
- if err := unix.Mount("", ".", "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil {
+ // Make oldroot rslave to make sure our unmounts don't propagate to the
+ // host (and thus bork the machine). We don't use rprivate because this is
+ // known to cause issues due to races where we still have a reference to a
+ // mount while a process in the host namespace is trying to operate on
+ // something it thinks has no mounts (devicemapper in particular).
+ if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
return err
}
// Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
@@ -689,6 +704,10 @@ func msMoveRoot(rootfs string) error {
if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
return err
}
+ return chroot(rootfs)
+}
+
+func chroot(rootfs string) error {
if err := unix.Chroot("."); err != nil {
return err
}
@@ -733,7 +752,14 @@ func remountReadonly(m *configs.Mount) error {
flags = m.Flags
)
for i := 0; i < 5; i++ {
- if err := unix.Mount("", dest, "", uintptr(flags|unix.MS_REMOUNT|unix.MS_RDONLY), ""); err != nil {
+ // There is a special case in the kernel for
+ // MS_REMOUNT | MS_BIND, which allows us to change only the
+ // flags even as an unprivileged user (i.e. user namespace)
+ // assuming we don't drop any security related flags (nodev,
+ // nosuid, etc.). So, let's use that case so that we can do
+ // this re-mount without failing in a userns.
+ flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY
+ if err := unix.Mount("", dest, "", uintptr(flags), ""); err != nil {
switch err {
case unix.EBUSY:
time.Sleep(100 * time.Millisecond)
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go
index 2523cbf99..d99f3fe64 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go
@@ -22,6 +22,11 @@ var (
actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)
+const (
+ // Linux system calls can have at most 6 arguments
+ syscallMaxArguments int = 6
+)
+
// Filters given syscalls in a container, preventing them from being used
// Started in the container init process, and carried over to all child processes
// Setns calls, however, require a separate invocation, as they are not children
@@ -45,11 +50,11 @@ func InitSeccomp(config *configs.Seccomp) error {
for _, arch := range config.Architectures {
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
- return err
+ return fmt.Errorf("error validating Seccomp architecture: %s", err)
}
if err := filter.AddArch(scmpArch); err != nil {
- return err
+ return fmt.Errorf("error adding architecture to seccomp filter: %s", err)
}
}
@@ -170,29 +175,55 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
// Convert the call's action to the libseccomp equivalent
callAct, err := getAction(call.Action)
if err != nil {
- return err
+ return fmt.Errorf("action in seccomp profile is invalid: %s", err)
}
// Unconditional match - just add the rule
if len(call.Args) == 0 {
if err = filter.AddRule(callNum, callAct); err != nil {
- return err
+ return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err)
}
} else {
- // Conditional match - convert the per-arg rules into library format
+ // If two or more conditions apply to the same argument,
+ // revert to old behavior, adding each condition as a separate rule
+ argCounts := make([]uint, syscallMaxArguments)
conditions := []libseccomp.ScmpCondition{}
for _, cond := range call.Args {
newCond, err := getCondition(cond)
if err != nil {
- return err
+ return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err)
}
+ argCounts[cond.Index] += 1
+
conditions = append(conditions, newCond)
}
- if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
- return err
+ hasMultipleArgs := false
+ for _, count := range argCounts {
+ if count > 1 {
+ hasMultipleArgs = true
+ break
+ }
+ }
+
+ if hasMultipleArgs {
+ // Revert to old behavior
+ // Add each condition attached to a separate rule
+ for _, cond := range conditions {
+ condArr := []libseccomp.ScmpCondition{cond}
+
+ if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
+ return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
+ }
+ }
+ } else {
+ // No conditions share same argument
+ // Use new, proper behavior
+ if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
+ return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
+ }
}
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setgroups_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setgroups_linux.go
deleted file mode 100644
index c7bdb605a..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/setgroups_linux.go
+++ /dev/null
@@ -1,11 +0,0 @@
-// +build linux,go1.5
-
-package libcontainer
-
-import "syscall"
-
-// Set the GidMappingsEnableSetgroups member to true, so the process's
-// setgroups proc entry wont be set to 'deny' if GidMappings are set
-func enableSetgroups(sys *syscall.SysProcAttr) {
- sys.GidMappingsEnableSetgroups = true
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go
index 35b84219c..096c601e7 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go
@@ -47,7 +47,10 @@ func (l *linuxSetnsInit) Init() error {
return err
}
}
- if l.config.Config.Seccomp != nil {
+ // Without NoNewPrivileges seccomp is a privileged operation, so we need to
+ // do this before dropping capabilities; otherwise do it as late as possible
+ // just before execve so as few syscalls take place after it as possible.
+ if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
@@ -61,5 +64,13 @@ func (l *linuxSetnsInit) Init() error {
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
+ // Set seccomp as close to execve as possible, so as few syscalls take
+ // place afterward (reducing the amount of syscalls that users need to
+ // enable in their seccomp profiles).
+ if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
+ if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
+ return newSystemErrorWithCause(err, "init seccomp")
+ }
+ }
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go
index 580b3fe45..02ea753ed 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go
@@ -22,7 +22,7 @@ type linuxStandardInit struct {
pipe *os.File
consoleSocket *os.File
parentPid int
- stateDirFD int
+ fifoFd int
config *initConfig
}
@@ -30,15 +30,15 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
var newperms uint32
if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
- // with user ns we need 'other' search permissions
+ // With user ns we need 'other' search permissions.
newperms = 0x8
} else {
- // without user ns we need 'UID' search permissions
+ // Without user ns we need 'UID' search permissions.
newperms = 0x80000
}
- // create a unique per session container name that we can
- // join in setns; however, other containers can also join it
+ // Create a unique per-session container name that we can join in setns;
+ // however, other containers can also join it.
return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
}
@@ -46,12 +46,12 @@ func (l *linuxStandardInit) Init() error {
if !l.config.Config.NoNewKeyring {
ringname, keepperms, newperms := l.getSessionRingParams()
- // do not inherit the parent's session keyring
+ // Do not inherit the parent's session keyring.
sessKeyId, err := keys.JoinSessionKeyring(ringname)
if err != nil {
return err
}
- // make session keyring searcheable
+ // Make session keyring searchable.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
}
@@ -65,14 +65,9 @@ func (l *linuxStandardInit) Init() error {
}
label.Init()
-
- // prepareRootfs() can be executed only for a new mount namespace.
- if l.config.Config.Namespaces.Contains(configs.NEWNS) {
- if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
- return err
- }
+ if err := prepareRootfs(l.pipe, l.config); err != nil {
+ return err
}
-
// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.
@@ -150,37 +145,47 @@ func (l *linuxStandardInit) Init() error {
if err := pdeath.Restore(); err != nil {
return err
}
- // compare the parent from the initial start of the init process and make sure that it did not change.
- // if the parent changes that means it died and we were reparented to something else so we should
- // just kill ourself and not cause problems for someone else.
+ // Compare the parent from the initial start of the init process and make
+ // sure that it did not change. If the parent changes, that means it died
+ // and we were reparented to something else, so we should just kill ourselves
+ // and not cause problems for someone else.
if unix.Getppid() != l.parentPid {
return unix.Kill(unix.Getpid(), unix.SIGKILL)
}
- // check for the arg before waiting to make sure it exists and it is returned
- // as a create time error.
+ // Check for the arg before waiting to make sure it exists and it is
+ // returned as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
- // close the pipe to signal that we have completed our init.
+ // Close the pipe to signal that we have completed our init.
l.pipe.Close()
- // wait for the fifo to be opened on the other side before
- // exec'ing the users process.
- fd, err := unix.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|unix.O_CLOEXEC, 0)
+ // Wait for the FIFO to be opened on the other side before exec-ing the
+ // user process. We open it through /proc/self/fd/$fd, because the fd that
+ // was given to us was an O_PATH fd to the fifo itself. Linux allows us to
+ // re-open an O_PATH fd through /proc.
+ fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
if err != nil {
- return newSystemErrorWithCause(err, "openat exec fifo")
+ return newSystemErrorWithCause(err, "open exec fifo")
}
if _, err := unix.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
}
+ // Close the O_PATH fifofd fd before exec because the kernel resets
+ // dumpable in the wrong order. This has been fixed in newer kernels, but
+ // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
+ // N.B. the core issue itself (passing dirfds to the host filesystem) has
+ // since been resolved.
+ // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
+ unix.Close(l.fifoFd)
+ // Set seccomp as close to execve as possible, so as few syscalls take
+ // place afterward (reducing the amount of syscalls that users need to
+ // enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
- // close the statedir fd before exec because the kernel resets dumpable in the wrong order
- // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
- unix.Close(l.stateDirFD)
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go
index 44fa6b43a..b45ce23e4 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go
@@ -45,6 +45,11 @@ func destroy(c *linuxContainer) error {
}
}
err := c.cgroupManager.Destroy()
+ if c.intelRdtManager != nil {
+ if ierr := c.intelRdtManager.Destroy(); err == nil {
+ err = ierr
+ }
+ }
if rerr := os.RemoveAll(c.root); err == nil {
err = rerr
}
@@ -58,10 +63,12 @@ func destroy(c *linuxContainer) error {
func runPoststopHooks(c *linuxContainer) error {
if c.config.Hooks != nil {
+ bundle, annotations := utils.Annotations(c.config.Labels)
s := configs.HookState{
- Version: c.config.Version,
- ID: c.id,
- Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
+ Version: c.config.Version,
+ ID: c.id,
+ Bundle: bundle,
+ Annotations: annotations,
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_freebsd.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_freebsd.go
deleted file mode 100644
index f8d1d689c..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/stats_freebsd.go
+++ /dev/null
@@ -1,5 +0,0 @@
-package libcontainer
-
-type Stats struct {
- Interfaces []*NetworkInterface
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go
index c629dc67d..29fd641e9 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/stats_linux.go
@@ -1,8 +1,10 @@
package libcontainer
import "github.com/opencontainers/runc/libcontainer/cgroups"
+import "github.com/opencontainers/runc/libcontainer/intelrdt"
type Stats struct {
- Interfaces []*NetworkInterface
- CgroupStats *cgroups.Stats
+ Interfaces []*NetworkInterface
+ CgroupStats *cgroups.Stats
+ IntelRdtStats *intelrdt.Stats
}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go
deleted file mode 100644
index da78c1c2e..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/stats_solaris.go
+++ /dev/null
@@ -1,7 +0,0 @@
-package libcontainer
-
-// Solaris - TODO
-
-type Stats struct {
- Interfaces []*NetworkInterface
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/stats_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/stats_windows.go
deleted file mode 100644
index f8d1d689c..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/stats_windows.go
+++ /dev/null
@@ -1,5 +0,0 @@
-package libcontainer
-
-type Stats struct {
- Interfaces []*NetworkInterface
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
index 4837085a7..5f124cd8b 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
@@ -134,3 +134,14 @@ func RunningInUserNS() bool {
func SetSubreaper(i int) error {
return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
}
+
+// GetSubreaper returns the child-subreaper setting of the calling process, or -1 and the prctl error on failure.
+func GetSubreaper() (int, error) {
+ // PR_GET_CHILD_SUBREAPER makes the kernel store a C int through the
+ // pointer, so read it into a 32-bit value rather than a uintptr.
+ var i int32
+ if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
+ return -1, err
+ }
+ return int(i), nil
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_arm.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_32.go
index 31ff3deb1..c5ca5d862 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_arm.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_32.go
@@ -1,4 +1,5 @@
-// +build linux,arm
+// +build linux
+// +build 386 arm
package system
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_386.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_386.go
deleted file mode 100644
index 3f7235ed1..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_386.go
+++ /dev/null
@@ -1,25 +0,0 @@
-// +build linux,386
-
-package system
-
-import (
- "golang.org/x/sys/unix"
-)
-
-// Setuid sets the uid of the calling thread to the specified uid.
-func Setuid(uid int) (err error) {
- _, _, e1 := unix.RawSyscall(unix.SYS_SETUID32, uintptr(uid), 0, 0)
- if e1 != 0 {
- err = e1
- }
- return
-}
-
-// Setgid sets the gid of the calling thread to the specified gid.
-func Setgid(gid int) (err error) {
- _, _, e1 := unix.RawSyscall(unix.SYS_SETGID32, uintptr(gid), 0, 0)
- if e1 != 0 {
- err = e1
- }
- return
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go
index d7891a2ff..11c3faafb 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go
@@ -1,4 +1,5 @@
-// +build linux,arm64 linux,amd64 linux,ppc linux,ppc64 linux,ppc64le linux,s390x
+// +build linux
+// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le s390x
package system
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go b/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go
index b3a07cba3..b8434f105 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go
@@ -1,4 +1,4 @@
-// +build cgo,linux cgo,freebsd
+// +build cgo,linux
package system
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unsupported.go
deleted file mode 100644
index 4a8d00acb..000000000
--- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unsupported.go
+++ /dev/null
@@ -1,38 +0,0 @@
-// +build !darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris
-
-package user
-
-import (
- "io"
- "syscall"
-)
-
-func GetPasswdPath() (string, error) {
- return "", ErrUnsupported
-}
-
-func GetPasswd() (io.ReadCloser, error) {
- return nil, ErrUnsupported
-}
-
-func GetGroupPath() (string, error) {
- return "", ErrUnsupported
-}
-
-func GetGroup() (io.ReadCloser, error) {
- return nil, ErrUnsupported
-}
-
-// CurrentUser looks up the current user by their user id in /etc/passwd. If the
-// user cannot be found (or there is no /etc/passwd file on the filesystem),
-// then CurrentUser returns an error.
-func CurrentUser() (User, error) {
- return LookupUid(syscall.Getuid())
-}
-
-// CurrentGroup looks up the current user's group by their primary group id's
-// entry in /etc/passwd. If the group cannot be found (or there is no
-// /etc/group file on the filesystem), then CurrentGroup returns an error.
-func CurrentGroup() (Group, error) {
- return LookupGid(syscall.Getgid())
-}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
index 2cbb6491a..c8a9364d5 100644
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
@@ -84,12 +84,10 @@ func RecvFd(socket *os.File) (*os.File, error) {
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
-func SendFd(socket, file *os.File) error {
- name := []byte(file.Name())
+func SendFd(socket *os.File, name string, fd uintptr) error {
if len(name) >= MaxNameLen {
- return fmt.Errorf("sendfd: filename too long: %s", file.Name())
+ return fmt.Errorf("sendfd: filename too long: %s", name)
}
- oob := unix.UnixRights(int(file.Fd()))
-
- return unix.Sendmsg(int(socket.Fd()), name, oob, nil, 0)
+ oob := unix.UnixRights(int(fd))
+ return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0)
}
diff --git a/vendor/github.com/opencontainers/runc/vendor.conf b/vendor/github.com/opencontainers/runc/vendor.conf
index 9506b5c67..0ab4685fd 100644
--- a/vendor/github.com/opencontainers/runc/vendor.conf
+++ b/vendor/github.com/opencontainers/runc/vendor.conf
@@ -5,7 +5,7 @@ github.com/opencontainers/runtime-spec v1.0.0
# Core libcontainer functionality.
github.com/mrunalp/fileutils ed869b029674c0e9ce4c0dfa781405c2d9946d08
github.com/opencontainers/selinux v1.0.0-rc1
-github.com/seccomp/libseccomp-golang 32f571b70023028bd57d9288c20efbcb237f3ce0
+github.com/seccomp/libseccomp-golang 84e90a91acea0f4e51e62bc1a75de18b1fc0790f
github.com/sirupsen/logrus a3f95b5c423586578a4e099b11a46c2479628cac
github.com/syndtr/gocapability db04d3cc01c8b54962a58ec7e491717d06cfcc16
github.com/vishvananda/netlink 1e2e08e8a2dcdacaae3f14ac44c5cfa31361f270
@@ -15,7 +15,11 @@ github.com/coreos/pkg v3
github.com/godbus/dbus v3
github.com/golang/protobuf 18c9bb3261723cd5401db4d0c9fbc5c3b6c70fe8
# Command-line interface.
-github.com/docker/docker 0f5c9d301b9b1cca66b3ea0f9dec3b5317d3686d
+github.com/cyphar/filepath-securejoin v0.2.1
github.com/docker/go-units v0.2.0
github.com/urfave/cli d53eb991652b1d438abdd34ce4bfa3ef1539108e
-golang.org/x/sys 0e0164865330d5cf1c00247be08330bf96e2f87c https://github.com/golang/sys
+golang.org/x/sys 7ddbeae9ae08c6a06a59597f0c9edbc5ff2444ce https://github.com/golang/sys
+
+# console dependencies
+github.com/containerd/console 84eeaae905fa414d03e07bcd6c8d3f19e7cf180e
+github.com/pkg/errors v0.8.0