diff options
author | Matthew Heon <matthew.heon@gmail.com> | 2017-11-01 11:24:59 -0400 |
---|---|---|
committer | Matthew Heon <matthew.heon@gmail.com> | 2017-11-01 11:24:59 -0400 |
commit | a031b83a09a8628435317a03f199cdc18b78262f (patch) | |
tree | bc017a96769ce6de33745b8b0b1304ccf38e9df0 /vendor/github.com/opencontainers/runc/libcontainer/configs | |
parent | 2b74391cd5281f6fdf391ff8ad50fd1490f6bf89 (diff) | |
download | podman-a031b83a09a8628435317a03f199cdc18b78262f.tar.gz podman-a031b83a09a8628435317a03f199cdc18b78262f.tar.bz2 podman-a031b83a09a8628435317a03f199cdc18b78262f.zip |
Initial checkin from CRI-O repo
Signed-off-by: Matthew Heon <matthew.heon@gmail.com>
Diffstat (limited to 'vendor/github.com/opencontainers/runc/libcontainer/configs')
19 files changed, 1393 insertions, 0 deletions
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go new file mode 100644 index 000000000..e0f3ca165 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go @@ -0,0 +1,61 @@ +package configs + +import "fmt" + +// blockIODevice holds major:minor format supported in blkio cgroup +type blockIODevice struct { + // Major is the device's major number + Major int64 `json:"major"` + // Minor is the device's minor number + Minor int64 `json:"minor"` +} + +// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair +type WeightDevice struct { + blockIODevice + // Weight is the bandwidth rate for the device, range is from 10 to 1000 + Weight uint16 `json:"weight"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + LeafWeight uint16 `json:"leafWeight"` +} + +// NewWeightDevice returns a configured WeightDevice pointer +func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice { + wd := &WeightDevice{} + wd.Major = major + wd.Minor = minor + wd.Weight = weight + wd.LeafWeight = leafWeight + return wd +} + +// WeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) WeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight) +} + +// LeafWeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) LeafWeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight) +} + +// ThrottleDevice struct holds a `major:minor rate_per_second` pair +type ThrottleDevice struct { + blockIODevice + // Rate is the IO rate limit per cgroup per device + Rate uint64 `json:"rate"` +} + +// NewThrottleDevice returns a configured ThrottleDevice pointer +func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { + td := &ThrottleDevice{} + td.Major = major + td.Minor = minor + td.Rate = rate + return td +} + +// String formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) String() string { + return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go new file mode 100644 index 000000000..e15a662f5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go @@ -0,0 +1,122 @@ +package configs + +type FreezerState string + +const ( + Undefined FreezerState = "" + Frozen FreezerState = "FROZEN" + Thawed FreezerState = "THAWED" +) + +type Cgroup struct { + // Deprecated, use Path instead + Name string `json:"name,omitempty"` + + // name of parent of cgroup or slice + // Deprecated, use Path instead + Parent string `json:"parent,omitempty"` + + // Path specifies the path to cgroups that are created and/or joined by the container. + // The path is assumed to be relative to the host system cgroup mountpoint. + Path string `json:"path"` + + // ScopePrefix describes prefix for the scope name + ScopePrefix string `json:"scope_prefix"` + + // Paths represent the absolute cgroups paths to join. + // This takes precedence over Path. + Paths map[string]string + + // Resources contains various cgroups settings to apply + *Resources +} + +type Resources struct { + // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. + // Deprecated + AllowAllDevices *bool `json:"allow_all_devices,omitempty"` + // Deprecated + AllowedDevices []*Device `json:"allowed_devices,omitempty"` + // Deprecated + DeniedDevices []*Device `json:"denied_devices,omitempty"` + + Devices []*Device `json:"devices"` + + // Memory limit (in bytes) + Memory int64 `json:"memory"` + + // Memory reservation or soft_limit (in bytes) + MemoryReservation int64 `json:"memory_reservation"` + + // Total memory usage (memory + swap); set `-1` to enable unlimited swap + MemorySwap int64 `json:"memory_swap"` + + // Kernel memory limit (in bytes) + KernelMemory int64 `json:"kernel_memory"` + + // Kernel memory limit for TCP use (in bytes) + KernelMemoryTCP int64 `json:"kernel_memory_tcp"` + + // CPU shares (relative weight vs. other containers) + CpuShares uint64 `json:"cpu_shares"` + + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + CpuQuota int64 `json:"cpu_quota"` + + // CPU period to be used for hardcapping (in usecs). 0 to use system default. + CpuPeriod uint64 `json:"cpu_period"` + + // How many time CPU will use in realtime scheduling (in usecs). + CpuRtRuntime int64 `json:"cpu_rt_quota"` + + // CPU period to be used for realtime scheduling (in usecs). + CpuRtPeriod uint64 `json:"cpu_rt_period"` + + // CPU to use + CpusetCpus string `json:"cpuset_cpus"` + + // MEM to use + CpusetMems string `json:"cpuset_mems"` + + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + + // Specifies per cgroup weight, range is from 10 to 1000. + BlkioWeight uint16 `json:"blkio_weight"` + + // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + BlkioLeafWeight uint16 `json:"blkio_leaf_weight"` + + // Weight per cgroup per device, can override BlkioWeight. + BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"` + + // IO read rate limit per cgroup per device, bytes per second. + BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` + + // IO write rate limit per cgroup per device, bytes per second. + BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` + + // IO read rate limit per cgroup per device, IO per second. + BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"` + + // IO write rate limit per cgroup per device, IO per second. + BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"` + + // set the freeze value for the process + Freezer FreezerState `json:"freezer"` + + // Hugetlb limit (in bytes) + HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` + + // Whether to disable OOM Killer + OomKillDisable bool `json:"oom_kill_disable"` + + // Tuning swappiness behaviour per cgroup + MemorySwappiness *uint64 `json:"memory_swappiness"` + + // Set priority of network traffic for container + NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` + + // Set class identifier for container's network packets + NetClsClassid uint32 `json:"net_cls_classid_u"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go new file mode 100644 index 000000000..95e2830a4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go @@ -0,0 +1,6 @@ +// +build !windows,!linux,!freebsd + +package configs + +type Cgroup struct { +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go new file mode 100644 index 000000000..d74847b0d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go @@ -0,0 +1,6 @@ +package configs + +// TODO Windows: This can ultimately be entirely factored out on Windows as +// cgroups are a Unix-specific construct. +type Cgroup struct { +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go new file mode 100644 index 000000000..269fffff3 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -0,0 +1,344 @@ +package configs + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "time" + + "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/sirupsen/logrus" +) + +type Rlimit struct { + Type int `json:"type"` + Hard uint64 `json:"hard"` + Soft uint64 `json:"soft"` +} + +// IDMap represents UID/GID Mappings for User Namespaces. +type IDMap struct { + ContainerID int `json:"container_id"` + HostID int `json:"host_id"` + Size int `json:"size"` +} + +// Seccomp represents syscall restrictions +// By default, only the native architecture of the kernel is allowed to be used +// for syscalls. Additional architectures can be added by specifying them in +// Architectures. +type Seccomp struct { + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Syscalls []*Syscall `json:"syscalls"` +} + +// Action is taken upon rule match in Seccomp +type Action int + +const ( + Kill Action = iota + 1 + Errno + Trap + Allow + Trace +) + +// Operator is a comparison operator to be used when matching syscall arguments in Seccomp +type Operator int + +const ( + EqualTo Operator = iota + 1 + NotEqualTo + GreaterThan + GreaterThanOrEqualTo + LessThan + LessThanOrEqualTo + MaskEqualTo +) + +// Arg is a rule to match a specific syscall argument in Seccomp +type Arg struct { + Index uint `json:"index"` + Value uint64 `json:"value"` + ValueTwo uint64 `json:"value_two"` + Op Operator `json:"op"` +} + +// Syscall is a rule to match a syscall in Seccomp +type Syscall struct { + Name string `json:"name"` + Action Action `json:"action"` + Args []*Arg `json:"args"` +} + +// TODO Windows. Many of these fields should be factored out into those parts +// which are common across platforms, and those which are platform specific. + +// Config defines configuration options for executing a process inside a contained environment. +type Config struct { + // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs + // This is a common option when the container is running in ramdisk + NoPivotRoot bool `json:"no_pivot_root"` + + // ParentDeathSignal specifies the signal that is sent to the container's process in the case + // that the parent process dies. + ParentDeathSignal int `json:"parent_death_signal"` + + // Path to a directory containing the container's root filesystem. + Rootfs string `json:"rootfs"` + + // Readonlyfs will remount the container's rootfs as readonly where only externally mounted + // bind mounts are writtable. + Readonlyfs bool `json:"readonlyfs"` + + // Specifies the mount propagation flags to be applied to /. + RootPropagation int `json:"rootPropagation"` + + // Mounts specify additional source and destination paths that will be mounted inside the container's + // rootfs and mount namespace if specified + Mounts []*Mount `json:"mounts"` + + // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! + Devices []*Device `json:"devices"` + + MountLabel string `json:"mount_label"` + + // Hostname optionally sets the container's hostname if provided + Hostname string `json:"hostname"` + + // Namespaces specifies the container's namespaces that it should setup when cloning the init process + // If a namespace is not provided that namespace is shared from the container's parent process + Namespaces Namespaces `json:"namespaces"` + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capabilities not specified will be dropped from the processes capability mask + Capabilities *Capabilities `json:"capabilities"` + + // Networks specifies the container's network setup to be created + Networks []*Network `json:"networks"` + + // Routes can be specified to create entries in the route table as the container is started + Routes []*Route `json:"routes"` + + // Cgroups specifies specific cgroup settings for the various subsystems that the container is + // placed into to limit the resources the container has available + Cgroups *Cgroup `json:"cgroups"` + + // AppArmorProfile specifies the profile to apply to the process running in the container and is + // change at the time the process is execed + AppArmorProfile string `json:"apparmor_profile,omitempty"` + + // ProcessLabel specifies the label to apply to the process running in the container. It is + // commonly used by selinux + ProcessLabel string `json:"process_label,omitempty"` + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []Rlimit `json:"rlimits,omitempty"` + + // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores + // for a process. Valid values are between the range [-1000, '1000'], where processes with + // higher scores are preferred for being killed. + // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ + OomScoreAdj int `json:"oom_score_adj"` + + // UidMappings is an array of User ID mappings for User Namespaces + UidMappings []IDMap `json:"uid_mappings"` + + // GidMappings is an array of Group ID mappings for User Namespaces + GidMappings []IDMap `json:"gid_mappings"` + + // MaskPaths specifies paths within the container's rootfs to mask over with a bind + // mount pointing to /dev/null as to prevent reads of the file. + MaskPaths []string `json:"mask_paths"` + + // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only + // so that these files prevent any writes. + ReadonlyPaths []string `json:"readonly_paths"` + + // Sysctl is a map of properties and their values. It is the equivalent of using + // sysctl -w my.property.name value in Linux. + Sysctl map[string]string `json:"sysctl"` + + // Seccomp allows actions to be taken whenever a syscall is made within the container. + // A number of rules are given, each having an action to be taken if a syscall matches it. + // A default action to be taken if no rules match is also given. + Seccomp *Seccomp `json:"seccomp"` + + // NoNewPrivileges controls whether processes in the container can gain additional privileges. + NoNewPrivileges bool `json:"no_new_privileges,omitempty"` + + // Hooks are a collection of actions to perform at various container lifecycle events. + // CommandHooks are serialized to JSON, but other hooks are not. + Hooks *Hooks + + // Version is the version of opencontainer specification that is supported. + Version string `json:"version"` + + // Labels are user defined metadata that is stored in the config and populated on the state + Labels []string `json:"labels"` + + // NoNewKeyring will not allocated a new session keyring for the container. It will use the + // callers keyring in this case. + NoNewKeyring bool `json:"no_new_keyring"` + + // Rootless specifies whether the container is a rootless container. + Rootless bool `json:"rootless"` +} + +type Hooks struct { + // Prestart commands are executed after the container namespaces are created, + // but before the user supplied command is executed from init. + Prestart []Hook + + // Poststart commands are executed after the container init process starts. + Poststart []Hook + + // Poststop commands are executed after the container init process exits. + Poststop []Hook +} + +type Capabilities struct { + // Bounding is the set of capabilities checked by the kernel. + Bounding []string + // Effective is the set of capabilities checked by the kernel. + Effective []string + // Inheritable is the capabilities preserved across execve. + Inheritable []string + // Permitted is the limiting superset for effective capabilities. + Permitted []string + // Ambient is the ambient set of capabilities that are kept. + Ambient []string +} + +func (hooks *Hooks) UnmarshalJSON(b []byte) error { + var state struct { + Prestart []CommandHook + Poststart []CommandHook + Poststop []CommandHook + } + + if err := json.Unmarshal(b, &state); err != nil { + return err + } + + deserialize := func(shooks []CommandHook) (hooks []Hook) { + for _, shook := range shooks { + hooks = append(hooks, shook) + } + + return hooks + } + + hooks.Prestart = deserialize(state.Prestart) + hooks.Poststart = deserialize(state.Poststart) + hooks.Poststop = deserialize(state.Poststop) + return nil +} + +func (hooks Hooks) MarshalJSON() ([]byte, error) { + serialize := func(hooks []Hook) (serializableHooks []CommandHook) { + for _, hook := range hooks { + switch chook := hook.(type) { + case CommandHook: + serializableHooks = append(serializableHooks, chook) + default: + logrus.Warnf("cannot serialize hook of type %T, skipping", hook) + } + } + + return serializableHooks + } + + return json.Marshal(map[string]interface{}{ + "prestart": serialize(hooks.Prestart), + "poststart": serialize(hooks.Poststart), + "poststop": serialize(hooks.Poststop), + }) +} + +// HookState is the payload provided to a hook on execution. +type HookState specs.State + +type Hook interface { + // Run executes the hook with the provided state. + Run(HookState) error +} + +// NewFunctionHook will call the provided function when the hook is run. +func NewFunctionHook(f func(HookState) error) FuncHook { + return FuncHook{ + run: f, + } +} + +type FuncHook struct { + run func(HookState) error +} + +func (f FuncHook) Run(s HookState) error { + return f.run(s) +} + +type Command struct { + Path string `json:"path"` + Args []string `json:"args"` + Env []string `json:"env"` + Dir string `json:"dir"` + Timeout *time.Duration `json:"timeout"` +} + +// NewCommandHook will execute the provided command when the hook is run. +func NewCommandHook(cmd Command) CommandHook { + return CommandHook{ + Command: cmd, + } +} + +type CommandHook struct { + Command +} + +func (c Command) Run(s HookState) error { + b, err := json.Marshal(s) + if err != nil { + return err + } + var stdout, stderr bytes.Buffer + cmd := exec.Cmd{ + Path: c.Path, + Args: c.Args, + Env: c.Env, + Stdin: bytes.NewReader(b), + Stdout: &stdout, + Stderr: &stderr, + } + if err := cmd.Start(); err != nil { + return err + } + errC := make(chan error, 1) + go func() { + err := cmd.Wait() + if err != nil { + err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) + } + errC <- err + }() + var timerCh <-chan time.Time + if c.Timeout != nil { + timer := time.NewTimer(*c.Timeout) + defer timer.Stop() + timerCh = timer.C + } + select { + case err := <-errC: + return err + case <-timerCh: + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go new file mode 100644 index 000000000..07da10804 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go @@ -0,0 +1,61 @@ +package configs + +import "fmt" + +// HostUID gets the translated uid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostUID(containerId int) (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.UidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.") + } + id, found := c.hostIDFromMapping(containerId, c.UidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.") + } + return id, nil + } + // Return unchanged id. + return containerId, nil +} + +// HostRootUID gets the root uid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootUID() (int, error) { + return c.HostUID(0) +} + +// HostGID gets the translated gid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostGID(containerId int) (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.GidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.") + } + id, found := c.hostIDFromMapping(containerId, c.GidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.") + } + return id, nil + } + // Return unchanged id. + return containerId, nil +} + +// HostRootGID gets the root gid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootGID() (int, error) { + return c.HostGID(0) +} + +// Utility function that gets a host ID for a container ID from user namespace map +// if that ID is present in the map. +func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { + for _, m := range uMap { + if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { + hostID := m.HostID + (containerID - m.ContainerID) + return hostID, true + } + } + return -1, false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go new file mode 100644 index 000000000..8701bb212 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go @@ -0,0 +1,57 @@ +package configs + +import ( + "fmt" + "os" +) + +const ( + Wildcard = -1 +) + +// TODO Windows: This can be factored out in the future + +type Device struct { + // Device type, block, char, etc. + Type rune `json:"type"` + + // Path to the device. + Path string `json:"path"` + + // Major is the device's major number. + Major int64 `json:"major"` + + // Minor is the device's minor number. + Minor int64 `json:"minor"` + + // Cgroup permissions format, rwm. + Permissions string `json:"permissions"` + + // FileMode permission bits for the device. + FileMode os.FileMode `json:"file_mode"` + + // Uid of the device. + Uid uint32 `json:"uid"` + + // Gid of the device. + Gid uint32 `json:"gid"` + + // Write the file to the allowed list + Allow bool `json:"allow"` +} + +func (d *Device) CgroupString() string { + return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions) +} + +func (d *Device) Mkdev() int { + return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12)) +} + +// deviceNumberString converts the device number to a string return result. +func deviceNumberString(number int64) string { + if number == Wildcard { + return "*" + } + return fmt.Sprint(number) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go new file mode 100644 index 000000000..4d348d217 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go @@ -0,0 +1,111 @@ +// +build linux freebsd + +package configs + +var ( + // DefaultSimpleDevices are devices that are to be both allowed and created. + DefaultSimpleDevices = []*Device{ + // /dev/null and zero + { + Path: "/dev/null", + Type: 'c', + Major: 1, + Minor: 3, + Permissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/zero", + Type: 'c', + Major: 1, + Minor: 5, + Permissions: "rwm", + FileMode: 0666, + }, + + { + Path: "/dev/full", + Type: 'c', + Major: 1, + Minor: 7, + Permissions: "rwm", + FileMode: 0666, + }, + + // consoles and ttys + { + Path: "/dev/tty", + Type: 'c', + Major: 5, + Minor: 0, + Permissions: "rwm", + FileMode: 0666, + }, + + // /dev/urandom,/dev/random + { + Path: "/dev/urandom", + Type: 'c', + Major: 1, + Minor: 9, + Permissions: "rwm", + FileMode: 0666, + }, + { + Path: "/dev/random", + Type: 'c', + Major: 1, + Minor: 8, + Permissions: "rwm", + FileMode: 0666, + }, + } + DefaultAllowedDevices = append([]*Device{ + // allow mknod for any device + { + Type: 'c', + Major: Wildcard, + Minor: Wildcard, + Permissions: "m", + }, + { + Type: 'b', + Major: Wildcard, + Minor: Wildcard, + Permissions: "m", + }, + + { + Path: "/dev/console", + Type: 'c', + Major: 5, + Minor: 1, + Permissions: "rwm", + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Path: "", + Type: 'c', + Major: 136, + Minor: Wildcard, + Permissions: "rwm", + }, + { + Path: "", + Type: 'c', + Major: 5, + Minor: 2, + Permissions: "rwm", + }, + + // tuntap + { + Path: "", + Type: 'c', + Major: 10, + Minor: 200, + Permissions: "rwm", + }, + }, DefaultSimpleDevices...) + DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...) +) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go new file mode 100644 index 000000000..d30216380 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go @@ -0,0 +1,9 @@ +package configs + +type HugepageLimit struct { + // which type of hugepage to limit. + Pagesize string `json:"page_size"` + + // usage limit for hugepage. + Limit uint64 `json:"limit"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go new file mode 100644 index 000000000..9a0395eaf --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go @@ -0,0 +1,14 @@ +package configs + +import ( + "fmt" +) + +type IfPrioMap struct { + Interface string `json:"interface"` + Priority int64 `json:"priority"` +} + +func (i *IfPrioMap) CgroupString() string { + return fmt.Sprintf("%s %d", i.Interface, i.Priority) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go new file mode 100644 index 000000000..670757ddb --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go @@ -0,0 +1,39 @@ +package configs + +const ( + // EXT_COPYUP is a directive to copy up the contents of a directory when + // a tmpfs is mounted over it. + EXT_COPYUP = 1 << iota +) + +type Mount struct { + // Source path for the mount. + Source string `json:"source"` + + // Destination path for the mount inside the container. + Destination string `json:"destination"` + + // Device the mount is for. + Device string `json:"device"` + + // Mount flags. + Flags int `json:"flags"` + + // Propagation Flags + PropagationFlags []int `json:"propagation_flags"` + + // Mount data applied to the mount. + Data string `json:"data"` + + // Relabel source if set, "z" indicates shared, "Z" indicates unshared. + Relabel string `json:"relabel"` + + // Extensions are additional flags that are specific to runc. + Extensions int `json:"extensions"` + + // Optional Command to be run before Source is mounted. + PremountCmds []Command `json:"premount_cmds"` + + // Optional Command to be run after Source is mounted. + PostmountCmds []Command `json:"postmount_cmds"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go new file mode 100644 index 000000000..a3329a31a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go @@ -0,0 +1,5 @@ +package configs + +type NamespaceType string + +type Namespaces []Namespace diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go new file mode 100644 index 000000000..5fc171a57 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go @@ -0,0 +1,122 @@ +package configs + +import ( + "fmt" + "os" + "sync" +) + +const ( + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" +) + +var ( + nsLock sync.Mutex + supportedNamespaces = make(map[NamespaceType]bool) +) + +// NsName converts the namespace type to its filename +func NsName(ns NamespaceType) string { + switch ns { + case NEWNET: + return "net" + case NEWNS: + return "mnt" + case NEWPID: + return "pid" + case NEWIPC: + return "ipc" + case NEWUSER: + return "user" + case NEWUTS: + return "uts" + } + return "" +} + +// IsNamespaceSupported returns whether a namespace is available or +// not +func IsNamespaceSupported(ns NamespaceType) bool { + nsLock.Lock() + defer nsLock.Unlock() + supported, ok := supportedNamespaces[ns] + if ok { + return supported + } + nsFile := NsName(ns) + // if the namespace type is unknown, just return false + if nsFile == "" { + return false + } + _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile)) + // a namespace is supported if it exists and we have permissions to read it + supported = err == nil + supportedNamespaces[ns] = supported + return supported +} + +func NamespaceTypes() []NamespaceType { + return []NamespaceType{ + NEWUSER, // Keep user NS always first, don't move it. + NEWIPC, + NEWUTS, + NEWNET, + NEWPID, + NEWNS, + } +} + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct { + Type NamespaceType `json:"type"` + Path string `json:"path"` +} + +func (n *Namespace) GetPath(pid int) string { + return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type)) +} + +func (n *Namespaces) Remove(t NamespaceType) bool { + i := n.index(t) + if i == -1 { + return false + } + *n = append((*n)[:i], (*n)[i+1:]...) + return true +} + +func (n *Namespaces) Add(t NamespaceType, path string) { + i := n.index(t) + if i == -1 { + *n = append(*n, Namespace{Type: t, Path: path}) + return + } + (*n)[i].Path = path +} + +func (n *Namespaces) index(t NamespaceType) int { + for i, ns := range *n { + if ns.Type == t { + return i + } + } + return -1 +} + +func (n *Namespaces) Contains(t NamespaceType) bool { + return n.index(t) != -1 +} + +func (n *Namespaces) PathOf(t NamespaceType) string { + i := n.index(t) + if i == -1 { + return "" + } + return (*n)[i].Path +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go new file mode 100644 index 000000000..4ce6813d2 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go @@ -0,0 +1,31 @@ +// +build linux + +package configs + +import "golang.org/x/sys/unix" + +func (n *Namespace) Syscall() int { + return namespaceInfo[n.Type] +} + +var namespaceInfo = map[NamespaceType]int{ + NEWNET: unix.CLONE_NEWNET, + NEWNS: unix.CLONE_NEWNS, + NEWUSER: unix.CLONE_NEWUSER, + NEWIPC: unix.CLONE_NEWIPC, + NEWUTS: unix.CLONE_NEWUTS, + NEWPID: unix.CLONE_NEWPID, +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + var flag int + for _, v := range *n { + if v.Path != "" { + continue + } + flag |= namespaceInfo[v.Type] + } + return uintptr(flag) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go new file mode 100644 index 000000000..5d9a5c81f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go @@ -0,0 +1,13 @@ +// +build !linux,!windows + +package configs + +func (n *Namespace) Syscall() int { + panic("No namespace syscall support") +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + panic("No namespace syscall support") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go new file mode 100644 index 000000000..19bf713de --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go @@ -0,0 +1,8 @@ +// +build !linux + +package configs + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct { +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go new file mode 100644 index 000000000..ccdb228e1 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go @@ -0,0 +1,72 @@ +package configs + +// Network defines configuration for a container's networking stack +// +// The network configuration can be omitted from a container causing the +// container to be setup with the host's networking stack +type Network struct { + // Type sets the networks type, commonly veth and loopback + Type string `json:"type"` + + // Name of the network interface + Name string `json:"name"` + + // The bridge to use. + Bridge string `json:"bridge"` + + // MacAddress contains the MAC address to set on the network interface + MacAddress string `json:"mac_address"` + + // Address contains the IPv4 and mask to set on the network interface + Address string `json:"address"` + + // Gateway sets the gateway address that is used as the default for the interface + Gateway string `json:"gateway"` + + // IPv6Address contains the IPv6 and mask to set on the network interface + IPv6Address string `json:"ipv6_address"` + + // IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface + IPv6Gateway string `json:"ipv6_gateway"` + + // Mtu sets the mtu value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + Mtu int `json:"mtu"` + + // TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + TxQueueLen int `json:"txqueuelen"` + + // HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the + // container. + HostInterfaceName string `json:"host_interface_name"` + + // HairpinMode specifies if hairpin NAT should be enabled on the virtual interface + // bridge port in the case of type veth + // Note: This is unsupported on some systems. + // Note: This does not apply to loopback interfaces. + HairpinMode bool `json:"hairpin_mode"` +} + +// Routes can be specified to create entries in the route table as the container is started +// +// All of destination, source, and gateway should be either IPv4 or IPv6. +// One of the three options must be present, and omitted entries will use their +// IP family default for the route table. For IPv4 for example, setting the +// gateway to 1.2.3.4 and the interface to eth0 will set up a standard +// destination of 0.0.0.0(or *) when viewed in the route table. +type Route struct { + // Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6 + Destination string `json:"destination"` + + // Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6 + Source string `json:"source"` + + // Sets the gateway. Accepts IPv4 and IPv6 + Gateway string `json:"gateway"` + + // The device to set this route up for, for example: eth0 + InterfaceName string `json:"interface_name"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go new file mode 100644 index 000000000..0cebfaf80 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go @@ -0,0 +1,117 @@ +package validate + +import ( + "fmt" + "os" + "reflect" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ( + geteuid = os.Geteuid + getegid = os.Getegid +) + +func (v *ConfigValidator) rootless(config *configs.Config) error { + if err := rootlessMappings(config); err != nil { + return err + } + if err := rootlessMount(config); err != nil { + return err + } + // Currently, cgroups cannot effectively be used in rootless containers. + // The new cgroup namespace doesn't really help us either because it doesn't + // have nice interactions with the user namespace (we're working with upstream + // to fix this). + if err := rootlessCgroup(config); err != nil { + return err + } + + // XXX: We currently can't verify the user config at all, because + // configs.Config doesn't store the user-related configs. So this + // has to be verified by setupUser() in init_linux.go. + + return nil +} + +func rootlessMappings(config *configs.Config) error { + rootuid, err := config.HostRootUID() + if err != nil { + return fmt.Errorf("failed to get root uid from uidMappings: %v", err) + } + if euid := geteuid(); euid != 0 { + if !config.Namespaces.Contains(configs.NEWUSER) { + return fmt.Errorf("rootless containers require user namespaces") + } + if rootuid != euid { + return fmt.Errorf("rootless containers cannot map container root to a different host user") + } + } + + rootgid, err := config.HostRootGID() + if err != nil { + return fmt.Errorf("failed to get root gid from gidMappings: %v", err) + } + + // Similar to the above test, we need to make sure that we aren't trying to + // map to a group ID that we don't have the right to be. + if rootgid != getegid() { + return fmt.Errorf("rootless containers cannot map container root to a different host group") + } + + // We can only map one user and group inside a container (our own). + if len(config.UidMappings) != 1 || config.UidMappings[0].Size != 1 { + return fmt.Errorf("rootless containers cannot map more than one user") + } + if len(config.GidMappings) != 1 || config.GidMappings[0].Size != 1 { + return fmt.Errorf("rootless containers cannot map more than one group") + } + + return nil +} + +// cgroup verifies that the user isn't trying to set any cgroup limits or paths. +func rootlessCgroup(config *configs.Config) error { + // Nothing set at all. + if config.Cgroups == nil || config.Cgroups.Resources == nil { + return nil + } + + // Used for comparing to the zero value. + left := reflect.ValueOf(*config.Cgroups.Resources) + right := reflect.Zero(left.Type()) + + // This is all we need to do, since specconv won't add cgroup options in + // rootless mode. + if !reflect.DeepEqual(left.Interface(), right.Interface()) { + return fmt.Errorf("cannot specify resource limits in rootless container") + } + + return nil +} + +// mount verifies that the user isn't trying to set up any mounts they don't have +// the rights to do. In addition, it makes sure that no mount has a `uid=` or +// `gid=` option that doesn't resolve to root. +func rootlessMount(config *configs.Config) error { + // XXX: We could whitelist allowed devices at this point, but I'm not + // convinced that's a good idea. The kernel is the best arbiter of + // access control. + + for _, mount := range config.Mounts { + // Check that the options list doesn't contain any uid= or gid= entries + // that don't resolve to root. + for _, opt := range strings.Split(mount.Data, ",") { + if strings.HasPrefix(opt, "uid=") && opt != "uid=0" { + return fmt.Errorf("cannot specify uid= mount options in rootless containers where argument isn't 0") + } + if strings.HasPrefix(opt, "gid=") && opt != "gid=0" { + return fmt.Errorf("cannot specify gid= mount options in rootless containers where argument isn't 0") + } + } + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go new file mode 100644 index 000000000..828434544 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -0,0 +1,195 @@ +package validate + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + selinux "github.com/opencontainers/selinux/go-selinux" +) + +type Validator interface { + Validate(*configs.Config) error +} + +func New() Validator { + return &ConfigValidator{} +} + +type ConfigValidator struct { +} + +func (v *ConfigValidator) Validate(config *configs.Config) error { + if err := v.rootfs(config); err != nil { + return err + } + if err := v.network(config); err != nil { + return err + } + if err := v.hostname(config); err != nil { + return err + } + if err := v.security(config); err != nil { + return err + } + if err := v.usernamespace(config); err != nil { + return err + } + if err := v.sysctl(config); err != nil { + return err + } + if config.Rootless { + if err := v.rootless(config); err != nil { + return err + } + } + return nil +} + +// rootfs validates if the rootfs is an absolute path and is not a symlink +// to the container's root filesystem. +func (v *ConfigValidator) rootfs(config *configs.Config) error { + if _, err := os.Stat(config.Rootfs); err != nil { + if os.IsNotExist(err) { + return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs) + } + return err + } + cleaned, err := filepath.Abs(config.Rootfs) + if err != nil { + return err + } + if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil { + return err + } + if filepath.Clean(config.Rootfs) != cleaned { + return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs) + } + return nil +} + +func (v *ConfigValidator) network(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWNET) { + if len(config.Networks) > 0 || len(config.Routes) > 0 { + return fmt.Errorf("unable to apply network settings without a private NET namespace") + } + } + return nil +} + +func (v *ConfigValidator) hostname(config *configs.Config) error { + if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) { + return fmt.Errorf("unable to set hostname without a private UTS namespace") + } + return nil +} + +func (v *ConfigValidator) security(config *configs.Config) error { + // restrict sys without mount namespace + if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) && + !config.Namespaces.Contains(configs.NEWNS) { + return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") + } + if config.ProcessLabel != "" && !selinux.GetEnabled() { + return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported") + } + + return nil +} + +func (v *ConfigValidator) usernamespace(config *configs.Config) error { + if config.Namespaces.Contains(configs.NEWUSER) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + return fmt.Errorf("USER namespaces aren't enabled in the kernel") + } + } else { + if config.UidMappings != nil || config.GidMappings != nil { + return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config") + } + } + return nil +} + +// sysctl validates that the specified sysctl keys are valid or not. +// /proc/sys isn't completely namespaced and depending on which namespaces +// are specified, a subset of sysctls are permitted. +func (v *ConfigValidator) sysctl(config *configs.Config) error { + validSysctlMap := map[string]bool{ + "kernel.msgmax": true, + "kernel.msgmnb": true, + "kernel.msgmni": true, + "kernel.sem": true, + "kernel.shmall": true, + "kernel.shmmax": true, + "kernel.shmmni": true, + "kernel.shm_rmid_forced": true, + } + + for s := range config.Sysctl { + if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") { + if config.Namespaces.Contains(configs.NEWIPC) { + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s) + } + } + if strings.HasPrefix(s, "net.") { + if config.Namespaces.Contains(configs.NEWNET) { + if path := config.Namespaces.PathOf(configs.NEWNET); path != "" { + if err := checkHostNs(s, path); err != nil { + return err + } + } + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) + } + } + return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) + } + + return nil +} + +func isSymbolicLink(path string) (bool, error) { + fi, err := os.Lstat(path) + if err != nil { + return false, err + } + + return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil +} + +// checkHostNs checks whether network sysctl is used in host namespace. +func checkHostNs(sysctlConfig string, path string) error { + var currentProcessNetns = "/proc/self/ns/net" + // readlink on the current processes network namespace + destOfCurrentProcess, err := os.Readlink(currentProcessNetns) + if err != nil { + return fmt.Errorf("read soft link %q error", currentProcessNetns) + } + + // First check if the provided path is a symbolic link + symLink, err := isSymbolicLink(path) + if err != nil { + return fmt.Errorf("could not check that %q is a symlink: %v", path, err) + } + + if symLink == false { + // The provided namespace is not a symbolic link, + // it is not the host namespace. + return nil + } + + // readlink on the path provided in the struct + destOfContainer, err := os.Readlink(path) + if err != nil { + return fmt.Errorf("read soft link %q error", path) + } + if destOfContainer == destOfCurrentProcess { + return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig) + } + return nil +} |