package configs import ( "bytes" "encoding/json" "fmt" "os/exec" "time" "github.com/sirupsen/logrus" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runtime-spec/specs-go" ) type Rlimit struct { Type int `json:"type"` Hard uint64 `json:"hard"` Soft uint64 `json:"soft"` } // IDMap represents UID/GID Mappings for User Namespaces. type IDMap struct { ContainerID int `json:"container_id"` HostID int `json:"host_id"` Size int `json:"size"` } // Seccomp represents syscall restrictions // By default, only the native architecture of the kernel is allowed to be used // for syscalls. Additional architectures can be added by specifying them in // Architectures. type Seccomp struct { DefaultAction Action `json:"default_action"` Architectures []string `json:"architectures"` Syscalls []*Syscall `json:"syscalls"` DefaultErrnoRet *uint `json:"default_errno_ret"` ListenerPath string `json:"listener_path,omitempty"` ListenerMetadata string `json:"listener_metadata,omitempty"` } // Action is taken upon rule match in Seccomp type Action int const ( Kill Action = iota + 1 Errno Trap Allow Trace Log Notify KillThread KillProcess ) // Operator is a comparison operator to be used when matching syscall arguments in Seccomp type Operator int const ( EqualTo Operator = iota + 1 NotEqualTo GreaterThan GreaterThanOrEqualTo LessThan LessThanOrEqualTo MaskEqualTo ) // Arg is a rule to match a specific syscall argument in Seccomp type Arg struct { Index uint `json:"index"` Value uint64 `json:"value"` ValueTwo uint64 `json:"value_two"` Op Operator `json:"op"` } // Syscall is a rule to match a syscall in Seccomp type Syscall struct { Name string `json:"name"` Action Action `json:"action"` ErrnoRet *uint `json:"errnoRet"` Args []*Arg `json:"args"` } // TODO Windows. Many of these fields should be factored out into those parts // which are common across platforms, and those which are platform specific. // Config defines configuration options for executing a process inside a contained environment. type Config struct { // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs // This is a common option when the container is running in ramdisk NoPivotRoot bool `json:"no_pivot_root"` // ParentDeathSignal specifies the signal that is sent to the container's process in the case // that the parent process dies. ParentDeathSignal int `json:"parent_death_signal"` // Path to a directory containing the container's root filesystem. Rootfs string `json:"rootfs"` // Umask is the umask to use inside of the container. Umask *uint32 `json:"umask"` // Readonlyfs will remount the container's rootfs as readonly where only externally mounted // bind mounts are writtable. Readonlyfs bool `json:"readonlyfs"` // Specifies the mount propagation flags to be applied to /. RootPropagation int `json:"rootPropagation"` // Mounts specify additional source and destination paths that will be mounted inside the container's // rootfs and mount namespace if specified Mounts []*Mount `json:"mounts"` // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! Devices []*devices.Device `json:"devices"` MountLabel string `json:"mount_label"` // Hostname optionally sets the container's hostname if provided Hostname string `json:"hostname"` // Namespaces specifies the container's namespaces that it should setup when cloning the init process // If a namespace is not provided that namespace is shared from the container's parent process Namespaces Namespaces `json:"namespaces"` // Capabilities specify the capabilities to keep when executing the process inside the container // All capabilities not specified will be dropped from the processes capability mask Capabilities *Capabilities `json:"capabilities"` // Networks specifies the container's network setup to be created Networks []*Network `json:"networks"` // Routes can be specified to create entries in the route table as the container is started Routes []*Route `json:"routes"` // Cgroups specifies specific cgroup settings for the various subsystems that the container is // placed into to limit the resources the container has available Cgroups *Cgroup `json:"cgroups"` // AppArmorProfile specifies the profile to apply to the process running in the container and is // change at the time the process is execed AppArmorProfile string `json:"apparmor_profile,omitempty"` // ProcessLabel specifies the label to apply to the process running in the container. It is // commonly used by selinux ProcessLabel string `json:"process_label,omitempty"` // Rlimits specifies the resource limits, such as max open files, to set in the container // If Rlimits are not set, the container will inherit rlimits from the parent process Rlimits []Rlimit `json:"rlimits,omitempty"` // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores // for a process. Valid values are between the range [-1000, '1000'], where processes with // higher scores are preferred for being killed. If it is unset then we don't touch the current // value. // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ OomScoreAdj *int `json:"oom_score_adj,omitempty"` // UidMappings is an array of User ID mappings for User Namespaces UidMappings []IDMap `json:"uid_mappings"` // GidMappings is an array of Group ID mappings for User Namespaces GidMappings []IDMap `json:"gid_mappings"` // MaskPaths specifies paths within the container's rootfs to mask over with a bind // mount pointing to /dev/null as to prevent reads of the file. MaskPaths []string `json:"mask_paths"` // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only // so that these files prevent any writes. ReadonlyPaths []string `json:"readonly_paths"` // Sysctl is a map of properties and their values. It is the equivalent of using // sysctl -w my.property.name value in Linux. Sysctl map[string]string `json:"sysctl"` // Seccomp allows actions to be taken whenever a syscall is made within the container. // A number of rules are given, each having an action to be taken if a syscall matches it. // A default action to be taken if no rules match is also given. Seccomp *Seccomp `json:"seccomp"` // NoNewPrivileges controls whether processes in the container can gain additional privileges. NoNewPrivileges bool `json:"no_new_privileges,omitempty"` // Hooks are a collection of actions to perform at various container lifecycle events. // CommandHooks are serialized to JSON, but other hooks are not. Hooks Hooks // Version is the version of opencontainer specification that is supported. Version string `json:"version"` // Labels are user defined metadata that is stored in the config and populated on the state Labels []string `json:"labels"` // NoNewKeyring will not allocated a new session keyring for the container. It will use the // callers keyring in this case. NoNewKeyring bool `json:"no_new_keyring"` // IntelRdt specifies settings for Intel RDT group that the container is placed into // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` // RootlessEUID is set when the runc was launched with non-zero EUID. // Note that RootlessEUID is set to false when launched with EUID=0 in userns. // When RootlessEUID is set, runc creates a new userns for the container. // (config.json needs to contain userns settings) RootlessEUID bool `json:"rootless_euid,omitempty"` // RootlessCgroups is set when unlikely to have the full access to cgroups. // When RootlessCgroups is set, cgroups errors are ignored. RootlessCgroups bool `json:"rootless_cgroups,omitempty"` } type ( HookName string HookList []Hook Hooks map[HookName]HookList ) const ( // Prestart commands are executed after the container namespaces are created, // but before the user supplied command is executed from init. // Note: This hook is now deprecated // Prestart commands are called in the Runtime namespace. Prestart HookName = "prestart" // CreateRuntime commands MUST be called as part of the create operation after // the runtime environment has been created but before the pivot_root has been executed. // CreateRuntime is called immediately after the deprecated Prestart hook. // CreateRuntime commands are called in the Runtime Namespace. CreateRuntime HookName = "createRuntime" // CreateContainer commands MUST be called as part of the create operation after // the runtime environment has been created but before the pivot_root has been executed. // CreateContainer commands are called in the Container namespace. CreateContainer HookName = "createContainer" // StartContainer commands MUST be called as part of the start operation and before // the container process is started. // StartContainer commands are called in the Container namespace. StartContainer HookName = "startContainer" // Poststart commands are executed after the container init process starts. // Poststart commands are called in the Runtime Namespace. Poststart HookName = "poststart" // Poststop commands are executed after the container init process exits. // Poststop commands are called in the Runtime Namespace. Poststop HookName = "poststop" ) // KnownHookNames returns the known hook names. // Used by `runc features`. func KnownHookNames() []string { return []string{ string(Prestart), // deprecated string(CreateRuntime), string(CreateContainer), string(StartContainer), string(Poststart), string(Poststop), } } type Capabilities struct { // Bounding is the set of capabilities checked by the kernel. Bounding []string // Effective is the set of capabilities checked by the kernel. Effective []string // Inheritable is the capabilities preserved across execve. Inheritable []string // Permitted is the limiting superset for effective capabilities. Permitted []string // Ambient is the ambient set of capabilities that are kept. Ambient []string } func (hooks HookList) RunHooks(state *specs.State) error { for i, h := range hooks { if err := h.Run(state); err != nil { return fmt.Errorf("error running hook #%d: %w", i, err) } } return nil } func (hooks *Hooks) UnmarshalJSON(b []byte) error { var state map[HookName][]CommandHook if err := json.Unmarshal(b, &state); err != nil { return err } *hooks = Hooks{} for n, commandHooks := range state { if len(commandHooks) == 0 { continue } (*hooks)[n] = HookList{} for _, h := range commandHooks { (*hooks)[n] = append((*hooks)[n], h) } } return nil } func (hooks *Hooks) MarshalJSON() ([]byte, error) { serialize := func(hooks []Hook) (serializableHooks []CommandHook) { for _, hook := range hooks { switch chook := hook.(type) { case CommandHook: serializableHooks = append(serializableHooks, chook) default: logrus.Warnf("cannot serialize hook of type %T, skipping", hook) } } return serializableHooks } return json.Marshal(map[string]interface{}{ "prestart": serialize((*hooks)[Prestart]), "createRuntime": serialize((*hooks)[CreateRuntime]), "createContainer": serialize((*hooks)[CreateContainer]), "startContainer": serialize((*hooks)[StartContainer]), "poststart": serialize((*hooks)[Poststart]), "poststop": serialize((*hooks)[Poststop]), }) } type Hook interface { // Run executes the hook with the provided state. Run(*specs.State) error } // NewFunctionHook will call the provided function when the hook is run. func NewFunctionHook(f func(*specs.State) error) FuncHook { return FuncHook{ run: f, } } type FuncHook struct { run func(*specs.State) error } func (f FuncHook) Run(s *specs.State) error { return f.run(s) } type Command struct { Path string `json:"path"` Args []string `json:"args"` Env []string `json:"env"` Dir string `json:"dir"` Timeout *time.Duration `json:"timeout"` } // NewCommandHook will execute the provided command when the hook is run. func NewCommandHook(cmd Command) CommandHook { return CommandHook{ Command: cmd, } } type CommandHook struct { Command } func (c Command) Run(s *specs.State) error { b, err := json.Marshal(s) if err != nil { return err } var stdout, stderr bytes.Buffer cmd := exec.Cmd{ Path: c.Path, Args: c.Args, Env: c.Env, Stdin: bytes.NewReader(b), Stdout: &stdout, Stderr: &stderr, } if err := cmd.Start(); err != nil { return err } errC := make(chan error, 1) go func() { err := cmd.Wait() if err != nil { err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) } errC <- err }() var timerCh <-chan time.Time if c.Timeout != nil { timer := time.NewTimer(*c.Timeout) defer timer.Stop() timerCh = timer.C } select { case err := <-errC: return err case <-timerCh: _ = cmd.Process.Kill() <-errC return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) } }