aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/opencontainers/runc/libcontainer/process_linux.go')
-rw-r--r-- vendor/github.com/opencontainers/runc/libcontainer/process_linux.go | 493
1 files changed, 493 insertions, 0 deletions
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go
new file mode 100644
index 000000000..171685ccd
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go
@@ -0,0 +1,493 @@
+// +build linux
+
+package libcontainer
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "io"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strconv"
+ "syscall" // only for Signal
+
+ "github.com/opencontainers/runc/libcontainer/cgroups"
+ "github.com/opencontainers/runc/libcontainer/configs"
+ "github.com/opencontainers/runc/libcontainer/system"
+ "github.com/opencontainers/runc/libcontainer/utils"
+
+ "golang.org/x/sys/unix"
+)
+
+// parentProcess is the set of operations the parent side needs on a process
+// it has launched. Both setnsProcess and initProcess in this file implement it.
+type parentProcess interface {
+ // pid returns the pid for the running process.
+ pid() int
+
+ // start starts the process execution.
+ start() error
+
+ // terminate sends a SIGKILL to the process and waits for it to exit.
+ terminate() error
+
+ // wait waits on the process returning the process state.
+ wait() (*os.ProcessState, error)
+
+ // startTime returns the process start time.
+ startTime() (uint64, error)
+
+ // signal sends the given signal to the process.
+ signal(os.Signal) error
+
+ // externalDescriptors returns the descriptor names previously stored with
+ // setExternalDescriptors.
+ externalDescriptors() []string
+
+ // setExternalDescriptors replaces the stored descriptor names with fds.
+ setExternalDescriptors(fds []string)
+}
+
+// setnsProcess is the parentProcess implementation whose start method runs a
+// setns helper command and then adopts the child that helper reports.
+type setnsProcess struct {
+ // cmd is the helper command; execSetns repoints cmd.Process at the child
+ // whose pid is read from parentPipe.
+ cmd *exec.Cmd
+ // parentPipe carries bootstrap data and the JSON config to the child and
+ // the pid and sync messages back; start closes it on return.
+ parentPipe *os.File
+ // childPipe is closed by start right after cmd.Start is attempted.
+ childPipe *os.File
+ // cgroupPaths is passed to cgroups.EnterPid for non-rootless containers.
+ cgroupPaths map[string]string
+ // config is written to parentPipe as JSON; its Rootless and Rlimits fields
+ // are also consulted by start.
+ config *initConfig
+ // fds is the value exposed through externalDescriptors.
+ fds []string
+ // process has its ops field set to this setnsProcess by execSetns.
+ process *Process
+ // bootstrapData, when non-nil, is copied to parentPipe before anything else.
+ bootstrapData io.Reader
+}
+
+// startTime reports the StartTime field of the stat data that system.Stat
+// returns for this process's current pid, along with any error from Stat.
+func (p *setnsProcess) startTime() (uint64, error) {
+	procStat, statErr := system.Stat(p.pid())
+	return procStat.StartTime, statErr
+}
+
+// signal delivers sig to the process with unix.Kill. Only syscall.Signal
+// values are accepted; any other os.Signal implementation is rejected with an
+// error.
+func (p *setnsProcess) signal(sig os.Signal) error {
+	if unixSig, isUnix := sig.(syscall.Signal); isUnix {
+		return unix.Kill(p.pid(), unixSig)
+	}
+	return errors.New("os: unsupported signal type")
+}
+
+// start runs the setns helper command and drives the parent side of its
+// handshake over parentPipe: it streams any bootstrap data, waits for the
+// helper to report the real child's pid (execSetns), joins the configured
+// cgroups unless the container is rootless, applies rlimits, and sends the
+// config as JSON. It then reads sync messages until parseSync returns and
+// shuts down the write side of the pipe. parentPipe is closed on every return.
+func (p *setnsProcess) start() (err error) {
+ defer p.parentPipe.Close()
+ err = p.cmd.Start()
+ // childPipe is closed whether or not Start succeeded.
+ p.childPipe.Close()
+ if err != nil {
+ return newSystemErrorWithCause(err, "starting setns process")
+ }
+ if p.bootstrapData != nil {
+ if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
+ return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
+ }
+ }
+ // After execSetns succeeds, p.pid() refers to the child reported over the
+ // pipe rather than to the helper that was started above.
+ if err = p.execSetns(); err != nil {
+ return newSystemErrorWithCause(err, "executing setns process")
+ }
+ // We can't join cgroups if we're in a rootless container.
+ if !p.config.Rootless && len(p.cgroupPaths) > 0 {
+ if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
+ return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
+ }
+ }
+ // set rlimits, this has to be done here because we lose permissions
+ // to raise the limits once we enter a user-namespace
+ if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
+ return newSystemErrorWithCause(err, "setting rlimits for process")
+ }
+ if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
+ return newSystemErrorWithCause(err, "writing config to pipe")
+ }
+
+ // No sync message is expected on the setns path: procReady and procHooks
+ // panic, and any other type is reported as an invalid payload.
+ ierr := parseSync(p.parentPipe, func(sync *syncT) error {
+ switch sync.Type {
+ case procReady:
+ // This shouldn't happen.
+ panic("unexpected procReady in setns")
+ case procHooks:
+ // This shouldn't happen.
+ panic("unexpected procHooks in setns")
+ default:
+ return newSystemError(fmt.Errorf("invalid JSON payload from child"))
+ }
+ })
+
+ // Shut down the write direction of the pipe. Note that a Shutdown failure
+ // is returned in preference to ierr, and the child is not waited on then.
+ if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
+ return newSystemErrorWithCause(err, "calling shutdown on init pipe")
+ }
+ // Must be done after Shutdown so the child will exit and we can wait for it.
+ if ierr != nil {
+ p.wait()
+ return ierr
+ }
+ return nil
+}
+
+// execSetns waits for the helper started through cmd to exit and then adopts
+// the child it forked. The helper performs the setns calls in C before the Go
+// runtime boots, so the real child's pid only becomes known when it arrives,
+// JSON-encoded, on parentPipe. On success cmd.Process refers to that child and
+// p is installed as the ops of the owning Process.
+func (p *setnsProcess) execSetns() error {
+	helperState, waitErr := p.cmd.Process.Wait()
+	switch {
+	case waitErr != nil:
+		p.cmd.Wait()
+		return newSystemErrorWithCause(waitErr, "waiting on setns process to finish")
+	case !helperState.Success():
+		p.cmd.Wait()
+		return newSystemError(&exec.ExitError{ProcessState: helperState})
+	}
+
+	// Read the pid of the forked child from the pipe.
+	var child *pid
+	if decodeErr := json.NewDecoder(p.parentPipe).Decode(&child); decodeErr != nil {
+		p.cmd.Wait()
+		return newSystemErrorWithCause(decodeErr, "reading pid from init pipe")
+	}
+
+	childProc, findErr := os.FindProcess(child.Pid)
+	if findErr != nil {
+		return findErr
+	}
+	p.cmd.Process = childProc
+	p.process.ops = p
+	return nil
+}
+
+// terminate kills the process created by the setns routine and then waits on
+// it so that it does not linger as a zombie. It is a no-op when no process is
+// attached to cmd. A Kill error takes precedence over a wait error.
+func (p *setnsProcess) terminate() error {
+	proc := p.cmd.Process
+	if proc == nil {
+		return nil
+	}
+	killErr := proc.Kill()
+	_, waitErr := p.wait()
+	if killErr != nil {
+		return killErr
+	}
+	return waitErr
+}
+
+// wait blocks in cmd.Wait and returns cmd.ProcessState together with the
+// error from Wait. The state is returned even when Wait reports an error.
+func (p *setnsProcess) wait() (*os.ProcessState, error) {
+	waitErr := p.cmd.Wait()
+	state := p.cmd.ProcessState
+	return state, waitErr
+}
+
+// pid returns the pid of the process currently held in cmd.Process.
+func (p *setnsProcess) pid() int {
+	proc := p.cmd.Process
+	return proc.Pid
+}
+
+// externalDescriptors returns the descriptor names stored on the process.
+func (p *setnsProcess) externalDescriptors() []string {
+	stored := p.fds
+	return stored
+}
+
+// setExternalDescriptors replaces the stored descriptor names with newFds.
+// The slice is stored as given, not copied.
+func (p *setnsProcess) setExternalDescriptors(newFds []string) {
+ p.fds = newFds
+}
+
+// initProcess is the parentProcess implementation whose start method launches
+// a command, applies cgroups and networking, and synchronises with the child
+// over parentPipe.
+type initProcess struct {
+ // cmd is the command started by start; execSetns repoints cmd.Process at
+ // the child whose pid is read from parentPipe.
+ cmd *exec.Cmd
+ // parentPipe carries bootstrap data, the JSON config and sync messages;
+ // start closes it on return.
+ parentPipe *os.File
+ // childPipe is closed by start right after cmd.Start is attempted.
+ childPipe *os.File
+ // config is sent to the child by sendConfig and consulted for rlimits,
+ // namespaces, hooks and networks.
+ config *initConfig
+ // manager applies, sets and destroys the cgroup configuration.
+ manager cgroups.Manager
+ // container supplies the id and config version used in hook state.
+ container *linuxContainer
+ // fds is the value exposed through externalDescriptors.
+ fds []string
+ // process has its ops field set to this initProcess by start and execSetns.
+ process *Process
+ // bootstrapData is copied to parentPipe once the command has started.
+ bootstrapData io.Reader
+ // sharePidns makes wait send SIGKILL to all processes via
+ // signalAllProcesses after a successful cmd.Wait.
+ sharePidns bool
+ // rootDir is closed by start right after cmd.Start is attempted.
+ rootDir *os.File
+}
+
+// pid returns the pid of the process currently held in cmd.Process.
+func (p *initProcess) pid() int {
+	proc := p.cmd.Process
+	return proc.Pid
+}
+
+// externalDescriptors returns the descriptor names stored on the process.
+func (p *initProcess) externalDescriptors() []string {
+	stored := p.fds
+	return stored
+}
+
+// execSetns waits for the process started through cmd to exit and then adopts
+// the child it forked. The setns calls are made from C before the Go runtime
+// boots, so the real child's pid only becomes known when it arrives,
+// JSON-encoded, on parentPipe. On success cmd.Process refers to that child and
+// p is installed as the ops of the owning Process.
+// It is called from initProcess.start.
+func (p *initProcess) execSetns() error {
+	// reap calls cmd.Wait (discarding its result) and hands err back unchanged.
+	reap := func(err error) error {
+		p.cmd.Wait()
+		return err
+	}
+
+	helperState, waitErr := p.cmd.Process.Wait()
+	if waitErr != nil {
+		return reap(waitErr)
+	}
+	if !helperState.Success() {
+		return reap(&exec.ExitError{ProcessState: helperState})
+	}
+
+	// Read the pid of the forked child from the pipe.
+	var child *pid
+	if decodeErr := json.NewDecoder(p.parentPipe).Decode(&child); decodeErr != nil {
+		return reap(decodeErr)
+	}
+
+	childProc, findErr := os.FindProcess(child.Pid)
+	if findErr != nil {
+		return findErr
+	}
+	p.cmd.Process = childProc
+	p.process.ops = p
+	return nil
+}
+
+// start launches the init command and drives the parent side of the bootstrap
+// handshake over parentPipe. In order it:
+//
+//   - starts cmd, then closes childPipe and rootDir;
+//   - copies bootstrapData to the pipe and adopts the real child (execSetns);
+//   - records the targets of the child's first three fds;
+//   - applies the cgroup manager to the child's pid;
+//   - creates network interfaces and sends the config;
+//   - services procReady / procHooks sync messages, running prestart hooks
+//     and answering with procRun / procResume.
+//
+// An error is returned if 'run' was never sent, or if a mount namespace is
+// configured and 'resume' was never sent. parentPipe is closed on every return.
+//
+// The result is named so that the deferred cgroup cleanup sees every failure
+// after manager.Apply. Testing the local err instead would never fire, because
+// later failures are returned through shadowed variables or ierr.
+func (p *initProcess) start() (retErr error) {
+	defer p.parentPipe.Close()
+	err := p.cmd.Start()
+	p.process.ops = p
+	p.childPipe.Close()
+	p.rootDir.Close()
+	if err != nil {
+		p.process.ops = nil
+		return newSystemErrorWithCause(err, "starting init process command")
+	}
+	if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
+		return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
+	}
+	if err := p.execSetns(); err != nil {
+		return newSystemErrorWithCause(err, "running exec setns process for init")
+	}
+	// Save the standard descriptor names before the container process
+	// can potentially move them (e.g., via dup2()). If we don't do this now,
+	// we won't know at checkpoint time which file descriptor to look up.
+	fds, err := getPipeFds(p.pid())
+	if err != nil {
+		return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
+	}
+	p.setExternalDescriptors(fds)
+	// Do this before syncing with child so that no children can escape the
+	// cgroup. We don't need to worry about not doing this and not being root
+	// because we'd be using the rootless cgroup manager in that case.
+	if err := p.manager.Apply(p.pid()); err != nil {
+		return newSystemErrorWithCause(err, "applying cgroup configuration for process")
+	}
+	defer func() {
+		// retErr is the value actually being returned, so this runs for any
+		// failure from here on.
+		if retErr != nil {
+			// TODO: should not be the responsibility to call here
+			p.manager.Destroy()
+		}
+	}()
+	if err := p.createNetworkInterfaces(); err != nil {
+		return newSystemErrorWithCause(err, "creating network interfaces")
+	}
+	if err := p.sendConfig(); err != nil {
+		return newSystemErrorWithCause(err, "sending config to init process")
+	}
+	var (
+		sentRun    bool
+		sentResume bool
+	)
+
+	ierr := parseSync(p.parentPipe, func(sync *syncT) error {
+		switch sync.Type {
+		case procReady:
+			// set rlimits, this has to be done here because we lose permissions
+			// to raise the limits once we enter a user-namespace
+			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
+				return newSystemErrorWithCause(err, "setting rlimits for ready process")
+			}
+			// call prestart hooks
+			if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
+				// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
+				if err := p.manager.Set(p.config.Config); err != nil {
+					return newSystemErrorWithCause(err, "setting cgroup config for ready process")
+				}
+
+				if p.config.Config.Hooks != nil {
+					s := configs.HookState{
+						Version: p.container.config.Version,
+						ID:      p.container.id,
+						Pid:     p.pid(),
+						Bundle:  utils.SearchLabels(p.config.Config.Labels, "bundle"),
+					}
+					for i, hook := range p.config.Config.Hooks.Prestart {
+						if err := hook.Run(s); err != nil {
+							return newSystemErrorWithCausef(err, "running prestart hook %d", i)
+						}
+					}
+				}
+			}
+			// Sync with child.
+			if err := writeSync(p.parentPipe, procRun); err != nil {
+				return newSystemErrorWithCause(err, "writing syncT 'run'")
+			}
+			sentRun = true
+		case procHooks:
+			// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
+			if err := p.manager.Set(p.config.Config); err != nil {
+				return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
+			}
+			if p.config.Config.Hooks != nil {
+				s := configs.HookState{
+					Version: p.container.config.Version,
+					ID:      p.container.id,
+					Pid:     p.pid(),
+					Bundle:  utils.SearchLabels(p.config.Config.Labels, "bundle"),
+				}
+				for i, hook := range p.config.Config.Hooks.Prestart {
+					if err := hook.Run(s); err != nil {
+						return newSystemErrorWithCausef(err, "running prestart hook %d", i)
+					}
+				}
+			}
+			// Sync with child.
+			if err := writeSync(p.parentPipe, procResume); err != nil {
+				return newSystemErrorWithCause(err, "writing syncT 'resume'")
+			}
+			sentResume = true
+		default:
+			return newSystemError(fmt.Errorf("invalid JSON payload from child"))
+		}
+
+		return nil
+	})
+
+	if !sentRun {
+		return newSystemErrorWithCause(ierr, "container init")
+	}
+	if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
+		return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
+	}
+	if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
+		return newSystemErrorWithCause(err, "shutting down init pipe")
+	}
+
+	// Must be done after Shutdown so the child will exit and we can wait for it.
+	if ierr != nil {
+		p.wait()
+		return ierr
+	}
+	return nil
+}
+
+// wait blocks in cmd.Wait and returns cmd.ProcessState together with the
+// error from Wait. When Wait succeeds and sharePidns is set, every process
+// managed by the cgroup manager is sent SIGKILL first. With a shared (host)
+// PID namespace, the remaining processes must be killed once init has died.
+func (p *initProcess) wait() (*os.ProcessState, error) {
+	waitErr := p.cmd.Wait()
+	if waitErr == nil && p.sharePidns {
+		signalAllProcesses(p.manager, unix.SIGKILL)
+	}
+	return p.cmd.ProcessState, waitErr
+}
+
+// terminate kills the process attached to cmd and then waits on it. It is a
+// no-op when no process is attached. A Kill error takes precedence over a
+// wait error.
+func (p *initProcess) terminate() error {
+	proc := p.cmd.Process
+	if proc == nil {
+		return nil
+	}
+	killErr := proc.Kill()
+	_, waitErr := p.wait()
+	if killErr != nil {
+		return killErr
+	}
+	return waitErr
+}
+
+// startTime reports the StartTime field of the stat data that system.Stat
+// returns for this process's current pid, along with any error from Stat.
+func (p *initProcess) startTime() (uint64, error) {
+	procStat, statErr := system.Stat(p.pid())
+	return procStat.StartTime, statErr
+}
+
+// sendConfig writes the init configuration to parentPipe for the container's
+// init process. utils.WriteJSON is used rather than a JSON Encoder because the
+// JSON decoder can have problems in some cases; see
+// https://github.com/docker/docker/issues/14203#issuecomment-174177790
+func (p *initProcess) sendConfig() error {
+	pipe, cfg := p.parentPipe, p.config
+	return utils.WriteJSON(pipe, cfg)
+}
+
+// createNetworkInterfaces walks the networks listed in the container config.
+// For each one it looks up the strategy for the network type, asks it to
+// create the interface for this process's pid, and appends the resulting
+// network to config.Networks. It stops at the first error.
+func (p *initProcess) createNetworkInterfaces() error {
+	for _, netCfg := range p.config.Config.Networks {
+		strategy, err := getStrategy(netCfg.Type)
+		if err != nil {
+			return err
+		}
+		created := &network{Network: *netCfg}
+		if err = strategy.create(created, p.pid()); err != nil {
+			return err
+		}
+		p.config.Networks = append(p.config.Networks, created)
+	}
+	return nil
+}
+
+// signal delivers sig to the process with unix.Kill. Only syscall.Signal
+// values are accepted; any other os.Signal implementation is rejected with an
+// error.
+func (p *initProcess) signal(sig os.Signal) error {
+	unixSig, isUnix := sig.(syscall.Signal)
+	switch {
+	case isUnix:
+		return unix.Kill(p.pid(), unixSig)
+	default:
+		return errors.New("os: unsupported signal type")
+	}
+}
+
+// setExternalDescriptors replaces the stored descriptor names with newFds.
+// The slice is stored as given, not copied.
+func (p *initProcess) setExternalDescriptors(newFds []string) {
+ p.fds = newFds
+}
+
+// getPipeFds resolves the symlinks /proc/<pid>/fd/0, 1 and 2 and returns
+// their targets in a three-element slice indexed by descriptor number.
+//
+// A permission error on an individual link is ignored and leaves that slot
+// empty (rootless containers and other non-dumpable processes hit this). Any
+// other error aborts the scan and is returned together with whatever has been
+// filled in so far.
+func getPipeFds(pid int) ([]string, error) {
+	targets := make([]string, 3)
+	fdDir := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
+
+	for fd := range targets {
+		// XXX: This breaks if the path is not a valid symlink (which can
+		// happen in certain particularly unlucky mount namespace setups).
+		link, err := os.Readlink(filepath.Join(fdDir, strconv.Itoa(fd)))
+		switch {
+		case err == nil:
+			targets[fd] = link
+		case os.IsPermission(err):
+			// Nothing useful can be done for this descriptor; leave it blank.
+		default:
+			return targets, err
+		}
+	}
+	return targets, nil
+}
+
+// InitializeIO creates one pipe each for stdin, stdout and stderr. The end the
+// process uses is stored on p (read end for Stdin, write ends for Stdout and
+// Stderr) and the opposite end is stored on the returned IO. All six
+// descriptors are then chowned to rootuid:rootgid in case we are in a user
+// namespace. If any step fails, every descriptor opened so far is closed and
+// a nil IO is returned with the error.
+//
+// Do not use this if you want to have a pseudoterminal set up for you by
+// libcontainer (TODO: fix that too).
+// TODO: This is mostly unnecessary, and should be handled by clients.
+func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
+	var (
+		opened []uintptr
+		r, w   *os.File
+	)
+	i = &IO{}
+	// On failure, release every descriptor collected up to that point.
+	defer func() {
+		if err == nil {
+			return
+		}
+		for _, fd := range opened {
+			unix.Close(int(fd))
+		}
+	}()
+
+	// STDIN
+	if r, w, err = os.Pipe(); err != nil {
+		return nil, err
+	}
+	opened = append(opened, r.Fd(), w.Fd())
+	p.Stdin, i.Stdin = r, w
+
+	// STDOUT
+	if r, w, err = os.Pipe(); err != nil {
+		return nil, err
+	}
+	opened = append(opened, r.Fd(), w.Fd())
+	p.Stdout, i.Stdout = w, r
+
+	// STDERR
+	if r, w, err = os.Pipe(); err != nil {
+		return nil, err
+	}
+	opened = append(opened, r.Fd(), w.Fd())
+	p.Stderr, i.Stderr = w, r
+
+	// change ownership of the pipes in case we are in a user namespace
+	for _, fd := range opened {
+		if err = unix.Fchown(int(fd), rootuid, rootgid); err != nil {
+			return nil, err
+		}
+	}
+	return i, nil
+}