package libpod

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"

	runcconfig "github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/devices"

	"github.com/containers/common/pkg/cgroups"
	"github.com/containers/common/pkg/config"
	"github.com/containers/podman/v4/pkg/errorhandling"
	"github.com/containers/podman/v4/pkg/rootless"
	"github.com/containers/podman/v4/utils"
	pmount "github.com/containers/storage/pkg/mount"
	spec "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/opencontainers/selinux/go-selinux/label"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
)

// createRootlessContainer runs createOCIContainer from a dedicated mount
// namespace on a locked OS thread so that the mount changes it makes do not
// propagate back into the host's mount namespace.
func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
	type result struct {
		restoreDuration int64
		err             error
	}
	ch := make(chan result)
	go func() {
		runtime.LockOSThread()
		restoreDuration, err := func() (int64, error) {
			fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid()))
			if err != nil {
				return 0, err
			}
			defer errorhandling.CloseQuiet(fd)

			// create a new mountns on the current thread
			if err = unix.Unshare(unix.CLONE_NEWNS); err != nil {
				return 0, err
			}
			defer func() {
				if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil {
					logrus.Errorf("Unable to clone new namespace: %q", err)
				}
			}()

			// don't spread our mounts around.  We are setting only /sys to be slave
			// so that the cleanup process is still able to umount the storage and the
			// changes are propagated to the host.
			err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "")
			if err != nil {
				return 0, fmt.Errorf("cannot make /sys slave: %w", err)
			}

			mounts, err := pmount.GetMounts()
			if err != nil {
				return 0, err
			}
			for _, m := range mounts {
				if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") {
					continue
				}
				err = unix.Unmount(m.Mountpoint, 0)
				if err != nil && !os.IsNotExist(err) {
					return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err)
				}
			}
			return r.createOCIContainer(ctr, restoreOptions)
		}()
		ch <- result{
			restoreDuration: restoreDuration,
			err:             err,
		}
	}()
	res := <-ch
	return res.restoreDuration, res.err
}

// withContainerSocketLabel runs the closure with the container's socket label set.
func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func() error) error {
	runtime.LockOSThread()
	if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil {
		return err
	}
	err := closure()
	// Ignore the error returned from the SetSocketLabel("") call; we can't recover.
	if labelErr := label.SetSocketLabel(""); labelErr == nil {
		// Unlock the thread only if the process label could be restored
		// successfully.  Otherwise leave the thread locked and the Go runtime
		// will terminate it once it returns to the thread pool.
		runtime.UnlockOSThread()
	} else {
		logrus.Errorf("Unable to reset socket label: %q", labelErr)
	}
	return err
}

// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the
// conmon process to that cgroup; it then signals conmon to start by sending
// nonce data down the start fd.
func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error {
	mustCreateCgroup := true

	if ctr.config.NoCgroups {
		mustCreateCgroup = false
	}

	// If cgroup creation is disabled - just signal.
	switch ctr.config.CgroupsMode {
	case "disabled", "no-conmon", cgroupSplit:
		mustCreateCgroup = false
	}

	// $INVOCATION_ID is set by systemd when running as a service.
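	// If it is set and podman is not serving a remote client, podman itself is
	// already running inside a systemd unit, so conmon is left in the cgroup it
	// inherits from that unit rather than being moved to a dedicated one.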
	if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" {
		mustCreateCgroup = false
	}

	if mustCreateCgroup {
		// Usually rootless users are not allowed to configure cgroupfs.
		// There are cases, though, where it is allowed, e.g. if the cgroup
		// is manually configured and chowned.  Avoid detecting all such
		// cases and simply use a lower log level.
		logLevel := logrus.WarnLevel
		if rootless.IsRootless() {
			logLevel = logrus.InfoLevel
		}
		// TODO: This should be a switch - we are not guaranteed that
		// there are only 2 valid cgroup managers
		cgroupParent := ctr.CgroupParent()
		cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon")
		Resource := ctr.Spec().Linux.Resources
		cgroupResources, err := GetLimits(Resource)
		if err != nil {
			logrus.StandardLogger().Log(logLevel, "Could not get ctr resources")
		}

		if ctr.CgroupManager() == config.SystemdCgroupsManager {
			unitName := createUnitName("libpod-conmon", ctr.ID())
			realCgroupParent := cgroupParent
			splitParent := strings.Split(cgroupParent, "/")
			if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 {
				realCgroupParent = splitParent[len(splitParent)-1]
			}

			logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName)
			if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil {
				logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err)
			}
		} else {
			control, err := cgroups.New(cgroupPath, &cgroupResources)
			if err != nil {
				logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
			} else if err := control.AddPid(cmd.Process.Pid); err != nil {
				// we need to remove this defer and delete the cgroup once conmon exits
				// maybe need a conmon monitor?
				logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
			}
		}
	}

	/* We set the cgroup, now the child can start creating children */
	if err := writeConmonPipeData(startFd); err != nil {
		return err
	}
	return nil
}

// GetLimits converts spec resource limits to cgroup consumable limits.
func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) {
	if resource == nil {
		resource = &spec.LinuxResources{}
	}
	final := &runcconfig.Resources{}
	devs := []*devices.Rule{}

	// Devices
	for _, entry := range resource.Devices {
		if entry.Major == nil || entry.Minor == nil {
			continue
		}
		runeType := 'a'
		switch entry.Type {
		case "b":
			runeType = 'b'
		case "c":
			runeType = 'c'
		}

		devs = append(devs, &devices.Rule{
			Type:        devices.Type(runeType),
			Major:       *entry.Major,
			Minor:       *entry.Minor,
			Permissions: devices.Permissions(entry.Access),
			Allow:       entry.Allow,
		})
	}
	final.Devices = devs

	// HugepageLimits
	pageLimits := []*runcconfig.HugepageLimit{}
	for _, entry := range resource.HugepageLimits {
		pageLimits = append(pageLimits, &runcconfig.HugepageLimit{
			Pagesize: entry.Pagesize,
			Limit:    entry.Limit,
		})
	}
	final.HugetlbLimit = pageLimits

	// Networking
	netPriorities := []*runcconfig.IfPrioMap{}
	if resource.Network != nil {
		for _, entry := range resource.Network.Priorities {
			netPriorities = append(netPriorities, &runcconfig.IfPrioMap{
				Interface: entry.Name,
				Priority:  int64(entry.Priority),
			})
		}
	}
	final.NetPrioIfpriomap = netPriorities

	rdma := make(map[string]runcconfig.LinuxRdma)
	for name, entry := range resource.Rdma {
		rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects}
	}
	final.Rdma = rdma
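	// The remaining spec limits are copied field by field below; pointer-valued
	// fields are dereferenced only when set, so unset limits keep their zero
	// value in the resulting runc configuration.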
	// Memory
	if resource.Memory != nil {
		if resource.Memory.Limit != nil {
			final.Memory = *resource.Memory.Limit
		}
		if resource.Memory.Reservation != nil {
			final.MemoryReservation = *resource.Memory.Reservation
		}
		if resource.Memory.Swap != nil {
			final.MemorySwap = *resource.Memory.Swap
		}
		if resource.Memory.Swappiness != nil {
			final.MemorySwappiness = resource.Memory.Swappiness
		}
	}

	// CPU
	if resource.CPU != nil {
		if resource.CPU.Period != nil {
			final.CpuPeriod = *resource.CPU.Period
		}
		if resource.CPU.Quota != nil {
			final.CpuQuota = *resource.CPU.Quota
		}
		if resource.CPU.RealtimePeriod != nil {
			final.CpuRtPeriod = *resource.CPU.RealtimePeriod
		}
		if resource.CPU.RealtimeRuntime != nil {
			final.CpuRtRuntime = *resource.CPU.RealtimeRuntime
		}
		if resource.CPU.Shares != nil {
			final.CpuShares = *resource.CPU.Shares
		}
		final.CpusetCpus = resource.CPU.Cpus
		final.CpusetMems = resource.CPU.Mems
	}

	// BlkIO
	if resource.BlockIO != nil {
		if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleReadBpsDevice {
				throttle := &runcconfig.ThrottleDevice{}
				dev := &runcconfig.BlockIODevice{
					Major: entry.Major,
					Minor: entry.Minor,
				}
				throttle.BlockIODevice = *dev
				throttle.Rate = entry.Rate
				final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle)
			}
		}
		if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice {
				throttle := &runcconfig.ThrottleDevice{}
				dev := &runcconfig.BlockIODevice{
					Major: entry.Major,
					Minor: entry.Minor,
				}
				throttle.BlockIODevice = *dev
				throttle.Rate = entry.Rate
				final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle)
			}
		}
		if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice {
				throttle := &runcconfig.ThrottleDevice{}
				dev := &runcconfig.BlockIODevice{
					Major: entry.Major,
					Minor: entry.Minor,
				}
				throttle.BlockIODevice = *dev
				throttle.Rate = entry.Rate
				final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle)
			}
		}
		if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 {
			for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice {
				throttle := &runcconfig.ThrottleDevice{}
				dev := &runcconfig.BlockIODevice{
					Major: entry.Major,
					Minor: entry.Minor,
				}
				throttle.BlockIODevice = *dev
				throttle.Rate = entry.Rate
				final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle)
			}
		}
		if resource.BlockIO.LeafWeight != nil {
			final.BlkioLeafWeight = *resource.BlockIO.LeafWeight
		}
		if resource.BlockIO.Weight != nil {
			final.BlkioWeight = *resource.BlockIO.Weight
		}
		if len(resource.BlockIO.WeightDevice) > 0 {
			for _, entry := range resource.BlockIO.WeightDevice {
				weight := &runcconfig.WeightDevice{}
				dev := &runcconfig.BlockIODevice{
					Major: entry.Major,
					Minor: entry.Minor,
				}
				if entry.Weight != nil {
					weight.Weight = *entry.Weight
				}
				if entry.LeafWeight != nil {
					weight.LeafWeight = *entry.LeafWeight
				}
				weight.BlockIODevice = *dev
				final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight)
			}
		}
	}

	// Pids
	if resource.Pids != nil {
		final.PidsLimit = resource.Pids.Limit
	}

	// Networking
	if resource.Network != nil {
		if resource.Network.ClassID != nil {
			final.NetClsClassid = *resource.Network.ClassID
		}
	}

	// Unified state
	final.Unified = resource.Unified
	return *final, nil
}