diff options
Diffstat (limited to 'pkg/specgen/generate/oci_linux.go')
-rw-r--r-- | pkg/specgen/generate/oci_linux.go | 331 |
1 files changed, 331 insertions, 0 deletions
diff --git a/pkg/specgen/generate/oci_linux.go b/pkg/specgen/generate/oci_linux.go new file mode 100644 index 000000000..341853de5 --- /dev/null +++ b/pkg/specgen/generate/oci_linux.go @@ -0,0 +1,331 @@ +package generate + +import ( + "context" + "encoding/json" + "fmt" + "path" + "strings" + + "github.com/containers/common/libimage" + "github.com/containers/common/pkg/cgroups" + "github.com/containers/common/pkg/config" + "github.com/containers/podman/v4/libpod" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/podman/v4/pkg/specgen" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/runtime-tools/generate" + "golang.org/x/sys/unix" +) + +func setProcOpts(s *specgen.SpecGenerator, g *generate.Generator) { + if s.ProcOpts == nil { + return + } + for i := range g.Config.Mounts { + if g.Config.Mounts[i].Destination == "/proc" { + g.Config.Mounts[i].Options = s.ProcOpts + return + } + } +} + +// canMountSys is a best-effort heuristic to detect whether mounting a new sysfs is permitted in the container +func canMountSys(isRootless, isNewUserns bool, s *specgen.SpecGenerator) bool { + if s.NetNS.IsHost() && (isRootless || isNewUserns) { + return false + } + if isNewUserns { + switch s.NetNS.NSMode { + case specgen.Slirp, specgen.Private, specgen.NoNetwork, specgen.Bridge: + return true + default: + return false + } + } + return true +} + +func getCgroupPermissons(unmask []string) string { + ro := "ro" + rw := "rw" + cgroup := "/sys/fs/cgroup" + + cgroupv2, _ := cgroups.IsCgroup2UnifiedMode() + if !cgroupv2 { + return ro + } + + if unmask != nil && unmask[0] == "ALL" { + return rw + } + + for _, p := range unmask { + if path.Clean(p) == cgroup { + return rw + } + } + return ro +} + +// SpecGenToOCI returns the base configuration for the container. +func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runtime, rtc *config.Config, newImage *libimage.Image, mounts []spec.Mount, pod *libpod.Pod, finalCmd []string, compatibleOptions *libpod.InfraInherit) (*spec.Spec, error) { + cgroupPerm := getCgroupPermissons(s.Unmask) + + g, err := generate.New("linux") + if err != nil { + return nil, err + } + // Remove the default /dev/shm mount to ensure we overwrite it + g.RemoveMount("/dev/shm") + g.HostSpecific = true + addCgroup := true + + isRootless := rootless.IsRootless() + isNewUserns := s.UserNS.IsContainer() || s.UserNS.IsPath() || s.UserNS.IsPrivate() + + canMountSys := canMountSys(isRootless, isNewUserns, s) + + if s.Privileged && canMountSys { + cgroupPerm = "rw" + g.RemoveMount("/sys") + sysMnt := spec.Mount{ + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{"rprivate", "nosuid", "noexec", "nodev", "rw"}, + } + g.AddMount(sysMnt) + } + if !canMountSys { + addCgroup = false + g.RemoveMount("/sys") + r := "ro" + if s.Privileged { + r = "rw" + } + sysMnt := spec.Mount{ + Destination: "/sys", + Type: "bind", // should we use a constant for this, like createconfig? + Source: "/sys", + Options: []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"}, + } + g.AddMount(sysMnt) + if !s.Privileged && isRootless { + g.AddLinuxMaskedPaths("/sys/kernel") + } + } + gid5Available := true + if isRootless { + nGids, err := rootless.GetAvailableGids() + if err != nil { + return nil, err + } + gid5Available = nGids >= 5 + } + // When using a different user namespace, check that the GID 5 is mapped inside + // the container. + if gid5Available && (s.IDMappings != nil && len(s.IDMappings.GIDMap) > 0) { + mappingFound := false + for _, r := range s.IDMappings.GIDMap { + if r.ContainerID <= 5 && 5 < r.ContainerID+r.Size { + mappingFound = true + break + } + } + if !mappingFound { + gid5Available = false + } + } + if !gid5Available { + // If we have no GID mappings, the gid=5 default option would fail, so drop it. + g.RemoveMount("/dev/pts") + devPts := spec.Mount{ + Destination: "/dev/pts", + Type: "devpts", + Source: "devpts", + Options: []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"}, + } + g.AddMount(devPts) + } + + inUserNS := isRootless || isNewUserns + + if inUserNS && s.IpcNS.IsHost() { + g.RemoveMount("/dev/mqueue") + devMqueue := spec.Mount{ + Destination: "/dev/mqueue", + Type: "bind", // constant ? + Source: "/dev/mqueue", + Options: []string{"bind", "nosuid", "noexec", "nodev"}, + } + g.AddMount(devMqueue) + } + if inUserNS && s.PidNS.IsHost() { + g.RemoveMount("/proc") + procMount := spec.Mount{ + Destination: "/proc", + Type: define.TypeBind, + Source: "/proc", + Options: []string{"rbind", "nosuid", "noexec", "nodev"}, + } + g.AddMount(procMount) + } + + if addCgroup { + cgroupMnt := spec.Mount{ + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Source: "cgroup", + Options: []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm}, + } + g.AddMount(cgroupMnt) + } + + g.Config.Linux.Personality = s.Personality + + g.SetProcessCwd(s.WorkDir) + + g.SetProcessArgs(finalCmd) + + g.SetProcessTerminal(s.Terminal) + + for key, val := range s.Annotations { + g.AddAnnotation(key, val) + } + + if s.ResourceLimits != nil { + out, err := json.Marshal(s.ResourceLimits) + if err != nil { + return nil, err + } + err = json.Unmarshal(out, g.Config.Linux.Resources) + if err != nil { + return nil, err + } + g.Config.Linux.Resources = s.ResourceLimits + } + + weightDevices, err := WeightDevices(s.WeightDevice) + if err != nil { + return nil, err + } + if len(weightDevices) > 0 { + for _, dev := range weightDevices { + g.AddLinuxResourcesBlockIOWeightDevice(dev.Major, dev.Minor, *dev.Weight) + } + } + + // Devices + // set the default rule at the beginning of device configuration + if !inUserNS && !s.Privileged { + g.AddLinuxResourcesDevice(false, "", nil, nil, "rwm") + } + + var userDevices []spec.LinuxDevice + + if !s.Privileged { + // add default devices from containers.conf + for _, device := range rtc.Containers.Devices { + if err = DevicesFromPath(&g, device); err != nil { + return nil, err + } + } + if len(compatibleOptions.HostDeviceList) > 0 && len(s.Devices) == 0 { + userDevices = compatibleOptions.HostDeviceList + } else { + userDevices = s.Devices + } + // add default devices specified by caller + for _, device := range userDevices { + if err = DevicesFromPath(&g, device.Path); err != nil { + return nil, err + } + } + } + s.HostDeviceList = userDevices + + // set the devices cgroup when not running in a user namespace + if !inUserNS && !s.Privileged { + for _, dev := range s.DeviceCgroupRule { + g.AddLinuxResourcesDevice(true, dev.Type, dev.Major, dev.Minor, dev.Access) + } + } + + BlockAccessToKernelFilesystems(s.Privileged, s.PidNS.IsHost(), s.Mask, s.Unmask, &g) + + g.ClearProcessEnv() + for name, val := range s.Env { + g.AddProcessEnv(name, val) + } + + addRlimits(s, &g) + + // NAMESPACES + if err := specConfigureNamespaces(s, &g, rt, pod); err != nil { + return nil, err + } + configSpec := g.Config + + if err := securityConfigureGenerator(s, &g, newImage, rtc); err != nil { + return nil, err + } + + // BIND MOUNTS + configSpec.Mounts = SupersedeUserMounts(mounts, configSpec.Mounts) + // Process mounts to ensure correct options + if err := InitFSMounts(configSpec.Mounts); err != nil { + return nil, err + } + + // Add annotations + if configSpec.Annotations == nil { + configSpec.Annotations = make(map[string]string) + } + + if s.Remove { + configSpec.Annotations[define.InspectAnnotationAutoremove] = define.InspectResponseTrue + } else { + configSpec.Annotations[define.InspectAnnotationAutoremove] = define.InspectResponseFalse + } + + if len(s.VolumesFrom) > 0 { + configSpec.Annotations[define.InspectAnnotationVolumesFrom] = strings.Join(s.VolumesFrom, ",") + } + + if s.Privileged { + configSpec.Annotations[define.InspectAnnotationPrivileged] = define.InspectResponseTrue + } else { + configSpec.Annotations[define.InspectAnnotationPrivileged] = define.InspectResponseFalse + } + + if s.Init { + configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseTrue + } else { + configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseFalse + } + + if s.OOMScoreAdj != nil { + g.SetProcessOOMScoreAdj(*s.OOMScoreAdj) + } + setProcOpts(s, &g) + + return configSpec, nil +} + +func WeightDevices(wtDevices map[string]spec.LinuxWeightDevice) ([]spec.LinuxWeightDevice, error) { + devs := []spec.LinuxWeightDevice{} + for k, v := range wtDevices { + statT := unix.Stat_t{} + if err := unix.Stat(k, &statT); err != nil { + return nil, fmt.Errorf("failed to inspect '%s' in --blkio-weight-device: %w", k, err) + } + dev := new(spec.LinuxWeightDevice) + dev.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert + dev.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert + dev.Weight = v.Weight + devs = append(devs, *dev) + } + return devs, nil +} |