package createconfig import ( "os" "path" "path/filepath" "strings" "github.com/containers/libpod/libpod" "github.com/containers/libpod/pkg/rootless" "github.com/containers/storage/pkg/mount" pmount "github.com/containers/storage/pkg/mount" "github.com/docker/docker/oci/caps" "github.com/docker/go-units" "github.com/opencontainers/runc/libcontainer/user" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-tools/generate" "github.com/pkg/errors" "github.com/sirupsen/logrus" ) const cpuPeriod = 100000 func supercedeUserMounts(mounts []spec.Mount, configMount []spec.Mount) []spec.Mount { if len(mounts) > 0 { // If we have overlappings mounts, remove them from the spec in favor of // the user-added volume mounts destinations := make(map[string]bool) for _, mount := range mounts { destinations[path.Clean(mount.Destination)] = true } // Copy all mounts from spec to defaultMounts, except for // - mounts overridden by a user supplied mount; // - all mounts under /dev if a user supplied /dev is present; mountDev := destinations["/dev"] for _, mount := range configMount { if _, ok := destinations[path.Clean(mount.Destination)]; !ok { if mountDev && strings.HasPrefix(mount.Destination, "/dev/") { // filter out everything under /dev if /dev is user-mounted continue } logrus.Debugf("Adding mount %s", mount.Destination) mounts = append(mounts, mount) } } return mounts } return configMount } // Split named volumes from normal volumes func splitNamedVolumes(mounts []spec.Mount) ([]spec.Mount, []*libpod.ContainerNamedVolume) { newMounts := make([]spec.Mount, 0) namedVolumes := make([]*libpod.ContainerNamedVolume, 0) for _, mount := range mounts { // If it's not a named volume, append unconditionally if mount.Type != TypeBind { newMounts = append(newMounts, mount) continue } // Volumes that are not named volumes must be an absolute or // relative path. // Volume names may not begin with a non-alphanumeric character // so the HasPrefix() check is safe here. if strings.HasPrefix(mount.Source, "/") || strings.HasPrefix(mount.Source, ".") { newMounts = append(newMounts, mount) } else { namedVolume := new(libpod.ContainerNamedVolume) namedVolume.Name = mount.Source namedVolume.Dest = mount.Destination namedVolume.Options = mount.Options namedVolumes = append(namedVolumes, namedVolume) } } return newMounts, namedVolumes } func getAvailableGids() (int64, error) { idMap, err := user.ParseIDMapFile("/proc/self/gid_map") if err != nil { return 0, err } count := int64(0) for _, r := range idMap { count += r.Count } return count, nil } // CreateConfigToOCISpec parses information needed to create a container into an OCI runtime spec func (config *CreateConfig) createConfigToOCISpec() (*spec.Spec, error) { //nolint cgroupPerm := "ro" g, err := generate.New("linux") if err != nil { return nil, err } // Remove the default /dev/shm mount to ensure we overwrite it g.RemoveMount("/dev/shm") g.HostSpecific = true addCgroup := true canMountSys := true isRootless := rootless.IsRootless() inUserNS := isRootless || (len(config.IDMappings.UIDMap) > 0 || len(config.IDMappings.GIDMap) > 0) && !config.UsernsMode.IsHost() if inUserNS && config.NetMode.IsHost() { canMountSys = false } if config.Privileged && canMountSys { cgroupPerm = "rw" g.RemoveMount("/sys") sysMnt := spec.Mount{ Destination: "/sys", Type: "sysfs", Source: "sysfs", Options: []string{"rprivate", "nosuid", "noexec", "nodev", "rw"}, } g.AddMount(sysMnt) } else if !canMountSys { addCgroup = false g.RemoveMount("/sys") r := "ro" if config.Privileged { r = "rw" } sysMnt := spec.Mount{ Destination: "/sys", Type: TypeBind, Source: "/sys", Options: []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"}, } g.AddMount(sysMnt) if !config.Privileged && isRootless { g.AddLinuxMaskedPaths("/sys/kernel") } } if isRootless { nGids, err := getAvailableGids() if err != nil { return nil, err } if nGids < 5 { // If we have no GID mappings, the gid=5 default option would fail, so drop it. g.RemoveMount("/dev/pts") devPts := spec.Mount{ Destination: "/dev/pts", Type: "devpts", Source: "devpts", Options: []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"}, } g.AddMount(devPts) } } if inUserNS && config.IpcMode.IsHost() { g.RemoveMount("/dev/mqueue") devMqueue := spec.Mount{ Destination: "/dev/mqueue", Type: TypeBind, Source: "/dev/mqueue", Options: []string{"bind", "nosuid", "noexec", "nodev"}, } g.AddMount(devMqueue) } if inUserNS && config.PidMode.IsHost() { g.RemoveMount("/proc") procMount := spec.Mount{ Destination: "/proc", Type: TypeBind, Source: "/proc", Options: []string{"rbind", "nosuid", "noexec", "nodev"}, } g.AddMount(procMount) } if addCgroup { cgroupMnt := spec.Mount{ Destination: "/sys/fs/cgroup", Type: "cgroup", Source: "cgroup", Options: []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm}, } g.AddMount(cgroupMnt) } g.SetProcessCwd(config.WorkDir) g.SetProcessArgs(config.Command) g.SetProcessTerminal(config.Tty) for key, val := range config.Annotations { g.AddAnnotation(key, val) } g.SetRootReadonly(config.ReadOnlyRootfs) if config.HTTPProxy { for _, envSpec := range []string{ "http_proxy", "HTTP_PROXY", "https_proxy", "HTTPS_PROXY", "ftp_proxy", "FTP_PROXY", "no_proxy", "NO_PROXY", } { envVal := os.Getenv(envSpec) if envVal != "" { g.AddProcessEnv(envSpec, envVal) } } } hostname := config.Hostname if hostname == "" && (config.NetMode.IsHost() || config.UtsMode.IsHost()) { hostname, err = os.Hostname() if err != nil { return nil, errors.Wrap(err, "unable to retrieve hostname") } } g.RemoveHostname() if config.Hostname != "" || !config.UtsMode.IsHost() { // Set the hostname in the OCI configuration only // if specified by the user or if we are creating // a new UTS namespace. g.SetHostname(hostname) } g.AddProcessEnv("HOSTNAME", hostname) for sysctlKey, sysctlVal := range config.Sysctl { g.AddLinuxSysctl(sysctlKey, sysctlVal) } g.AddProcessEnv("container", "podman") addedResources := false // RESOURCES - MEMORY if config.Resources.Memory != 0 { g.SetLinuxResourcesMemoryLimit(config.Resources.Memory) // If a swap limit is not explicitly set, also set a swap limit // Default to double the memory limit if config.Resources.MemorySwap == 0 { g.SetLinuxResourcesMemorySwap(2 * config.Resources.Memory) } addedResources = true } if config.Resources.MemoryReservation != 0 { g.SetLinuxResourcesMemoryReservation(config.Resources.MemoryReservation) addedResources = true } if config.Resources.MemorySwap != 0 { g.SetLinuxResourcesMemorySwap(config.Resources.MemorySwap) addedResources = true } if config.Resources.KernelMemory != 0 { g.SetLinuxResourcesMemoryKernel(config.Resources.KernelMemory) addedResources = true } if config.Resources.MemorySwappiness != -1 { g.SetLinuxResourcesMemorySwappiness(uint64(config.Resources.MemorySwappiness)) addedResources = true } g.SetLinuxResourcesMemoryDisableOOMKiller(config.Resources.DisableOomKiller) g.SetProcessOOMScoreAdj(config.Resources.OomScoreAdj) // RESOURCES - CPU if config.Resources.CPUShares != 0 { g.SetLinuxResourcesCPUShares(config.Resources.CPUShares) addedResources = true } if config.Resources.CPUQuota != 0 { g.SetLinuxResourcesCPUQuota(config.Resources.CPUQuota) addedResources = true } if config.Resources.CPUPeriod != 0 { g.SetLinuxResourcesCPUPeriod(config.Resources.CPUPeriod) addedResources = true } if config.Resources.CPUs != 0 { g.SetLinuxResourcesCPUPeriod(cpuPeriod) g.SetLinuxResourcesCPUQuota(int64(config.Resources.CPUs * cpuPeriod)) addedResources = true } if config.Resources.CPURtRuntime != 0 { g.SetLinuxResourcesCPURealtimeRuntime(config.Resources.CPURtRuntime) addedResources = true } if config.Resources.CPURtPeriod != 0 { g.SetLinuxResourcesCPURealtimePeriod(config.Resources.CPURtPeriod) addedResources = true } if config.Resources.CPUsetCPUs != "" { g.SetLinuxResourcesCPUCpus(config.Resources.CPUsetCPUs) addedResources = true } if config.Resources.CPUsetMems != "" { g.SetLinuxResourcesCPUMems(config.Resources.CPUsetMems) addedResources = true } // Devices if config.Privileged { // If privileged, we need to add all the host devices to the // spec. We do not add the user provided ones because we are // already adding them all. if !rootless.IsRootless() { if err := config.AddPrivilegedDevices(&g); err != nil { return nil, err } } } else { for _, devicePath := range config.Devices { if err := devicesFromPath(&g, devicePath); err != nil { return nil, err } } } for _, uidmap := range config.IDMappings.UIDMap { g.AddLinuxUIDMapping(uint32(uidmap.HostID), uint32(uidmap.ContainerID), uint32(uidmap.Size)) } for _, gidmap := range config.IDMappings.GIDMap { g.AddLinuxGIDMapping(uint32(gidmap.HostID), uint32(gidmap.ContainerID), uint32(gidmap.Size)) } // SECURITY OPTS g.SetProcessNoNewPrivileges(config.NoNewPrivs) g.SetProcessApparmorProfile(config.ApparmorProfile) blockAccessToKernelFilesystems(config, &g) // RESOURCES - PIDS if config.Resources.PidsLimit != 0 { g.SetLinuxResourcesPidsLimit(config.Resources.PidsLimit) addedResources = true } for _, i := range config.Tmpfs { // Default options if nothing passed options := []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "size=65536k"} spliti := strings.SplitN(i, ":", 2) if len(spliti) > 1 { if _, _, err := mount.ParseTmpfsOptions(spliti[1]); err != nil { return nil, err } options = strings.Split(spliti[1], ",") } tmpfsMnt := spec.Mount{ Destination: spliti[0], Type: "tmpfs", Source: "tmpfs", Options: append(options, "tmpcopyup"), } g.AddMount(tmpfsMnt) } for _, m := range config.Mounts { if m.Type == "tmpfs" { g.AddMount(m) } } if config.ReadOnlyRootfs && config.ReadOnlyTmpfs { options := []string{"rw", "rprivate", "nosuid", "nodev", "tmpcopyup"} for _, i := range []string{"/tmp", "/var/tmp"} { if libpod.MountExists(g.Config.Mounts, i) { continue } // Default options if nothing passed tmpfsMnt := spec.Mount{ Destination: i, Type: "tmpfs", Source: "tmpfs", Options: options, } g.AddMount(tmpfsMnt) } if !libpod.MountExists(g.Config.Mounts, "/run") { tmpfsMnt := spec.Mount{ Destination: "/run", Type: "tmpfs", Source: "tmpfs", Options: append(options, "noexec", "size=65536k"), } g.AddMount(tmpfsMnt) } } for name, val := range config.Env { g.AddProcessEnv(name, val) } if err := addRlimits(config, &g); err != nil { return nil, err } if err := addPidNS(config, &g); err != nil { return nil, err } if err := addUserNS(config, &g); err != nil { return nil, err } if err := addNetNS(config, &g); err != nil { return nil, err } if err := addUTSNS(config, &g); err != nil { return nil, err } if err := addIpcNS(config, &g); err != nil { return nil, err } configSpec := g.Config // HANDLE CAPABILITIES // NOTE: Must happen before SECCOMP if !config.Privileged { if err := setupCapabilities(config, configSpec); err != nil { return nil, err } } else { g.SetupPrivileged(true) } // HANDLE SECCOMP if config.SeccompProfilePath != "unconfined" { seccompConfig, err := getSeccompConfig(config, configSpec) if err != nil { return nil, err } configSpec.Linux.Seccomp = seccompConfig } // Clear default Seccomp profile from Generator for privileged containers if config.SeccompProfilePath == "unconfined" || config.Privileged { configSpec.Linux.Seccomp = nil } // BIND MOUNTS if err := config.GetVolumesFrom(); err != nil { return nil, errors.Wrap(err, "error getting volume mounts from --volumes-from flag") } volumeMounts, err := config.GetVolumeMounts(configSpec.Mounts) if err != nil { return nil, errors.Wrapf(err, "error getting volume mounts") } configSpec.Mounts = supercedeUserMounts(volumeMounts, configSpec.Mounts) //--mount configSpec.Mounts = supercedeUserMounts(config.initFSMounts(), configSpec.Mounts) // Split normal mounts and named volumes newMounts, namedVolumes := splitNamedVolumes(configSpec.Mounts) configSpec.Mounts = newMounts config.NamedVolumes = namedVolumes // BLOCK IO blkio, err := config.CreateBlockIO() if err != nil { return nil, errors.Wrapf(err, "error creating block io") } if blkio != nil { configSpec.Linux.Resources.BlockIO = blkio addedResources = true } if rootless.IsRootless() { if addedResources { return nil, errors.New("invalid configuration, cannot set resources with rootless containers") } configSpec.Linux.Resources = &spec.LinuxResources{} } // Make sure that the bind mounts keep options like nosuid, noexec, nodev. mounts, err := pmount.GetMounts() if err != nil { return nil, err } for i := range configSpec.Mounts { m := &configSpec.Mounts[i] isBind := false for _, o := range m.Options { if o == "bind" || o == "rbind" { isBind = true break } } if !isBind { continue } mount, err := findMount(m.Source, mounts) if err != nil { return nil, err } if mount == nil { continue } next_option: for _, o := range strings.Split(mount.Opts, ",") { if o == "nosuid" || o == "noexec" || o == "nodev" { for _, e := range m.Options { if e == o { continue next_option } } m.Options = append(m.Options, o) } } } return configSpec, nil } func findMount(target string, mounts []*pmount.Info) (*pmount.Info, error) { var err error target, err = filepath.Abs(target) if err != nil { return nil, errors.Wrapf(err, "cannot resolve %s", target) } var bestSoFar *pmount.Info for _, i := range mounts { if bestSoFar != nil && len(bestSoFar.Mountpoint) > len(i.Mountpoint) { // Won't be better than what we have already found continue } if strings.HasPrefix(target, i.Mountpoint) { bestSoFar = i } } return bestSoFar, nil } func blockAccessToKernelFilesystems(config *CreateConfig, g *generate.Generator) { if !config.Privileged { for _, mp := range []string{ "/proc/acpi", "/proc/kcore", "/proc/keys", "/proc/latency_stats", "/proc/timer_list", "/proc/timer_stats", "/proc/sched_debug", "/proc/scsi", "/sys/firmware", "/sys/fs/selinux", } { g.AddLinuxMaskedPaths(mp) } if config.PidMode.IsHost() && rootless.IsRootless() { return } for _, rp := range []string{ "/proc/asound", "/proc/bus", "/proc/fs", "/proc/irq", "/proc/sys", "/proc/sysrq-trigger", } { g.AddLinuxReadonlyPaths(rp) } } } func addPidNS(config *CreateConfig, g *generate.Generator) error { pidMode := config.PidMode if IsNS(string(pidMode)) { return g.AddOrReplaceLinuxNamespace(string(spec.PIDNamespace), NS(string(pidMode))) } if pidMode.IsHost() { return g.RemoveLinuxNamespace(string(spec.PIDNamespace)) } if pidMode.IsContainer() { logrus.Debug("using container pidmode") } if IsPod(string(pidMode)) { logrus.Debug("using pod pidmode") } return nil } func addUserNS(config *CreateConfig, g *generate.Generator) error { if IsNS(string(config.UsernsMode)) { g.AddOrReplaceLinuxNamespace(spec.UserNamespace, NS(string(config.UsernsMode))) // runc complains if no mapping is specified, even if we join another ns. So provide a dummy mapping g.AddLinuxUIDMapping(uint32(0), uint32(0), uint32(1)) g.AddLinuxGIDMapping(uint32(0), uint32(0), uint32(1)) } if (len(config.IDMappings.UIDMap) > 0 || len(config.IDMappings.GIDMap) > 0) && !config.UsernsMode.IsHost() { g.AddOrReplaceLinuxNamespace(spec.UserNamespace, "") } return nil } func addNetNS(config *CreateConfig, g *generate.Generator) error { netMode := config.NetMode if netMode.IsHost() { logrus.Debug("Using host netmode") return g.RemoveLinuxNamespace(spec.NetworkNamespace) } else if netMode.IsNone() { logrus.Debug("Using none netmode") return nil } else if netMode.IsBridge() { logrus.Debug("Using bridge netmode") return nil } else if netMode.IsContainer() { logrus.Debug("Using container netmode") return nil } else if IsNS(string(netMode)) { logrus.Debug("Using ns netmode") return g.AddOrReplaceLinuxNamespace(spec.NetworkNamespace, NS(string(netMode))) } else if IsPod(string(netMode)) { logrus.Debug("Using pod netmode, unless pod is not sharing") return nil } else if netMode.IsSlirp4netns() { logrus.Debug("Using slirp4netns netmode") return nil } else if netMode.IsUserDefined() { logrus.Debug("Using user defined netmode") return nil } return errors.Errorf("unknown network mode") } func addUTSNS(config *CreateConfig, g *generate.Generator) error { utsMode := config.UtsMode if IsNS(string(utsMode)) { return g.AddOrReplaceLinuxNamespace(string(spec.UTSNamespace), NS(string(utsMode))) } if utsMode.IsHost() { return g.RemoveLinuxNamespace(spec.UTSNamespace) } return nil } func addIpcNS(config *CreateConfig, g *generate.Generator) error { ipcMode := config.IpcMode if IsNS(string(ipcMode)) { return g.AddOrReplaceLinuxNamespace(string(spec.IPCNamespace), NS(string(ipcMode))) } if ipcMode.IsHost() { return g.RemoveLinuxNamespace(spec.IPCNamespace) } if ipcMode.IsContainer() { logrus.Debug("Using container ipcmode") } return nil } func addRlimits(config *CreateConfig, g *generate.Generator) error { var ( kernelMax uint64 = 1048576 isRootless = rootless.IsRootless() nofileSet = false nprocSet = false ) for _, u := range config.Resources.Ulimit { ul, err := units.ParseUlimit(u) if err != nil { return errors.Wrapf(err, "ulimit option %q requires name=SOFT:HARD, failed to be parsed", u) } if ul.Name == "nofile" { nofileSet = true } else if ul.Name == "nproc" { nprocSet = true } g.AddProcessRlimits("RLIMIT_"+strings.ToUpper(ul.Name), uint64(ul.Hard), uint64(ul.Soft)) } // If not explicitly overridden by the user, default number of open // files and number of processes to the maximum they can be set to // (without overriding a sysctl) if !nofileSet && !isRootless { g.AddProcessRlimits("RLIMIT_NOFILE", kernelMax, kernelMax) } if !nprocSet && !isRootless { g.AddProcessRlimits("RLIMIT_NPROC", kernelMax, kernelMax) } return nil } func setupCapabilities(config *CreateConfig, configSpec *spec.Spec) error { useNotRoot := func(user string) bool { if user == "" || user == "root" || user == "0" { return false } return true } var err error var caplist []string bounding := configSpec.Process.Capabilities.Bounding if useNotRoot(config.User) { configSpec.Process.Capabilities.Bounding = caplist } caplist, err = caps.TweakCapabilities(configSpec.Process.Capabilities.Bounding, config.CapAdd, config.CapDrop, nil, false) if err != nil { return err } configSpec.Process.Capabilities.Bounding = caplist configSpec.Process.Capabilities.Permitted = caplist configSpec.Process.Capabilities.Inheritable = caplist configSpec.Process.Capabilities.Effective = caplist configSpec.Process.Capabilities.Ambient = caplist if useNotRoot(config.User) { caplist, err = caps.TweakCapabilities(bounding, config.CapAdd, config.CapDrop, nil, false) if err != nil { return err } } configSpec.Process.Capabilities.Bounding = caplist return nil }