package createconfig import ( "os" "path" "strings" "github.com/containers/libpod/libpod" "github.com/containers/libpod/pkg/rootless" "github.com/docker/docker/daemon/caps" "github.com/docker/docker/pkg/mount" "github.com/docker/go-units" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-tools/generate" "github.com/pkg/errors" "github.com/sirupsen/logrus" ) const cpuPeriod = 100000 // CreateConfigToOCISpec parses information needed to create a container into an OCI runtime spec func CreateConfigToOCISpec(config *CreateConfig) (*spec.Spec, error) { //nolint cgroupPerm := "ro" g, err := generate.New("linux") if err != nil { return nil, err } g.HostSpecific = true addCgroup := true canMountSys := true isRootless := rootless.IsRootless() inUserNS := isRootless || (len(config.IDMappings.UIDMap) > 0 || len(config.IDMappings.GIDMap) > 0) && !config.UsernsMode.IsHost() if inUserNS && config.NetMode.IsHost() { canMountSys = false } if config.Privileged && canMountSys { cgroupPerm = "rw" g.RemoveMount("/sys") sysMnt := spec.Mount{ Destination: "/sys", Type: "sysfs", Source: "sysfs", Options: []string{"private", "nosuid", "noexec", "nodev", "rw"}, } g.AddMount(sysMnt) } else if !canMountSys { addCgroup = false g.RemoveMount("/sys") r := "ro" if config.Privileged { r = "rw" } sysMnt := spec.Mount{ Destination: "/sys", Type: "bind", Source: "/sys", Options: []string{"nosuid", "noexec", "nodev", r, "rbind"}, } g.AddMount(sysMnt) } if isRootless { g.RemoveMount("/dev/pts") devPts := spec.Mount{ Destination: "/dev/pts", Type: "devpts", Source: "devpts", Options: []string{"private", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"}, } g.AddMount(devPts) } if inUserNS && config.IpcMode.IsHost() { g.RemoveMount("/dev/mqueue") devMqueue := spec.Mount{ Destination: "/dev/mqueue", Type: "bind", Source: "/dev/mqueue", Options: []string{"bind", "nosuid", "noexec", "nodev"}, } g.AddMount(devMqueue) } if inUserNS && config.PidMode.IsHost() { g.RemoveMount("/proc") procMount := spec.Mount{ Destination: "/proc", Type: "bind", Source: "/proc", Options: []string{"rbind", "nosuid", "noexec", "nodev"}, } g.AddMount(procMount) } if addCgroup { cgroupMnt := spec.Mount{ Destination: "/sys/fs/cgroup", Type: "cgroup", Source: "cgroup", Options: []string{"private", "nosuid", "noexec", "nodev", "relatime", cgroupPerm}, } g.AddMount(cgroupMnt) } g.SetProcessCwd(config.WorkDir) g.SetProcessArgs(config.Command) g.SetProcessTerminal(config.Tty) for key, val := range config.Annotations { g.AddAnnotation(key, val) } g.SetRootReadonly(config.ReadOnlyRootfs) hostname := config.Hostname if hostname == "" && (config.NetMode.IsHost() || config.UtsMode.IsHost()) { hostname, err = os.Hostname() if err != nil { return nil, errors.Wrap(err, "unable to retrieve hostname") } } g.RemoveHostname() if config.Hostname != "" || !config.UtsMode.IsHost() { // Set the hostname in the OCI configuration only // if specified by the user or if we are creating // a new UTS namespace. g.SetHostname(hostname) } g.AddProcessEnv("HOSTNAME", hostname) for sysctlKey, sysctlVal := range config.Sysctl { g.AddLinuxSysctl(sysctlKey, sysctlVal) } g.AddProcessEnv("container", "podman") canAddResources := !rootless.IsRootless() if canAddResources { // RESOURCES - MEMORY if config.Resources.Memory != 0 { g.SetLinuxResourcesMemoryLimit(config.Resources.Memory) // If a swap limit is not explicitly set, also set a swap limit // Default to double the memory limit if config.Resources.MemorySwap == 0 { g.SetLinuxResourcesMemorySwap(2 * config.Resources.Memory) } } if config.Resources.MemoryReservation != 0 { g.SetLinuxResourcesMemoryReservation(config.Resources.MemoryReservation) } if config.Resources.MemorySwap != 0 { g.SetLinuxResourcesMemorySwap(config.Resources.MemorySwap) } if config.Resources.KernelMemory != 0 { g.SetLinuxResourcesMemoryKernel(config.Resources.KernelMemory) } if config.Resources.MemorySwappiness != -1 { g.SetLinuxResourcesMemorySwappiness(uint64(config.Resources.MemorySwappiness)) } g.SetLinuxResourcesMemoryDisableOOMKiller(config.Resources.DisableOomKiller) g.SetProcessOOMScoreAdj(config.Resources.OomScoreAdj) // RESOURCES - CPU if config.Resources.CPUShares != 0 { g.SetLinuxResourcesCPUShares(config.Resources.CPUShares) } if config.Resources.CPUQuota != 0 { g.SetLinuxResourcesCPUQuota(config.Resources.CPUQuota) } if config.Resources.CPUPeriod != 0 { g.SetLinuxResourcesCPUPeriod(config.Resources.CPUPeriod) } if config.Resources.CPUs != 0 { g.SetLinuxResourcesCPUPeriod(cpuPeriod) g.SetLinuxResourcesCPUQuota(int64(config.Resources.CPUs * cpuPeriod)) } if config.Resources.CPURtRuntime != 0 { g.SetLinuxResourcesCPURealtimeRuntime(config.Resources.CPURtRuntime) } if config.Resources.CPURtPeriod != 0 { g.SetLinuxResourcesCPURealtimePeriod(config.Resources.CPURtPeriod) } if config.Resources.CPUsetCPUs != "" { g.SetLinuxResourcesCPUCpus(config.Resources.CPUsetCPUs) } if config.Resources.CPUsetMems != "" { g.SetLinuxResourcesCPUMems(config.Resources.CPUsetMems) } // Devices if config.Privileged { // If privileged, we need to add all the host devices to the // spec. We do not add the user provided ones because we are // already adding them all. if err := config.AddPrivilegedDevices(&g); err != nil { return nil, err } } else { for _, device := range config.Devices { if err := addDevice(&g, device); err != nil { return nil, err } } } } for _, uidmap := range config.IDMappings.UIDMap { g.AddLinuxUIDMapping(uint32(uidmap.HostID), uint32(uidmap.ContainerID), uint32(uidmap.Size)) } for _, gidmap := range config.IDMappings.GIDMap { g.AddLinuxGIDMapping(uint32(gidmap.HostID), uint32(gidmap.ContainerID), uint32(gidmap.Size)) } // SECURITY OPTS g.SetProcessNoNewPrivileges(config.NoNewPrivs) g.SetProcessApparmorProfile(config.ApparmorProfile) g.SetProcessSelinuxLabel(config.ProcessLabel) g.SetLinuxMountLabel(config.MountLabel) if canAddResources { blockAccessToKernelFilesystems(config, &g) // RESOURCES - PIDS if config.Resources.PidsLimit != 0 { g.SetLinuxResourcesPidsLimit(config.Resources.PidsLimit) } } if config.Systemd && (strings.HasSuffix(config.Command[0], "init") || strings.HasSuffix(config.Command[0], "systemd")) { if err := setupSystemd(config, &g); err != nil { return nil, errors.Wrap(err, "failed to setup systemd") } } for _, i := range config.Tmpfs { // Default options if nothing passed options := []string{"rw", "private", "noexec", "nosuid", "nodev", "size=65536k"} spliti := strings.SplitN(i, ":", 2) if len(spliti) > 1 { if _, _, err := mount.ParseTmpfsOptions(spliti[1]); err != nil { return nil, err } options = strings.Split(spliti[1], ",") } tmpfsMnt := spec.Mount{ Destination: spliti[0], Type: "tmpfs", Source: "tmpfs", Options: append(options, "tmpcopyup"), } g.AddMount(tmpfsMnt) } for name, val := range config.Env { g.AddProcessEnv(name, val) } if err := addRlimits(config, &g); err != nil { return nil, err } if err := addPidNS(config, &g); err != nil { return nil, err } if err := addUserNS(config, &g); err != nil { return nil, err } if err := addNetNS(config, &g); err != nil { return nil, err } if err := addUTSNS(config, &g); err != nil { return nil, err } if err := addIpcNS(config, &g); err != nil { return nil, err } configSpec := g.Config // HANDLE CAPABILITIES // NOTE: Must happen before SECCOMP if !config.Privileged { if err := setupCapabilities(config, configSpec); err != nil { return nil, err } } else { g.SetupPrivileged(true) } // HANDLE SECCOMP if config.SeccompProfilePath != "unconfined" { seccompConfig, err := getSeccompConfig(config, configSpec) if err != nil { return nil, err } configSpec.Linux.Seccomp = seccompConfig } // Clear default Seccomp profile from Generator for privileged containers if config.SeccompProfilePath == "unconfined" || config.Privileged { configSpec.Linux.Seccomp = nil } // BIND MOUNTS if err := config.GetVolumesFrom(); err != nil { return nil, errors.Wrap(err, "error getting volume mounts from --volumes-from flag") } mounts, err := config.GetVolumeMounts(configSpec.Mounts) if err != nil { return nil, errors.Wrapf(err, "error getting volume mounts") } if len(mounts) > 0 { // If we have overlappings mounts, remove them from the spec in favor of // the user-added volume mounts destinations := make(map[string]bool) for _, mount := range mounts { destinations[path.Clean(mount.Destination)] = true } for _, mount := range configSpec.Mounts { if _, ok := destinations[path.Clean(mount.Destination)]; !ok { logrus.Debugf("Adding mount %s", mount.Destination) mounts = append(mounts, mount) } } configSpec.Mounts = mounts } if err := g.SetLinuxRootPropagation("shared"); err != nil { return nil, errors.Wrapf(err, "failed to set propagation to rslave") } if canAddResources { // BLOCK IO blkio, err := config.CreateBlockIO() if err != nil { return nil, errors.Wrapf(err, "error creating block io") } if blkio != nil { configSpec.Linux.Resources.BlockIO = blkio } } // If we cannot add resources be sure everything is cleared out if !canAddResources { configSpec.Linux.Resources = &spec.LinuxResources{} } return configSpec, nil } func blockAccessToKernelFilesystems(config *CreateConfig, g *generate.Generator) { if !config.Privileged { for _, mp := range []string{ "/proc/acpi", "/proc/kcore", "/proc/keys", "/proc/latency_stats", "/proc/timer_list", "/proc/timer_stats", "/proc/sched_debug", "/proc/scsi", "/sys/firmware", } { g.AddLinuxMaskedPaths(mp) } for _, rp := range []string{ "/proc/asound", "/proc/bus", "/proc/fs", "/proc/irq", "/proc/sys", "/proc/sysrq-trigger", } { g.AddLinuxReadonlyPaths(rp) } } } // systemd expects to have /run, /run/lock and /tmp on tmpfs // It also expects to be able to write to /sys/fs/cgroup/systemd and /var/log/journal func setupSystemd(config *CreateConfig, g *generate.Generator) error { mounts, err := config.GetVolumeMounts([]spec.Mount{}) if err != nil { return err } options := []string{"rw", "private", "noexec", "nosuid", "nodev"} for _, dest := range []string{"/run", "/run/lock", "/sys/fs/cgroup/systemd"} { if libpod.MountExists(mounts, dest) { continue } tmpfsMnt := spec.Mount{ Destination: dest, Type: "tmpfs", Source: "tmpfs", Options: append(options, "tmpcopyup", "size=65536k"), } g.AddMount(tmpfsMnt) } for _, dest := range []string{"/tmp", "/var/log/journal"} { if libpod.MountExists(mounts, dest) { continue } tmpfsMnt := spec.Mount{ Destination: dest, Type: "tmpfs", Source: "tmpfs", Options: append(options, "tmpcopyup"), } g.AddMount(tmpfsMnt) } return nil } func addPidNS(config *CreateConfig, g *generate.Generator) error { pidMode := config.PidMode if IsNS(string(pidMode)) { return g.AddOrReplaceLinuxNamespace(string(spec.PIDNamespace), NS(string(pidMode))) } if pidMode.IsHost() { return g.RemoveLinuxNamespace(string(spec.PIDNamespace)) } if pidMode.IsContainer() { logrus.Debug("using container pidmode") } if IsPod(string(pidMode)) { logrus.Debug("using pod pidmode") } return nil } func addUserNS(config *CreateConfig, g *generate.Generator) error { if IsNS(string(config.UsernsMode)) { g.AddOrReplaceLinuxNamespace(spec.UserNamespace, NS(string(config.UsernsMode))) // runc complains if no mapping is specified, even if we join another ns. So provide a dummy mapping g.AddLinuxUIDMapping(uint32(0), uint32(0), uint32(1)) g.AddLinuxGIDMapping(uint32(0), uint32(0), uint32(1)) } if (len(config.IDMappings.UIDMap) > 0 || len(config.IDMappings.GIDMap) > 0) && !config.UsernsMode.IsHost() { g.AddOrReplaceLinuxNamespace(spec.UserNamespace, "") } return nil } func addNetNS(config *CreateConfig, g *generate.Generator) error { netMode := config.NetMode if netMode.IsHost() { logrus.Debug("Using host netmode") return g.RemoveLinuxNamespace(spec.NetworkNamespace) } else if netMode.IsNone() { logrus.Debug("Using none netmode") return nil } else if netMode.IsBridge() { logrus.Debug("Using bridge netmode") return nil } else if netMode.IsContainer() { logrus.Debug("Using container netmode") return nil } else if IsNS(string(netMode)) { logrus.Debug("Using ns netmode") return g.AddOrReplaceLinuxNamespace(spec.NetworkNamespace, NS(string(netMode))) } else if IsPod(string(netMode)) { logrus.Debug("Using pod netmode, unless pod is not sharing") return nil } else if netMode.IsUserDefined() { logrus.Debug("Using user defined netmode") return nil } return errors.Errorf("unknown network mode") } func addUTSNS(config *CreateConfig, g *generate.Generator) error { utsMode := config.UtsMode if IsNS(string(utsMode)) { return g.AddOrReplaceLinuxNamespace(string(spec.UTSNamespace), NS(string(utsMode))) } if utsMode.IsHost() { return g.RemoveLinuxNamespace(spec.UTSNamespace) } return nil } func addIpcNS(config *CreateConfig, g *generate.Generator) error { ipcMode := config.IpcMode if IsNS(string(ipcMode)) { return g.AddOrReplaceLinuxNamespace(string(spec.IPCNamespace), NS(string(ipcMode))) } if ipcMode.IsHost() { return g.RemoveLinuxNamespace(spec.IPCNamespace) } if ipcMode.IsContainer() { logrus.Debug("Using container ipcmode") } return nil } func addRlimits(config *CreateConfig, g *generate.Generator) error { var ( kernelMax uint64 = 1048576 isRootless = rootless.IsRootless() nofileSet = false nprocSet = false ) for _, u := range config.Resources.Ulimit { ul, err := units.ParseUlimit(u) if err != nil { return errors.Wrapf(err, "ulimit option %q requires name=SOFT:HARD, failed to be parsed", u) } if ul.Name == "nofile" { nofileSet = true } else if ul.Name == "nproc" { nprocSet = true } g.AddProcessRlimits("RLIMIT_"+strings.ToUpper(ul.Name), uint64(ul.Hard), uint64(ul.Soft)) } // If not explicitly overridden by the user, default number of open // files and number of processes to the maximum they can be set to // (without overriding a sysctl) if !nofileSet && !isRootless { g.AddProcessRlimits("RLIMIT_NOFILE", kernelMax, kernelMax) } if !nprocSet && !isRootless { g.AddProcessRlimits("RLIMIT_NPROC", kernelMax, kernelMax) } return nil } func setupCapabilities(config *CreateConfig, configSpec *spec.Spec) error { useNotRoot := func(user string) bool { if user == "" || user == "root" || user == "0" { return false } return true } var err error var caplist []string bounding := configSpec.Process.Capabilities.Bounding if useNotRoot(config.User) { configSpec.Process.Capabilities.Bounding = caplist } caplist, err = caps.TweakCapabilities(configSpec.Process.Capabilities.Bounding, config.CapAdd, config.CapDrop) if err != nil { return err } configSpec.Process.Capabilities.Bounding = caplist configSpec.Process.Capabilities.Permitted = caplist configSpec.Process.Capabilities.Inheritable = caplist configSpec.Process.Capabilities.Effective = caplist configSpec.Process.Capabilities.Ambient = caplist if useNotRoot(config.User) { caplist, err = caps.TweakCapabilities(bounding, config.CapAdd, config.CapDrop) if err != nil { return err } } configSpec.Process.Capabilities.Bounding = caplist return nil }