diff options
Diffstat (limited to 'pkg/spec/spec.go')
-rw-r--r-- | pkg/spec/spec.go | 422 |
1 files changed, 422 insertions, 0 deletions
diff --git a/pkg/spec/spec.go b/pkg/spec/spec.go new file mode 100644 index 000000000..959a24213 --- /dev/null +++ b/pkg/spec/spec.go @@ -0,0 +1,422 @@ +package createconfig + +import ( + "strings" + + "github.com/docker/docker/daemon/caps" + "github.com/docker/docker/pkg/mount" + "github.com/docker/docker/profiles/seccomp" + "github.com/docker/go-units" + "github.com/opencontainers/runc/libcontainer/devices" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/runtime-tools/generate" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "io/ioutil" +) + +const cpuPeriod = 100000 + +// CreateConfigToOCISpec parses information needed to create a container into an OCI runtime spec +func CreateConfigToOCISpec(config *CreateConfig) (*spec.Spec, error) { //nolint + cgroupPerm := "ro" + g := generate.New() + g.HostSpecific = true + addCgroup := true + if config.Privileged { + cgroupPerm = "rw" + g.RemoveMount("/sys") + sysMnt := spec.Mount{ + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{"nosuid", "noexec", "nodev", "rw"}, + } + g.AddMount(sysMnt) + } else if !config.UsernsMode.IsHost() && config.NetMode.IsHost() { + addCgroup = false + g.RemoveMount("/sys") + sysMnt := spec.Mount{ + Destination: "/sys", + Type: "bind", + Source: "/sys", + Options: []string{"nosuid", "noexec", "nodev", "ro", "rbind"}, + } + g.AddMount(sysMnt) + } + + if addCgroup { + cgroupMnt := spec.Mount{ + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Source: "cgroup", + Options: []string{"nosuid", "noexec", "nodev", "relatime", cgroupPerm}, + } + g.AddMount(cgroupMnt) + } + g.SetProcessCwd(config.WorkDir) + g.SetProcessArgs(config.Command) + g.SetProcessTerminal(config.Tty) + + for key, val := range config.GetAnnotations() { + g.AddAnnotation(key, val) + } + g.SetRootReadonly(config.ReadOnlyRootfs) + g.SetHostname(config.Hostname) + if config.Hostname != "" { + g.AddProcessEnv("HOSTNAME", config.Hostname) + } + for sysctlKey, sysctlVal := range config.Sysctl { + g.AddLinuxSysctl(sysctlKey, sysctlVal) + } + g.AddProcessEnv("container", "podman") + + // RESOURCES - MEMORY + if config.Resources.Memory != 0 { + g.SetLinuxResourcesMemoryLimit(config.Resources.Memory) + } + if config.Resources.MemoryReservation != 0 { + g.SetLinuxResourcesMemoryReservation(config.Resources.MemoryReservation) + } + if config.Resources.MemorySwap != 0 { + g.SetLinuxResourcesMemorySwap(config.Resources.MemorySwap) + } + if config.Resources.KernelMemory != 0 { + g.SetLinuxResourcesMemoryKernel(config.Resources.KernelMemory) + } + if config.Resources.MemorySwappiness != -1 { + g.SetLinuxResourcesMemorySwappiness(uint64(config.Resources.MemorySwappiness)) + } + g.SetLinuxResourcesMemoryDisableOOMKiller(config.Resources.DisableOomKiller) + g.SetProcessOOMScoreAdj(config.Resources.OomScoreAdj) + + // RESOURCES - CPU + if config.Resources.CPUShares != 0 { + g.SetLinuxResourcesCPUShares(config.Resources.CPUShares) + } + if config.Resources.CPUQuota != 0 { + g.SetLinuxResourcesCPUQuota(config.Resources.CPUQuota) + } + if config.Resources.CPUPeriod != 0 { + g.SetLinuxResourcesCPUPeriod(config.Resources.CPUPeriod) + } + if config.Resources.CPUs != 0 { + g.SetLinuxResourcesCPUPeriod(cpuPeriod) + g.SetLinuxResourcesCPUQuota(int64(config.Resources.CPUs * cpuPeriod)) + } + if config.Resources.CPURtRuntime != 0 { + g.SetLinuxResourcesCPURealtimeRuntime(config.Resources.CPURtRuntime) + } + if config.Resources.CPURtPeriod != 0 { + g.SetLinuxResourcesCPURealtimePeriod(config.Resources.CPURtPeriod) + } + if config.Resources.CPUsetCPUs != "" { + g.SetLinuxResourcesCPUCpus(config.Resources.CPUsetCPUs) + } + if config.Resources.CPUsetMems != "" { + g.SetLinuxResourcesCPUMems(config.Resources.CPUsetMems) + } + + // Devices + if config.Privileged { + // If privileged, we need to add all the host devices to the + // spec. We do not add the user provided ones because we are + // already adding them all. + if err := config.AddPrivilegedDevices(&g); err != nil { + return nil, err + } + } else { + for _, device := range config.Devices { + if err := addDevice(&g, device); err != nil { + return nil, err + } + } + } + + for _, uidmap := range config.IDMappings.UIDMap { + g.AddLinuxUIDMapping(uint32(uidmap.HostID), uint32(uidmap.ContainerID), uint32(uidmap.Size)) + } + for _, gidmap := range config.IDMappings.GIDMap { + g.AddLinuxGIDMapping(uint32(gidmap.HostID), uint32(gidmap.ContainerID), uint32(gidmap.Size)) + } + // SECURITY OPTS + g.SetProcessNoNewPrivileges(config.NoNewPrivs) + g.SetProcessApparmorProfile(config.ApparmorProfile) + g.SetProcessSelinuxLabel(config.ProcessLabel) + g.SetLinuxMountLabel(config.MountLabel) + blockAccessToKernelFilesystems(config, &g) + + // RESOURCES - PIDS + if config.Resources.PidsLimit != 0 { + g.SetLinuxResourcesPidsLimit(config.Resources.PidsLimit) + } + + for _, i := range config.Tmpfs { + // Default options if nothing passed + options := []string{"rw", "noexec", "nosuid", "nodev", "size=65536k"} + spliti := strings.SplitN(i, ":", 2) + if len(spliti) > 1 { + if _, _, err := mount.ParseTmpfsOptions(spliti[1]); err != nil { + return nil, err + } + options = strings.Split(spliti[1], ",") + } + tmpfsMnt := spec.Mount{ + Destination: spliti[0], + Type: "tmpfs", + Source: "tmpfs", + Options: append(options, "tmpcopyup"), + } + g.AddMount(tmpfsMnt) + } + + for name, val := range config.Env { + g.AddProcessEnv(name, val) + } + + if err := addRlimits(config, &g); err != nil { + return nil, err + } + + if err := addPidNS(config, &g); err != nil { + return nil, err + } + + if err := addUserNS(config, &g); err != nil { + return nil, err + } + + if err := addNetNS(config, &g); err != nil { + return nil, err + } + + if err := addUTSNS(config, &g); err != nil { + return nil, err + } + + if err := addIpcNS(config, &g); err != nil { + return nil, err + } + configSpec := g.Spec() + + // HANDLE CAPABILITIES + // NOTE: Must happen before SECCOMP + if !config.Privileged { + if err := setupCapabilities(config, configSpec); err != nil { + return nil, err + } + } else { + g.SetupPrivileged(true) + } + + // HANDLE SECCOMP + if config.SeccompProfilePath != "unconfined" { + if config.SeccompProfilePath != "" { + seccompProfile, err := ioutil.ReadFile(config.SeccompProfilePath) + if err != nil { + return nil, errors.Wrapf(err, "opening seccomp profile (%s) failed", config.SeccompProfilePath) + } + seccompConfig, err := seccomp.LoadProfile(string(seccompProfile), configSpec) + if err != nil { + return nil, errors.Wrapf(err, "loading seccomp profile (%s) failed", config.SeccompProfilePath) + } + configSpec.Linux.Seccomp = seccompConfig + } else { + seccompConfig, err := seccomp.GetDefaultProfile(configSpec) + if err != nil { + return nil, errors.Wrapf(err, "loading seccomp profile (%s) failed", config.SeccompProfilePath) + } + configSpec.Linux.Seccomp = seccompConfig + } + } + + // Clear default Seccomp profile from Generator for privileged containers + if config.SeccompProfilePath == "unconfined" || config.Privileged { + configSpec.Linux.Seccomp = nil + } + + // BIND MOUNTS + mounts, err := config.GetVolumeMounts(configSpec.Mounts) + if err != nil { + return nil, errors.Wrapf(err, "error getting volume mounts") + } + configSpec.Mounts = append(configSpec.Mounts, mounts...) + for _, mount := range configSpec.Mounts { + for _, opt := range mount.Options { + switch opt { + case "private", "rprivate", "slave", "rslave", "shared", "rshared": + if err := g.SetLinuxRootPropagation(opt); err != nil { + return nil, errors.Wrapf(err, "error setting root propagation for %q", mount.Destination) + } + } + } + } + + // BLOCK IO + blkio, err := config.CreateBlockIO() + if err != nil { + return nil, errors.Wrapf(err, "error creating block io") + } + if blkio != nil { + configSpec.Linux.Resources.BlockIO = blkio + } + + /* + //Annotations + Resources: &configSpec.LinuxResources{ + BlockIO: &blkio, + //HugepageLimits: + Network: &configSpec.LinuxNetwork{ + // ClassID *uint32 + // Priorites []LinuxInterfacePriority + }, + }, + //CgroupsPath: + //Namespaces: []LinuxNamespace + // DefaultAction: + // Architectures + // Syscalls: + }, + // RootfsPropagation + // MaskedPaths + // ReadonlyPaths: + // IntelRdt + }, + } + */ + return configSpec, nil +} + +func blockAccessToKernelFilesystems(config *CreateConfig, g *generate.Generator) { + if !config.Privileged { + for _, mp := range []string{ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/proc/scsi", + "/sys/firmware", + } { + g.AddLinuxMaskedPaths(mp) + } + + for _, rp := range []string{ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger", + } { + g.AddLinuxReadonlyPaths(rp) + } + } +} + +func addPidNS(config *CreateConfig, g *generate.Generator) error { + pidMode := config.PidMode + if pidMode.IsHost() { + return g.RemoveLinuxNamespace(string(spec.PIDNamespace)) + } + if pidMode.IsContainer() { + logrus.Debug("using container pidmode") + } + return nil +} + +func addUserNS(config *CreateConfig, g *generate.Generator) error { + if (len(config.IDMappings.UIDMap) > 0 || len(config.IDMappings.GIDMap) > 0) && !config.UsernsMode.IsHost() { + g.AddOrReplaceLinuxNamespace(spec.UserNamespace, "") + } + return nil +} + +func addNetNS(config *CreateConfig, g *generate.Generator) error { + netMode := config.NetMode + if netMode.IsHost() { + logrus.Debug("Using host netmode") + return g.RemoveLinuxNamespace(spec.NetworkNamespace) + } else if netMode.IsNone() { + logrus.Debug("Using none netmode") + return nil + } else if netMode.IsBridge() { + logrus.Debug("Using bridge netmode") + return nil + } else if netMode.IsContainer() { + logrus.Debug("Using container netmode") + } else { + return errors.Errorf("unknown network mode") + } + return nil +} + +func addUTSNS(config *CreateConfig, g *generate.Generator) error { + utsMode := config.UtsMode + if utsMode.IsHost() { + return g.RemoveLinuxNamespace(spec.UTSNamespace) + } + return nil +} + +func addIpcNS(config *CreateConfig, g *generate.Generator) error { + ipcMode := config.IpcMode + if ipcMode.IsHost() { + return g.RemoveLinuxNamespace(spec.IPCNamespace) + } + if ipcMode.IsContainer() { + logrus.Debug("Using container ipcmode") + } + + return nil +} + +func addRlimits(config *CreateConfig, g *generate.Generator) error { + var ( + ul *units.Ulimit + err error + ) + + for _, u := range config.Resources.Ulimit { + if ul, err = units.ParseUlimit(u); err != nil { + return errors.Wrapf(err, "ulimit option %q requires name=SOFT:HARD, failed to be parsed", u) + } + + g.AddProcessRlimits("RLIMIT_"+strings.ToUpper(ul.Name), uint64(ul.Hard), uint64(ul.Soft)) + } + return nil +} + +func setupCapabilities(config *CreateConfig, configSpec *spec.Spec) error { + var err error + var caplist []string + caplist, err = caps.TweakCapabilities(configSpec.Process.Capabilities.Bounding, config.CapAdd, config.CapDrop) + if err != nil { + return err + } + + configSpec.Process.Capabilities.Bounding = caplist + configSpec.Process.Capabilities.Permitted = caplist + configSpec.Process.Capabilities.Inheritable = caplist + configSpec.Process.Capabilities.Effective = caplist + return nil +} + +func addDevice(g *generate.Generator, device string) error { + dev, err := devices.DeviceFromPath(device, "rwm") + if err != nil { + return errors.Wrapf(err, "%s is not a valid device", device) + } + linuxdev := spec.LinuxDevice{ + Path: dev.Path, + Type: string(dev.Type), + Major: dev.Major, + Minor: dev.Minor, + FileMode: &dev.FileMode, + UID: &dev.Uid, + GID: &dev.Gid, + } + g.AddDevice(linuxdev) + g.AddLinuxResourcesDevice(true, string(dev.Type), &dev.Major, &dev.Minor, dev.Permissions) + return nil +} |