//go:build linux || freebsd // +build linux freebsd package libpod import ( "context" "errors" "fmt" "math" "os" "strconv" "strings" "time" cdi "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/containers/buildah/pkg/overlay" butil "github.com/containers/buildah/util" "github.com/containers/common/pkg/apparmor" cutil "github.com/containers/common/pkg/util" "github.com/containers/podman/v4/pkg/annotations" "github.com/containers/podman/v4/pkg/lookup" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/util" "github.com/containers/storage/pkg/idtools" securejoin "github.com/cyphar/filepath-securejoin" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-tools/generate" "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" ) // Internal only function which returns upper and work dir from // overlay options. func getOverlayUpperAndWorkDir(options []string) (string, string, error) { upperDir := "" workDir := "" for _, o := range options { if strings.HasPrefix(o, "upperdir") { splitOpt := strings.SplitN(o, "=", 2) if len(splitOpt) > 1 { upperDir = splitOpt[1] if upperDir == "" { return "", "", errors.New("cannot accept empty value for upperdir") } } } if strings.HasPrefix(o, "workdir") { splitOpt := strings.SplitN(o, "=", 2) if len(splitOpt) > 1 { workDir = splitOpt[1] if workDir == "" { return "", "", errors.New("cannot accept empty value for workdir") } } } } if (upperDir != "" && workDir == "") || (upperDir == "" && workDir != "") { return "", "", errors.New("must specify both upperdir and workdir") } return upperDir, workDir, nil } // Generate spec for a container // Accepts a map of the container's dependencies func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { overrides := c.getUserOverrides() execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, c.config.User, overrides) if err != nil { if cutil.StringInSlice(c.config.User, c.config.HostUsers) { execUser, err = lookupHostUser(c.config.User) } if err != nil { return nil, err } } // NewFromSpec() is deprecated according to its comment // however the recommended replace just causes a nil map panic //nolint:staticcheck g := generate.NewFromSpec(c.config.Spec) // If the flag to mount all devices is set for a privileged container, add // all the devices from the host's machine into the container if c.config.MountAllDevices { if err := util.AddPrivilegedDevices(&g); err != nil { return nil, err } } // If network namespace was requested, add it now if c.config.CreateNetNS { if c.config.PostConfigureNetNS { if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), ""); err != nil { return nil, err } } else { if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), c.state.NetNS.Path()); err != nil { return nil, err } } } // Apply AppArmor checks and load the default profile if needed. if len(c.config.Spec.Process.ApparmorProfile) > 0 { updatedProfile, err := apparmor.CheckProfileAndLoadDefault(c.config.Spec.Process.ApparmorProfile) if err != nil { return nil, err } g.SetProcessApparmorProfile(updatedProfile) } if err := c.makeBindMounts(); err != nil { return nil, err } if err := c.mountNotifySocket(g); err != nil { return nil, err } // Get host UID and GID based on the container process UID and GID. hostUID, hostGID, err := butil.GetHostIDs(util.IDtoolsToRuntimeSpec(c.config.IDMappings.UIDMap), util.IDtoolsToRuntimeSpec(c.config.IDMappings.GIDMap), uint32(execUser.Uid), uint32(execUser.Gid)) if err != nil { return nil, err } // Add named volumes for _, namedVol := range c.config.NamedVolumes { volume, err := c.runtime.GetVolume(namedVol.Name) if err != nil { return nil, fmt.Errorf("error retrieving volume %s to add to container %s: %w", namedVol.Name, c.ID(), err) } mountPoint, err := volume.MountPoint() if err != nil { return nil, err } overlayFlag := false upperDir := "" workDir := "" for _, o := range namedVol.Options { if o == "O" { overlayFlag = true upperDir, workDir, err = getOverlayUpperAndWorkDir(namedVol.Options) if err != nil { return nil, err } } } if overlayFlag { var overlayMount spec.Mount var overlayOpts *overlay.Options contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) if err != nil { return nil, err } overlayOpts = &overlay.Options{RootUID: c.RootUID(), RootGID: c.RootGID(), UpperDirOptionFragment: upperDir, WorkDirOptionFragment: workDir, GraphOpts: c.runtime.store.GraphOptions(), } overlayMount, err = overlay.MountWithOptions(contentDir, mountPoint, namedVol.Dest, overlayOpts) if err != nil { return nil, fmt.Errorf("mounting overlay failed %q: %w", mountPoint, err) } for _, o := range namedVol.Options { if o == "U" { if err := c.ChangeHostPathOwnership(mountPoint, true, int(hostUID), int(hostGID)); err != nil { return nil, err } if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { return nil, err } } } g.AddMount(overlayMount) } else { volMount := spec.Mount{ Type: "bind", Source: mountPoint, Destination: namedVol.Dest, Options: namedVol.Options, } g.AddMount(volMount) } } // Check if the spec file mounts contain the options z, Z or U. // If they have z or Z, relabel the source directory and then remove the option. // If they have U, chown the source directory and them remove the option. for i := range g.Config.Mounts { m := &g.Config.Mounts[i] var options []string for _, o := range m.Options { switch o { case "U": if m.Type == "tmpfs" { options = append(options, []string{fmt.Sprintf("uid=%d", execUser.Uid), fmt.Sprintf("gid=%d", execUser.Gid)}...) } else { // only chown on initial creation of container if err := c.ChangeHostPathOwnership(m.Source, true, int(hostUID), int(hostGID)); err != nil { return nil, err } } case "z": fallthrough case "Z": if err := c.relabel(m.Source, c.MountLabel(), label.IsShared(o)); err != nil { return nil, err } default: options = append(options, o) } } m.Options = options } g.SetProcessSelinuxLabel(c.ProcessLabel()) g.SetLinuxMountLabel(c.MountLabel()) // Add bind mounts to container for dstPath, srcPath := range c.state.BindMounts { newMount := spec.Mount{ Type: "bind", Source: srcPath, Destination: dstPath, Options: []string{"bind", "rprivate"}, } if c.IsReadOnly() && dstPath != "/dev/shm" { newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") } if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") } if !MountExists(g.Mounts(), dstPath) { g.AddMount(newMount) } else { logrus.Infof("User mount overriding libpod mount at %q", dstPath) } } // Add overlay volumes for _, overlayVol := range c.config.OverlayVolumes { upperDir, workDir, err := getOverlayUpperAndWorkDir(overlayVol.Options) if err != nil { return nil, err } contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) if err != nil { return nil, err } overlayOpts := &overlay.Options{RootUID: c.RootUID(), RootGID: c.RootGID(), UpperDirOptionFragment: upperDir, WorkDirOptionFragment: workDir, GraphOpts: c.runtime.store.GraphOptions(), } overlayMount, err := overlay.MountWithOptions(contentDir, overlayVol.Source, overlayVol.Dest, overlayOpts) if err != nil { return nil, fmt.Errorf("mounting overlay failed %q: %w", overlayVol.Source, err) } // Check overlay volume options for _, o := range overlayVol.Options { if o == "U" { if err := c.ChangeHostPathOwnership(overlayVol.Source, true, int(hostUID), int(hostGID)); err != nil { return nil, err } if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { return nil, err } } } g.AddMount(overlayMount) } // Add image volumes as overlay mounts for _, volume := range c.config.ImageVolumes { // Mount the specified image. img, _, err := c.runtime.LibimageRuntime().LookupImage(volume.Source, nil) if err != nil { return nil, fmt.Errorf("error creating image volume %q:%q: %w", volume.Source, volume.Dest, err) } mountPoint, err := img.Mount(ctx, nil, "") if err != nil { return nil, fmt.Errorf("error mounting image volume %q:%q: %w", volume.Source, volume.Dest, err) } contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) if err != nil { return nil, fmt.Errorf("failed to create TempDir in the %s directory: %w", c.config.StaticDir, err) } var overlayMount spec.Mount if volume.ReadWrite { overlayMount, err = overlay.Mount(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) } else { overlayMount, err = overlay.MountReadOnly(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) } if err != nil { return nil, fmt.Errorf("creating overlay mount for image %q failed: %w", volume.Source, err) } g.AddMount(overlayMount) } hasHomeSet := false for _, s := range c.config.Spec.Process.Env { if strings.HasPrefix(s, "HOME=") { hasHomeSet = true break } } if !hasHomeSet && execUser.Home != "" { c.config.Spec.Process.Env = append(c.config.Spec.Process.Env, fmt.Sprintf("HOME=%s", execUser.Home)) } if c.config.User != "" { // User and Group must go together g.SetProcessUID(uint32(execUser.Uid)) g.SetProcessGID(uint32(execUser.Gid)) g.AddProcessAdditionalGid(uint32(execUser.Gid)) } if c.config.Umask != "" { decVal, err := strconv.ParseUint(c.config.Umask, 8, 32) if err != nil { return nil, fmt.Errorf("invalid Umask Value: %w", err) } umask := uint32(decVal) g.Config.Process.User.Umask = &umask } // Add addition groups if c.config.GroupAdd is not empty if len(c.config.Groups) > 0 { gids, err := lookup.GetContainerGroups(c.config.Groups, c.state.Mountpoint, overrides) if err != nil { return nil, fmt.Errorf("error looking up supplemental groups for container %s: %w", c.ID(), err) } for _, gid := range gids { g.AddProcessAdditionalGid(gid) } } if c.Systemd() { if err := c.setupSystemd(g.Mounts(), g); err != nil { return nil, fmt.Errorf("error adding systemd-specific mounts: %w", err) } } // Look up and add groups the user belongs to, if a group wasn't directly specified if !strings.Contains(c.config.User, ":") { // the gidMappings that are present inside the container user namespace var gidMappings []idtools.IDMap switch { case len(c.config.IDMappings.GIDMap) > 0: gidMappings = c.config.IDMappings.GIDMap case rootless.IsRootless(): // Check whether the current user namespace has enough gids available. availableGids, err := rootless.GetAvailableGids() if err != nil { return nil, fmt.Errorf("cannot read number of available GIDs: %w", err) } gidMappings = []idtools.IDMap{{ ContainerID: 0, HostID: 0, Size: int(availableGids), }} default: gidMappings = []idtools.IDMap{{ ContainerID: 0, HostID: 0, Size: math.MaxInt32, }} } for _, gid := range execUser.Sgids { isGIDAvailable := false for _, m := range gidMappings { if gid >= m.ContainerID && gid < m.ContainerID+m.Size { isGIDAvailable = true break } } if isGIDAvailable { g.AddProcessAdditionalGid(uint32(gid)) } else { logrus.Warnf("Additional gid=%d is not present in the user namespace, skip setting it", gid) } } } // Add shared namespaces from other containers if c.config.IPCNsCtr != "" { if err := c.addNamespaceContainer(&g, IPCNS, c.config.IPCNsCtr, spec.IPCNamespace); err != nil { return nil, err } } if c.config.MountNsCtr != "" { if err := c.addNamespaceContainer(&g, MountNS, c.config.MountNsCtr, spec.MountNamespace); err != nil { return nil, err } } if c.config.NetNsCtr != "" { if err := c.addNamespaceContainer(&g, NetNS, c.config.NetNsCtr, spec.NetworkNamespace); err != nil { return nil, err } } if c.config.PIDNsCtr != "" { if err := c.addNamespaceContainer(&g, PIDNS, c.config.PIDNsCtr, spec.PIDNamespace); err != nil { return nil, err } } if c.config.UserNsCtr != "" { if err := c.addNamespaceContainer(&g, UserNS, c.config.UserNsCtr, spec.UserNamespace); err != nil { return nil, err } if len(g.Config.Linux.UIDMappings) == 0 { // runc complains if no mapping is specified, even if we join another ns. So provide a dummy mapping g.AddLinuxUIDMapping(uint32(0), uint32(0), uint32(1)) g.AddLinuxGIDMapping(uint32(0), uint32(0), uint32(1)) } } availableUIDs, availableGIDs, err := rootless.GetAvailableIDMaps() if err != nil { if os.IsNotExist(err) { // The kernel-provided files only exist if user namespaces are supported logrus.Debugf("User or group ID mappings not available: %s", err) } else { return nil, err } } else { g.Config.Linux.UIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.UIDMappings, availableUIDs) g.Config.Linux.GIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.GIDMappings, availableGIDs) } // Hostname handling: // If we have a UTS namespace, set Hostname in the OCI spec. // Set the HOSTNAME environment variable unless explicitly overridden by // the user (already present in OCI spec). If we don't have a UTS ns, // set it to the host's hostname instead. hostname := c.Hostname() foundUTS := false for _, i := range c.config.Spec.Linux.Namespaces { if i.Type == spec.UTSNamespace && i.Path == "" { foundUTS = true g.SetHostname(hostname) break } } if !foundUTS { tmpHostname, err := os.Hostname() if err != nil { return nil, err } hostname = tmpHostname } needEnv := true for _, checkEnv := range g.Config.Process.Env { if strings.SplitN(checkEnv, "=", 2)[0] == "HOSTNAME" { needEnv = false break } } if needEnv { g.AddProcessEnv("HOSTNAME", hostname) } if c.config.UTSNsCtr != "" { if err := c.addNamespaceContainer(&g, UTSNS, c.config.UTSNsCtr, spec.UTSNamespace); err != nil { return nil, err } } if c.config.CgroupNsCtr != "" { if err := c.addNamespaceContainer(&g, CgroupNS, c.config.CgroupNsCtr, spec.CgroupNamespace); err != nil { return nil, err } } if c.config.UserNsCtr == "" && c.config.IDMappings.AutoUserNs { if err := g.AddOrReplaceLinuxNamespace(string(spec.UserNamespace), ""); err != nil { return nil, err } g.ClearLinuxUIDMappings() for _, uidmap := range c.config.IDMappings.UIDMap { g.AddLinuxUIDMapping(uint32(uidmap.HostID), uint32(uidmap.ContainerID), uint32(uidmap.Size)) } g.ClearLinuxGIDMappings() for _, gidmap := range c.config.IDMappings.GIDMap { g.AddLinuxGIDMapping(uint32(gidmap.HostID), uint32(gidmap.ContainerID), uint32(gidmap.Size)) } } g.SetRootPath(c.state.Mountpoint) g.AddAnnotation(annotations.Created, c.config.CreatedTime.Format(time.RFC3339Nano)) g.AddAnnotation("org.opencontainers.image.stopSignal", fmt.Sprintf("%d", c.config.StopSignal)) if _, exists := g.Config.Annotations[annotations.ContainerManager]; !exists { g.AddAnnotation(annotations.ContainerManager, annotations.ContainerManagerLibpod) } cgroupPath, err := c.getOCICgroupPath() if err != nil { return nil, err } g.SetLinuxCgroupsPath(cgroupPath) // Warning: CDI may alter g.Config in place. if len(c.config.CDIDevices) > 0 { registry := cdi.GetRegistry( cdi.WithAutoRefresh(false), ) if err := registry.Refresh(); err != nil { logrus.Debugf("The following error was triggered when refreshing the CDI registry: %v", err) } _, err := registry.InjectDevices(g.Config, c.config.CDIDevices...) if err != nil { return nil, fmt.Errorf("error setting up CDI devices: %w", err) } } // Mounts need to be sorted so paths will not cover other paths mounts := sortMounts(g.Mounts()) g.ClearMounts() // Determine property of RootPropagation based on volume properties. If // a volume is shared, then keep root propagation shared. This should // work for slave and private volumes too. // // For slave volumes, it can be either [r]shared/[r]slave. // // For private volumes any root propagation value should work. rootPropagation := "" for _, m := range mounts { // We need to remove all symlinks from tmpfs mounts. // Runc and other runtimes may choke on them. // Easy solution: use securejoin to do a scoped evaluation of // the links, then trim off the mount prefix. if m.Type == "tmpfs" { finalPath, err := securejoin.SecureJoin(c.state.Mountpoint, m.Destination) if err != nil { return nil, fmt.Errorf("error resolving symlinks for mount destination %s: %w", m.Destination, err) } trimmedPath := strings.TrimPrefix(finalPath, strings.TrimSuffix(c.state.Mountpoint, "/")) m.Destination = trimmedPath } g.AddMount(m) for _, opt := range m.Options { switch opt { case MountShared, MountRShared: if rootPropagation != MountShared && rootPropagation != MountRShared { rootPropagation = MountShared } case MountSlave, MountRSlave: if rootPropagation != MountShared && rootPropagation != MountRShared && rootPropagation != MountSlave && rootPropagation != MountRSlave { rootPropagation = MountRSlave } } } } if rootPropagation != "" { logrus.Debugf("Set root propagation to %q", rootPropagation) if err := g.SetLinuxRootPropagation(rootPropagation); err != nil { return nil, err } } // Warning: precreate hooks may alter g.Config in place. if c.state.ExtensionStageHooks, err = c.setupOCIHooks(ctx, g.Config); err != nil { return nil, fmt.Errorf("error setting up OCI Hooks: %w", err) } if len(c.config.EnvSecrets) > 0 { manager, err := c.runtime.SecretsManager() if err != nil { return nil, err } if err != nil { return nil, err } for name, secr := range c.config.EnvSecrets { _, data, err := manager.LookupSecretData(secr.Name) if err != nil { return nil, err } g.AddProcessEnv(name, string(data)) } } // Pass down the LISTEN_* environment (see #10443). for _, key := range []string{"LISTEN_PID", "LISTEN_FDS", "LISTEN_FDNAMES"} { if val, ok := os.LookupEnv(key); ok { // Force the PID to `1` since we cannot rely on (all // versions of) all runtimes to do it for us. if key == "LISTEN_PID" { val = "1" } g.AddProcessEnv(key, val) } } return g.Config, nil }