diff options
Diffstat (limited to 'libpod')
41 files changed, 3680 insertions, 2916 deletions
diff --git a/libpod/boltdb_state.go b/libpod/boltdb_state.go index 81f11410b..e5a7e20fc 100644 --- a/libpod/boltdb_state.go +++ b/libpod/boltdb_state.go @@ -1278,7 +1278,7 @@ func (s *BoltState) NetworkConnect(ctr *Container, network string, opts types.Pe } netConnected := ctrNetworksBkt.Get([]byte(network)) if netConnected != nil { - return fmt.Errorf("container %s is already connected to network %q: %w", ctr.ID(), network, define.ErrNetworkExists) + return fmt.Errorf("container %s is already connected to network %q: %w", ctr.ID(), network, define.ErrNetworkConnected) } // Add the network diff --git a/libpod/container.go b/libpod/container.go index 6c05b1084..1891b124f 100644 --- a/libpod/container.go +++ b/libpod/container.go @@ -237,6 +237,9 @@ type ContainerNamedVolume struct { Dest string `json:"dest"` // Options are fstab style mount options Options []string `json:"options,omitempty"` + // IsAnonymous sets the named volume as anonymous even if it has a name + // This is used for emptyDir volumes from a kube yaml + IsAnonymous bool `json:"setAnonymous,omitempty"` } // ContainerOverlayVolume is a overlay volume that will be mounted into the @@ -1130,20 +1133,6 @@ func (c *Container) NetworkDisabled() (bool, error) { return networkDisabled(c) } -func networkDisabled(c *Container) (bool, error) { - if c.config.CreateNetNS { - return false, nil - } - if !c.config.PostConfigureNetNS { - for _, ns := range c.config.Spec.Linux.Namespaces { - if ns.Type == spec.NetworkNamespace { - return ns.Path == "", nil - } - } - } - return false, nil -} - func (c *Container) HostNetwork() bool { if c.config.CreateNetNS || c.config.NetNsCtr != "" { return false diff --git a/libpod/container_api.go b/libpod/container_api.go index 2ff4bfe08..f88e38ce1 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -16,6 +16,7 @@ import ( "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/signal" "github.com/containers/storage/pkg/archive" + 
spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" ) @@ -98,6 +99,15 @@ func (c *Container) Start(ctx context.Context, recursive bool) error { return c.start() } +// Update updates the given container. +// only the cgroup config can be updated and therefore only a linux resource spec is passed. +func (c *Container) Update(res *spec.LinuxResources) error { + if err := c.syncContainer(); err != nil { + return err + } + return c.update(res) +} + // StartAndAttach starts a container and attaches to it. // This acts as a combination of the Start and Attach APIs, ensuring proper // ordering of the two such that no output from the container is lost (e.g. the diff --git a/libpod/container_config.go b/libpod/container_config.go index bd9816651..f3585d22c 100644 --- a/libpod/container_config.go +++ b/libpod/container_config.go @@ -7,6 +7,7 @@ import ( "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/secrets" "github.com/containers/image/v5/manifest" + "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/pkg/namespaces" "github.com/containers/podman/v4/pkg/specgen" "github.com/containers/storage" @@ -392,6 +393,8 @@ type ContainerMiscConfig struct { Systemd *bool `json:"systemd,omitempty"` // HealthCheckConfig has the health check command and related timings HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"` + // HealthCheckOnFailureAction defines an action to take once the container turns unhealthy. + HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"healthcheck_on_failure_action"` // PreserveFDs is a number of additional file descriptors (in addition // to 0, 1, 2) that will be passed to the executed process. The total FDs // passed will be 3 + PreserveFDs. 
diff --git a/libpod/container_freebsd.go b/libpod/container_freebsd.go index f9fbc4daa..7292ba37a 100644 --- a/libpod/container_freebsd.go +++ b/libpod/container_freebsd.go @@ -10,3 +10,13 @@ type containerPlatformState struct { // namespace. NetworkJail string `json:"-"` } + +func networkDisabled(c *Container) (bool, error) { + if c.config.CreateNetNS { + return false, nil + } + if !c.config.PostConfigureNetNS { + return c.state.NetworkJail == "", nil + } + return false, nil +} diff --git a/libpod/container_inspect.go b/libpod/container_inspect.go index 5e2ab2818..ad8bae286 100644 --- a/libpod/container_inspect.go +++ b/libpod/container_inspect.go @@ -390,6 +390,8 @@ func (c *Container) generateInspectContainerConfig(spec *spec.Spec) *define.Insp // leak. ctrConfig.Healthcheck = c.config.HealthCheckConfig + ctrConfig.HealthcheckOnFailureAction = c.config.HealthCheckOnFailureAction.String() + ctrConfig.CreateCommand = c.config.CreateCommand ctrConfig.Timezone = c.config.Timezone diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 60fb29607..32674235a 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -27,6 +27,7 @@ import ( cutil "github.com/containers/common/pkg/util" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/libpod/events" + "github.com/containers/podman/v4/libpod/shutdown" "github.com/containers/podman/v4/pkg/ctime" "github.com/containers/podman/v4/pkg/lookup" "github.com/containers/podman/v4/pkg/rootless" @@ -1038,6 +1039,13 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { } } + // To ensure that we don't lose track of Conmon if hit by a SIGTERM + // in the middle of setting up the container, inhibit shutdown signals + // until after we save Conmon's PID to the state. + // TODO: This can likely be removed once conmon-rs support merges. 
+ shutdown.Inhibit() + defer shutdown.Uninhibit() + // With the spec complete, do an OCI create if _, err = c.ociRuntime.CreateContainer(c, nil); err != nil { return err @@ -1073,6 +1081,7 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { if err := c.save(); err != nil { return err } + if c.config.HealthCheckConfig != nil { if err := c.createTimer(); err != nil { logrus.Error(err) @@ -2343,3 +2352,12 @@ func (c *Container) extractSecretToCtrStorage(secr *ContainerSecret) error { } return nil } + +// update calls the ociRuntime update function to modify a cgroup config after container creation +func (c *Container) update(resources *spec.LinuxResources) error { + if err := c.ociRuntime.UpdateContainer(c, resources); err != nil { + return err + } + logrus.Debugf("updated container %s", c.ID()) + return nil +} diff --git a/libpod/container_internal_common.go b/libpod/container_internal_common.go new file mode 100644 index 000000000..192a86b6a --- /dev/null +++ b/libpod/container_internal_common.go @@ -0,0 +1,2699 @@ +//go:build linux || freebsd +// +build linux freebsd + +package libpod + +import ( + "context" + "errors" + "fmt" + "io" + "io/ioutil" + "math" + "os" + "os/user" + "path" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" + + metadata "github.com/checkpoint-restore/checkpointctl/lib" + "github.com/checkpoint-restore/go-criu/v5/stats" + cdi "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" + "github.com/containers/buildah" + "github.com/containers/buildah/pkg/chrootuser" + "github.com/containers/buildah/pkg/overlay" + butil "github.com/containers/buildah/util" + "github.com/containers/common/libnetwork/etchosts" + "github.com/containers/common/libnetwork/resolvconf" + "github.com/containers/common/libnetwork/types" + "github.com/containers/common/pkg/apparmor" + "github.com/containers/common/pkg/chown" + "github.com/containers/common/pkg/config" + 
"github.com/containers/common/pkg/subscriptions" + "github.com/containers/common/pkg/umask" + cutil "github.com/containers/common/pkg/util" + is "github.com/containers/image/v5/storage" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/libpod/events" + "github.com/containers/podman/v4/pkg/annotations" + "github.com/containers/podman/v4/pkg/checkpoint/crutils" + "github.com/containers/podman/v4/pkg/criu" + "github.com/containers/podman/v4/pkg/lookup" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/podman/v4/pkg/util" + "github.com/containers/podman/v4/version" + "github.com/containers/storage/pkg/archive" + "github.com/containers/storage/pkg/idtools" + "github.com/containers/storage/pkg/lockfile" + securejoin "github.com/cyphar/filepath-securejoin" + runcuser "github.com/opencontainers/runc/libcontainer/user" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/runtime-tools/generate" + "github.com/opencontainers/selinux/go-selinux" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" +) + +// Internal only function which returns upper and work dir from +// overlay options. 
+func getOverlayUpperAndWorkDir(options []string) (string, string, error) { + upperDir := "" + workDir := "" + for _, o := range options { + if strings.HasPrefix(o, "upperdir") { + splitOpt := strings.SplitN(o, "=", 2) + if len(splitOpt) > 1 { + upperDir = splitOpt[1] + if upperDir == "" { + return "", "", errors.New("cannot accept empty value for upperdir") + } + } + } + if strings.HasPrefix(o, "workdir") { + splitOpt := strings.SplitN(o, "=", 2) + if len(splitOpt) > 1 { + workDir = splitOpt[1] + if workDir == "" { + return "", "", errors.New("cannot accept empty value for workdir") + } + } + } + } + if (upperDir != "" && workDir == "") || (upperDir == "" && workDir != "") { + return "", "", errors.New("must specify both upperdir and workdir") + } + return upperDir, workDir, nil +} + +// Generate spec for a container +// Accepts a map of the container's dependencies +func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { + overrides := c.getUserOverrides() + execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, c.config.User, overrides) + if err != nil { + if cutil.StringInSlice(c.config.User, c.config.HostUsers) { + execUser, err = lookupHostUser(c.config.User) + } + if err != nil { + return nil, err + } + } + + // NewFromSpec() is deprecated according to its comment + // however the recommended replace just causes a nil map panic + //nolint:staticcheck + g := generate.NewFromSpec(c.config.Spec) + + // If the flag to mount all devices is set for a privileged container, add + // all the devices from the host's machine into the container + if c.config.MountAllDevices { + if err := util.AddPrivilegedDevices(&g); err != nil { + return nil, err + } + } + + // If network namespace was requested, add it now + if err := c.addNetworkNamespace(&g); err != nil { + return nil, err + } + + // Apply AppArmor checks and load the default profile if needed. 
+ if len(c.config.Spec.Process.ApparmorProfile) > 0 { + updatedProfile, err := apparmor.CheckProfileAndLoadDefault(c.config.Spec.Process.ApparmorProfile) + if err != nil { + return nil, err + } + g.SetProcessApparmorProfile(updatedProfile) + } + + if err := c.makeBindMounts(); err != nil { + return nil, err + } + + if err := c.mountNotifySocket(g); err != nil { + return nil, err + } + + // Get host UID and GID based on the container process UID and GID. + hostUID, hostGID, err := butil.GetHostIDs(util.IDtoolsToRuntimeSpec(c.config.IDMappings.UIDMap), util.IDtoolsToRuntimeSpec(c.config.IDMappings.GIDMap), uint32(execUser.Uid), uint32(execUser.Gid)) + if err != nil { + return nil, err + } + + // Add named volumes + for _, namedVol := range c.config.NamedVolumes { + volume, err := c.runtime.GetVolume(namedVol.Name) + if err != nil { + return nil, fmt.Errorf("error retrieving volume %s to add to container %s: %w", namedVol.Name, c.ID(), err) + } + mountPoint, err := volume.MountPoint() + if err != nil { + return nil, err + } + + overlayFlag := false + upperDir := "" + workDir := "" + for _, o := range namedVol.Options { + if o == "O" { + overlayFlag = true + upperDir, workDir, err = getOverlayUpperAndWorkDir(namedVol.Options) + if err != nil { + return nil, err + } + } + } + + if overlayFlag { + var overlayMount spec.Mount + var overlayOpts *overlay.Options + contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) + if err != nil { + return nil, err + } + + overlayOpts = &overlay.Options{RootUID: c.RootUID(), + RootGID: c.RootGID(), + UpperDirOptionFragment: upperDir, + WorkDirOptionFragment: workDir, + GraphOpts: c.runtime.store.GraphOptions(), + } + + overlayMount, err = overlay.MountWithOptions(contentDir, mountPoint, namedVol.Dest, overlayOpts) + if err != nil { + return nil, fmt.Errorf("mounting overlay failed %q: %w", mountPoint, err) + } + + for _, o := range namedVol.Options { + if o == "U" { + if err := 
c.ChangeHostPathOwnership(mountPoint, true, int(hostUID), int(hostGID)); err != nil { + return nil, err + } + + if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { + return nil, err + } + } + } + g.AddMount(overlayMount) + } else { + volMount := spec.Mount{ + Type: define.TypeBind, + Source: mountPoint, + Destination: namedVol.Dest, + Options: namedVol.Options, + } + g.AddMount(volMount) + } + } + + // Check if the spec file mounts contain the options z, Z or U. + // If they have z or Z, relabel the source directory and then remove the option. + // If they have U, chown the source directory and them remove the option. + for i := range g.Config.Mounts { + m := &g.Config.Mounts[i] + var options []string + for _, o := range m.Options { + switch o { + case "U": + if m.Type == "tmpfs" { + options = append(options, []string{fmt.Sprintf("uid=%d", execUser.Uid), fmt.Sprintf("gid=%d", execUser.Gid)}...) + } else { + // only chown on initial creation of container + if err := c.ChangeHostPathOwnership(m.Source, true, int(hostUID), int(hostGID)); err != nil { + return nil, err + } + } + case "z": + fallthrough + case "Z": + if err := c.relabel(m.Source, c.MountLabel(), label.IsShared(o)); err != nil { + return nil, err + } + + default: + options = append(options, o) + } + } + m.Options = options + } + + c.setProcessLabel(&g) + c.setMountLabel(&g) + + // Add bind mounts to container + for dstPath, srcPath := range c.state.BindMounts { + newMount := spec.Mount{ + Type: define.TypeBind, + Source: srcPath, + Destination: dstPath, + Options: bindOptions, + } + if c.IsReadOnly() && dstPath != "/dev/shm" { + newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") + } + if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { + newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") + } + if !MountExists(g.Mounts(), dstPath) { + g.AddMount(newMount) + } else { + 
logrus.Infof("User mount overriding libpod mount at %q", dstPath) + } + } + + // Add overlay volumes + for _, overlayVol := range c.config.OverlayVolumes { + upperDir, workDir, err := getOverlayUpperAndWorkDir(overlayVol.Options) + if err != nil { + return nil, err + } + contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) + if err != nil { + return nil, err + } + overlayOpts := &overlay.Options{RootUID: c.RootUID(), + RootGID: c.RootGID(), + UpperDirOptionFragment: upperDir, + WorkDirOptionFragment: workDir, + GraphOpts: c.runtime.store.GraphOptions(), + } + + overlayMount, err := overlay.MountWithOptions(contentDir, overlayVol.Source, overlayVol.Dest, overlayOpts) + if err != nil { + return nil, fmt.Errorf("mounting overlay failed %q: %w", overlayVol.Source, err) + } + + // Check overlay volume options + for _, o := range overlayVol.Options { + if o == "U" { + if err := c.ChangeHostPathOwnership(overlayVol.Source, true, int(hostUID), int(hostGID)); err != nil { + return nil, err + } + + if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { + return nil, err + } + } + } + + g.AddMount(overlayMount) + } + + // Add image volumes as overlay mounts + for _, volume := range c.config.ImageVolumes { + // Mount the specified image. 
+ img, _, err := c.runtime.LibimageRuntime().LookupImage(volume.Source, nil) + if err != nil { + return nil, fmt.Errorf("error creating image volume %q:%q: %w", volume.Source, volume.Dest, err) + } + mountPoint, err := img.Mount(ctx, nil, "") + if err != nil { + return nil, fmt.Errorf("error mounting image volume %q:%q: %w", volume.Source, volume.Dest, err) + } + + contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) + if err != nil { + return nil, fmt.Errorf("failed to create TempDir in the %s directory: %w", c.config.StaticDir, err) + } + + var overlayMount spec.Mount + if volume.ReadWrite { + overlayMount, err = overlay.Mount(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) + } else { + overlayMount, err = overlay.MountReadOnly(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) + } + if err != nil { + return nil, fmt.Errorf("creating overlay mount for image %q failed: %w", volume.Source, err) + } + g.AddMount(overlayMount) + } + + hasHomeSet := false + for _, s := range c.config.Spec.Process.Env { + if strings.HasPrefix(s, "HOME=") { + hasHomeSet = true + break + } + } + if !hasHomeSet && execUser.Home != "" { + c.config.Spec.Process.Env = append(c.config.Spec.Process.Env, fmt.Sprintf("HOME=%s", execUser.Home)) + } + + if c.config.User != "" { + // User and Group must go together + g.SetProcessUID(uint32(execUser.Uid)) + g.SetProcessGID(uint32(execUser.Gid)) + g.AddProcessAdditionalGid(uint32(execUser.Gid)) + } + + if c.config.Umask != "" { + decVal, err := strconv.ParseUint(c.config.Umask, 8, 32) + if err != nil { + return nil, fmt.Errorf("invalid Umask Value: %w", err) + } + umask := uint32(decVal) + g.Config.Process.User.Umask = &umask + } + + // Add addition groups if c.config.GroupAdd is not empty + if len(c.config.Groups) > 0 { + gids, err := lookup.GetContainerGroups(c.config.Groups, c.state.Mountpoint, overrides) + if err != nil { + 
return nil, fmt.Errorf("error looking up supplemental groups for container %s: %w", c.ID(), err) + } + for _, gid := range gids { + g.AddProcessAdditionalGid(gid) + } + } + + if err := c.addSystemdMounts(&g); err != nil { + return nil, err + } + + // Look up and add groups the user belongs to, if a group wasn't directly specified + if !strings.Contains(c.config.User, ":") { + // the gidMappings that are present inside the container user namespace + var gidMappings []idtools.IDMap + + switch { + case len(c.config.IDMappings.GIDMap) > 0: + gidMappings = c.config.IDMappings.GIDMap + case rootless.IsRootless(): + // Check whether the current user namespace has enough gids available. + availableGids, err := rootless.GetAvailableGids() + if err != nil { + return nil, fmt.Errorf("cannot read number of available GIDs: %w", err) + } + gidMappings = []idtools.IDMap{{ + ContainerID: 0, + HostID: 0, + Size: int(availableGids), + }} + default: + gidMappings = []idtools.IDMap{{ + ContainerID: 0, + HostID: 0, + Size: math.MaxInt32, + }} + } + for _, gid := range execUser.Sgids { + isGIDAvailable := false + for _, m := range gidMappings { + if gid >= m.ContainerID && gid < m.ContainerID+m.Size { + isGIDAvailable = true + break + } + } + if isGIDAvailable { + g.AddProcessAdditionalGid(uint32(gid)) + } else { + logrus.Warnf("Additional gid=%d is not present in the user namespace, skip setting it", gid) + } + } + } + + // Add shared namespaces from other containers + if err := c.addSharedNamespaces(&g); err != nil { + return nil, err + } + + g.SetRootPath(c.state.Mountpoint) + g.AddAnnotation(annotations.Created, c.config.CreatedTime.Format(time.RFC3339Nano)) + g.AddAnnotation("org.opencontainers.image.stopSignal", fmt.Sprintf("%d", c.config.StopSignal)) + + if _, exists := g.Config.Annotations[annotations.ContainerManager]; !exists { + g.AddAnnotation(annotations.ContainerManager, annotations.ContainerManagerLibpod) + } + + if err := c.setCgroupsPath(&g); err != nil { + return nil, 
err + } + + // Warning: CDI may alter g.Config in place. + if len(c.config.CDIDevices) > 0 { + registry := cdi.GetRegistry( + cdi.WithAutoRefresh(false), + ) + if err := registry.Refresh(); err != nil { + logrus.Debugf("The following error was triggered when refreshing the CDI registry: %v", err) + } + _, err := registry.InjectDevices(g.Config, c.config.CDIDevices...) + if err != nil { + return nil, fmt.Errorf("error setting up CDI devices: %w", err) + } + } + + // Mounts need to be sorted so paths will not cover other paths + mounts := sortMounts(g.Mounts()) + g.ClearMounts() + + for _, m := range mounts { + // We need to remove all symlinks from tmpfs mounts. + // Runc and other runtimes may choke on them. + // Easy solution: use securejoin to do a scoped evaluation of + // the links, then trim off the mount prefix. + if m.Type == "tmpfs" { + finalPath, err := securejoin.SecureJoin(c.state.Mountpoint, m.Destination) + if err != nil { + return nil, fmt.Errorf("error resolving symlinks for mount destination %s: %w", m.Destination, err) + } + trimmedPath := strings.TrimPrefix(finalPath, strings.TrimSuffix(c.state.Mountpoint, "/")) + m.Destination = trimmedPath + } + g.AddMount(m) + } + + if err := c.addRootPropagation(&g, mounts); err != nil { + return nil, err + } + + // Warning: precreate hooks may alter g.Config in place. + if c.state.ExtensionStageHooks, err = c.setupOCIHooks(ctx, g.Config); err != nil { + return nil, fmt.Errorf("error setting up OCI Hooks: %w", err) + } + if len(c.config.EnvSecrets) > 0 { + manager, err := c.runtime.SecretsManager() + if err != nil { + return nil, err + } + if err != nil { + return nil, err + } + for name, secr := range c.config.EnvSecrets { + _, data, err := manager.LookupSecretData(secr.Name) + if err != nil { + return nil, err + } + g.AddProcessEnv(name, string(data)) + } + } + + // Pass down the LISTEN_* environment (see #10443). 
+ for _, key := range []string{"LISTEN_PID", "LISTEN_FDS", "LISTEN_FDNAMES"} { + if val, ok := os.LookupEnv(key); ok { + // Force the PID to `1` since we cannot rely on (all + // versions of) all runtimes to do it for us. + if key == "LISTEN_PID" { + val = "1" + } + g.AddProcessEnv(key, val) + } + } + + return g.Config, nil +} + +// isWorkDirSymlink returns true if resolved workdir is symlink or a chain of symlinks, +// and final resolved target is present either on volume, mount or inside of container +// otherwise it returns false. Following function is meant for internal use only and +// can change at any point of time. +func (c *Container) isWorkDirSymlink(resolvedPath string) bool { + // We cannot create workdir since explicit --workdir is + // set in config but workdir could also be a symlink. + // If it's a symlink, check if the resolved target is present in the container. + // If so, that's a valid use case: return nil. + + maxSymLinks := 0 + for { + // Linux only supports a chain of 40 links. + // Reference: https://github.com/torvalds/linux/blob/master/include/linux/namei.h#L13 + if maxSymLinks > 40 { + break + } + resolvedSymlink, err := os.Readlink(resolvedPath) + if err != nil { + // End sym-link resolution loop. + break + } + if resolvedSymlink != "" { + _, resolvedSymlinkWorkdir, err := c.resolvePath(c.state.Mountpoint, resolvedSymlink) + if isPathOnVolume(c, resolvedSymlinkWorkdir) || isPathOnBindMount(c, resolvedSymlinkWorkdir) { + // Resolved symlink exists on external volume or mount + return true + } + if err != nil { + // Could not resolve path so end sym-link resolution loop. + break + } + if resolvedSymlinkWorkdir != "" { + resolvedPath = resolvedSymlinkWorkdir + _, err := os.Stat(resolvedSymlinkWorkdir) + if err == nil { + // Symlink resolved successfully and resolved path exists on container, + // this is a valid use-case so return nil. 
+ logrus.Debugf("Workdir is a symlink with target to %q and resolved symlink exists on container", resolvedSymlink) + return true + } + } + } + maxSymLinks++ + } + return false +} + +// resolveWorkDir resolves the container's workdir and, depending on the +// configuration, will create it, or error out if it does not exist. +// Note that the container must be mounted before. +func (c *Container) resolveWorkDir() error { + workdir := c.WorkingDir() + + // If the specified workdir is a subdir of a volume or mount, + // we don't need to do anything. The runtime is taking care of + // that. + if isPathOnVolume(c, workdir) || isPathOnBindMount(c, workdir) { + logrus.Debugf("Workdir %q resolved to a volume or mount", workdir) + return nil + } + + _, resolvedWorkdir, err := c.resolvePath(c.state.Mountpoint, workdir) + if err != nil { + return err + } + logrus.Debugf("Workdir %q resolved to host path %q", workdir, resolvedWorkdir) + + st, err := os.Stat(resolvedWorkdir) + if err == nil { + if !st.IsDir() { + return fmt.Errorf("workdir %q exists on container %s, but is not a directory", workdir, c.ID()) + } + return nil + } + if !c.config.CreateWorkingDir { + // No need to create it (e.g., `--workdir=/foo`), so let's make sure + // the path exists on the container. + if err != nil { + if os.IsNotExist(err) { + // If resolved Workdir path gets marked as a valid symlink, + // return nil cause this is valid use-case. + if c.isWorkDirSymlink(resolvedWorkdir) { + return nil + } + return fmt.Errorf("workdir %q does not exist on container %s", workdir, c.ID()) + } + // This might be a serious error (e.g., permission), so + // we need to return the full error. 
+ return fmt.Errorf("error detecting workdir %q on container %s: %w", workdir, c.ID(), err) + } + return nil + } + if err := os.MkdirAll(resolvedWorkdir, 0755); err != nil { + if os.IsExist(err) { + return nil + } + return fmt.Errorf("error creating container %s workdir: %w", c.ID(), err) + } + + // Ensure container entrypoint is created (if required). + uid, gid, _, err := chrootuser.GetUser(c.state.Mountpoint, c.User()) + if err != nil { + return fmt.Errorf("error looking up %s inside of the container %s: %w", c.User(), c.ID(), err) + } + if err := os.Chown(resolvedWorkdir, int(uid), int(gid)); err != nil { + return fmt.Errorf("error chowning container %s workdir to container root: %w", c.ID(), err) + } + + return nil +} + +func (c *Container) getUserOverrides() *lookup.Overrides { + var hasPasswdFile, hasGroupFile bool + overrides := lookup.Overrides{} + for _, m := range c.config.Spec.Mounts { + if m.Destination == "/etc/passwd" { + overrides.ContainerEtcPasswdPath = m.Source + hasPasswdFile = true + } + if m.Destination == "/etc/group" { + overrides.ContainerEtcGroupPath = m.Source + hasGroupFile = true + } + if m.Destination == "/etc" { + if !hasPasswdFile { + overrides.ContainerEtcPasswdPath = filepath.Join(m.Source, "passwd") + } + if !hasGroupFile { + overrides.ContainerEtcGroupPath = filepath.Join(m.Source, "group") + } + } + } + if path, ok := c.state.BindMounts["/etc/passwd"]; ok { + overrides.ContainerEtcPasswdPath = path + } + return &overrides +} + +func lookupHostUser(name string) (*runcuser.ExecUser, error) { + var execUser runcuser.ExecUser + // Look up User on host + u, err := util.LookupUser(name) + if err != nil { + return &execUser, err + } + uid, err := strconv.ParseUint(u.Uid, 8, 32) + if err != nil { + return &execUser, err + } + + gid, err := strconv.ParseUint(u.Gid, 8, 32) + if err != nil { + return &execUser, err + } + execUser.Uid = int(uid) + execUser.Gid = int(gid) + execUser.Home = u.HomeDir + return &execUser, nil +} + +// 
mountNotifySocket mounts the NOTIFY_SOCKET into the container if it's set +// and if the sdnotify mode is set to container. It also sets c.notifySocket +// to avoid redundantly looking up the env variable. +func (c *Container) mountNotifySocket(g generate.Generator) error { + if c.config.SdNotifySocket == "" { + return nil + } + if c.config.SdNotifyMode != define.SdNotifyModeContainer { + return nil + } + + notifyDir := filepath.Join(c.bundlePath(), "notify") + logrus.Debugf("Checking notify %q dir", notifyDir) + if err := os.MkdirAll(notifyDir, 0755); err != nil { + if !os.IsExist(err) { + return fmt.Errorf("unable to create notify %q dir: %w", notifyDir, err) + } + } + if err := label.Relabel(notifyDir, c.MountLabel(), true); err != nil { + return fmt.Errorf("relabel failed %q: %w", notifyDir, err) + } + logrus.Debugf("Add bindmount notify %q dir", notifyDir) + if _, ok := c.state.BindMounts["/run/notify"]; !ok { + c.state.BindMounts["/run/notify"] = notifyDir + } + + // Set the container's notify socket to the proxy socket created by conmon + g.AddProcessEnv("NOTIFY_SOCKET", "/run/notify/notify.sock") + + return nil +} + +func (c *Container) addCheckpointImageMetadata(importBuilder *buildah.Builder) error { + // Get information about host environment + hostInfo, err := c.Runtime().hostInfo() + if err != nil { + return fmt.Errorf("getting host info: %v", err) + } + + criuVersion, err := criu.GetCriuVersion() + if err != nil { + return fmt.Errorf("getting criu version: %v", err) + } + + rootfsImageID, rootfsImageName := c.Image() + + // Add image annotations with information about the container and the host. 
+ // This information is useful to check compatibility before restoring the checkpoint + + checkpointImageAnnotations := map[string]string{ + define.CheckpointAnnotationName: c.config.Name, + define.CheckpointAnnotationRawImageName: c.config.RawImageName, + define.CheckpointAnnotationRootfsImageID: rootfsImageID, + define.CheckpointAnnotationRootfsImageName: rootfsImageName, + define.CheckpointAnnotationPodmanVersion: version.Version.String(), + define.CheckpointAnnotationCriuVersion: strconv.Itoa(criuVersion), + define.CheckpointAnnotationRuntimeName: hostInfo.OCIRuntime.Name, + define.CheckpointAnnotationRuntimeVersion: hostInfo.OCIRuntime.Version, + define.CheckpointAnnotationConmonVersion: hostInfo.Conmon.Version, + define.CheckpointAnnotationHostArch: hostInfo.Arch, + define.CheckpointAnnotationHostKernel: hostInfo.Kernel, + define.CheckpointAnnotationCgroupVersion: hostInfo.CgroupsVersion, + define.CheckpointAnnotationDistributionVersion: hostInfo.Distribution.Version, + define.CheckpointAnnotationDistributionName: hostInfo.Distribution.Distribution, + } + + for key, value := range checkpointImageAnnotations { + importBuilder.SetAnnotation(key, value) + } + + return nil +} + +func (c *Container) resolveCheckpointImageName(options *ContainerCheckpointOptions) error { + if options.CreateImage == "" { + return nil + } + + // Resolve image name + resolvedImageName, err := c.runtime.LibimageRuntime().ResolveName(options.CreateImage) + if err != nil { + return err + } + + options.CreateImage = resolvedImageName + return nil +} + +func (c *Container) createCheckpointImage(ctx context.Context, options ContainerCheckpointOptions) error { + if options.CreateImage == "" { + return nil + } + logrus.Debugf("Create checkpoint image %s", options.CreateImage) + + // Create storage reference + imageRef, err := is.Transport.ParseStoreReference(c.runtime.store, options.CreateImage) + if err != nil { + return errors.New("failed to parse image name") + } + + // Build an image 
scratch + builderOptions := buildah.BuilderOptions{ + FromImage: "scratch", + } + importBuilder, err := buildah.NewBuilder(ctx, c.runtime.store, builderOptions) + if err != nil { + return err + } + // Clean up buildah working container + defer func() { + if err := importBuilder.Delete(); err != nil { + logrus.Errorf("Image builder delete failed: %v", err) + } + }() + + if err := c.prepareCheckpointExport(); err != nil { + return err + } + + // Export checkpoint into temporary tar file + tmpDir, err := ioutil.TempDir("", "checkpoint_image_") + if err != nil { + return err + } + defer os.RemoveAll(tmpDir) + + options.TargetFile = path.Join(tmpDir, "checkpoint.tar") + + if err := c.exportCheckpoint(options); err != nil { + return err + } + + // Copy checkpoint from temporary tar file in the image + addAndCopyOptions := buildah.AddAndCopyOptions{} + if err := importBuilder.Add("", true, addAndCopyOptions, options.TargetFile); err != nil { + return err + } + + if err := c.addCheckpointImageMetadata(importBuilder); err != nil { + return err + } + + commitOptions := buildah.CommitOptions{ + Squash: true, + SystemContext: c.runtime.imageContext, + } + + // Create checkpoint image + id, _, _, err := importBuilder.Commit(ctx, imageRef, commitOptions) + if err != nil { + return err + } + logrus.Debugf("Created checkpoint image: %s", id) + return nil +} + +func (c *Container) exportCheckpoint(options ContainerCheckpointOptions) error { + if len(c.Dependencies()) == 1 { + // Check if the dependency is an infra container. If it is we can checkpoint + // the container out of the Pod. 
+ if c.config.Pod == "" { + return errors.New("cannot export checkpoints of containers with dependencies") + } + + pod, err := c.runtime.state.Pod(c.config.Pod) + if err != nil { + return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), c.config.Pod, err) + } + infraID, err := pod.InfraContainerID() + if err != nil { + return fmt.Errorf("cannot retrieve infra container ID for pod %s: %w", c.config.Pod, err) + } + if c.Dependencies()[0] != infraID { + return errors.New("cannot export checkpoints of containers with dependencies") + } + } + if len(c.Dependencies()) > 1 { + return errors.New("cannot export checkpoints of containers with dependencies") + } + logrus.Debugf("Exporting checkpoint image of container %q to %q", c.ID(), options.TargetFile) + + includeFiles := []string{ + "artifacts", + metadata.DevShmCheckpointTar, + metadata.ConfigDumpFile, + metadata.SpecDumpFile, + metadata.NetworkStatusFile, + stats.StatsDump, + } + + if c.LogDriver() == define.KubernetesLogging || + c.LogDriver() == define.JSONLogging { + includeFiles = append(includeFiles, "ctr.log") + } + if options.PreCheckPoint { + includeFiles = append(includeFiles, preCheckpointDir) + } else { + includeFiles = append(includeFiles, metadata.CheckpointDirectory) + } + // Get root file-system changes included in the checkpoint archive + var addToTarFiles []string + if !options.IgnoreRootfs { + // To correctly track deleted files, let's go through the output of 'podman diff' + rootFsChanges, err := c.runtime.GetDiff("", c.ID(), define.DiffContainer) + if err != nil { + return fmt.Errorf("error exporting root file-system diff for %q: %w", c.ID(), err) + } + + addToTarFiles, err := crutils.CRCreateRootFsDiffTar(&rootFsChanges, c.state.Mountpoint, c.bundlePath()) + if err != nil { + return err + } + + includeFiles = append(includeFiles, addToTarFiles...) 
+ } + + // Folder containing archived volumes that will be included in the export + expVolDir := filepath.Join(c.bundlePath(), metadata.CheckpointVolumesDirectory) + + // Create an archive for each volume associated with the container + if !options.IgnoreVolumes { + if err := os.MkdirAll(expVolDir, 0700); err != nil { + return fmt.Errorf("error creating volumes export directory %q: %w", expVolDir, err) + } + + for _, v := range c.config.NamedVolumes { + volumeTarFilePath := filepath.Join(metadata.CheckpointVolumesDirectory, v.Name+".tar") + volumeTarFileFullPath := filepath.Join(c.bundlePath(), volumeTarFilePath) + + volumeTarFile, err := os.Create(volumeTarFileFullPath) + if err != nil { + return fmt.Errorf("error creating %q: %w", volumeTarFileFullPath, err) + } + + volume, err := c.runtime.GetVolume(v.Name) + if err != nil { + return err + } + + mp, err := volume.MountPoint() + if err != nil { + return err + } + if mp == "" { + return fmt.Errorf("volume %s is not mounted, cannot export: %w", volume.Name(), define.ErrInternal) + } + + input, err := archive.TarWithOptions(mp, &archive.TarOptions{ + Compression: archive.Uncompressed, + IncludeSourceDir: true, + }) + if err != nil { + return fmt.Errorf("error reading volume directory %q: %w", v.Dest, err) + } + + _, err = io.Copy(volumeTarFile, input) + if err != nil { + return err + } + volumeTarFile.Close() + + includeFiles = append(includeFiles, volumeTarFilePath) + } + } + + input, err := archive.TarWithOptions(c.bundlePath(), &archive.TarOptions{ + Compression: options.Compression, + IncludeSourceDir: true, + IncludeFiles: includeFiles, + }) + + if err != nil { + return fmt.Errorf("error reading checkpoint directory %q: %w", c.ID(), err) + } + + outFile, err := os.Create(options.TargetFile) + if err != nil { + return fmt.Errorf("error creating checkpoint export file %q: %w", options.TargetFile, err) + } + defer outFile.Close() + + if err := os.Chmod(options.TargetFile, 0600); err != nil { + return err + } + + 
_, err = io.Copy(outFile, input) + if err != nil { + return err + } + + for _, file := range addToTarFiles { + os.Remove(filepath.Join(c.bundlePath(), file)) + } + + if !options.IgnoreVolumes { + os.RemoveAll(expVolDir) + } + + return nil +} + +func (c *Container) checkpointRestoreSupported(version int) error { + if !criu.CheckForCriu(version) { + return fmt.Errorf("checkpoint/restore requires at least CRIU %d", version) + } + if !c.ociRuntime.SupportsCheckpoint() { + return errors.New("configured runtime does not support checkpoint/restore") + } + return nil +} + +func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointOptions) (*define.CRIUCheckpointRestoreStatistics, int64, error) { + if err := c.checkpointRestoreSupported(criu.MinCriuVersion); err != nil { + return nil, 0, err + } + + if c.state.State != define.ContainerStateRunning { + return nil, 0, fmt.Errorf("%q is not running, cannot checkpoint: %w", c.state.State, define.ErrCtrStateInvalid) + } + + if c.AutoRemove() && options.TargetFile == "" { + return nil, 0, errors.New("cannot checkpoint containers that have been started with '--rm' unless '--export' is used") + } + + if err := c.resolveCheckpointImageName(&options); err != nil { + return nil, 0, err + } + + if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "dump.log", c.MountLabel()); err != nil { + return nil, 0, err + } + + // Setting CheckpointLog early in case there is a failure. 
+ c.state.CheckpointLog = path.Join(c.bundlePath(), "dump.log") + c.state.CheckpointPath = c.CheckpointPath() + + runtimeCheckpointDuration, err := c.ociRuntime.CheckpointContainer(c, options) + if err != nil { + return nil, 0, err + } + + // Keep the content of /dev/shm directory + if c.config.ShmDir != "" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { + shmDirTarFileFullPath := filepath.Join(c.bundlePath(), metadata.DevShmCheckpointTar) + + shmDirTarFile, err := os.Create(shmDirTarFileFullPath) + if err != nil { + return nil, 0, err + } + defer shmDirTarFile.Close() + + input, err := archive.TarWithOptions(c.config.ShmDir, &archive.TarOptions{ + Compression: archive.Uncompressed, + IncludeSourceDir: true, + }) + if err != nil { + return nil, 0, err + } + + if _, err = io.Copy(shmDirTarFile, input); err != nil { + return nil, 0, err + } + } + + // Save network.status. This is needed to restore the container with + // the same IP. Currently limited to one IP address in a container + // with one interface. + // FIXME: will this break something? 
+ if _, err := metadata.WriteJSONFile(c.getNetworkStatus(), c.bundlePath(), metadata.NetworkStatusFile); err != nil { + return nil, 0, err + } + + defer c.newContainerEvent(events.Checkpoint) + + // There is a bug from criu: https://github.com/checkpoint-restore/criu/issues/116 + // We have to change the symbolic link from absolute path to relative path + if options.WithPrevious { + os.Remove(path.Join(c.CheckpointPath(), "parent")) + if err := os.Symlink("../pre-checkpoint", path.Join(c.CheckpointPath(), "parent")); err != nil { + return nil, 0, err + } + } + + if options.TargetFile != "" { + if err := c.exportCheckpoint(options); err != nil { + return nil, 0, err + } + } else { + if err := c.createCheckpointImage(ctx, options); err != nil { + return nil, 0, err + } + } + + logrus.Debugf("Checkpointed container %s", c.ID()) + + if !options.KeepRunning && !options.PreCheckPoint { + c.state.State = define.ContainerStateStopped + c.state.Checkpointed = true + c.state.CheckpointedTime = time.Now() + c.state.Restored = false + c.state.RestoredTime = time.Time{} + + // Clean up Storage and Network + if err := c.cleanup(ctx); err != nil { + return nil, 0, err + } + } + + criuStatistics, err := func() (*define.CRIUCheckpointRestoreStatistics, error) { + if !options.PrintStats { + return nil, nil + } + statsDirectory, err := os.Open(c.bundlePath()) + if err != nil { + return nil, fmt.Errorf("not able to open %q: %w", c.bundlePath(), err) + } + + dumpStatistics, err := stats.CriuGetDumpStats(statsDirectory) + if err != nil { + return nil, fmt.Errorf("displaying checkpointing statistics not possible: %w", err) + } + + return &define.CRIUCheckpointRestoreStatistics{ + FreezingTime: dumpStatistics.GetFreezingTime(), + FrozenTime: dumpStatistics.GetFrozenTime(), + MemdumpTime: dumpStatistics.GetMemdumpTime(), + MemwriteTime: dumpStatistics.GetMemwriteTime(), + PagesScanned: dumpStatistics.GetPagesScanned(), + PagesWritten: dumpStatistics.GetPagesWritten(), + }, nil + }() + if 
err != nil { + return nil, 0, err + } + + if !options.Keep && !options.PreCheckPoint { + cleanup := []string{ + "dump.log", + stats.StatsDump, + metadata.ConfigDumpFile, + metadata.SpecDumpFile, + } + for _, del := range cleanup { + file := filepath.Join(c.bundlePath(), del) + if err := os.Remove(file); err != nil { + logrus.Debugf("Unable to remove file %s", file) + } + } + // The file has been deleted. Do not mention it. + c.state.CheckpointLog = "" + } + + c.state.FinishedTime = time.Now() + return criuStatistics, runtimeCheckpointDuration, c.save() +} + +func (c *Container) generateContainerSpec() error { + // Make sure the newly created config.json exists on disk + + // NewFromSpec() is deprecated according to its comment + // however the recommended replace just causes a nil map panic + //nolint:staticcheck + g := generate.NewFromSpec(c.config.Spec) + + if err := c.saveSpec(g.Config); err != nil { + return fmt.Errorf("saving imported container specification for restore failed: %w", err) + } + + return nil +} + +func (c *Container) importCheckpointImage(ctx context.Context, imageID string) error { + img, _, err := c.Runtime().LibimageRuntime().LookupImage(imageID, nil) + if err != nil { + return err + } + + mountPoint, err := img.Mount(ctx, nil, "") + defer func() { + if err := c.unmount(true); err != nil { + logrus.Errorf("Failed to unmount container: %v", err) + } + }() + if err != nil { + return err + } + + // Import all checkpoint files except ConfigDumpFile and SpecDumpFile. We + // generate new container config files to enable to specifying a new + // container name. 
+ checkpoint := []string{ + "artifacts", + metadata.CheckpointDirectory, + metadata.CheckpointVolumesDirectory, + metadata.DevShmCheckpointTar, + metadata.RootFsDiffTar, + metadata.DeletedFilesFile, + metadata.PodOptionsFile, + metadata.PodDumpFile, + } + + for _, name := range checkpoint { + src := filepath.Join(mountPoint, name) + dst := filepath.Join(c.bundlePath(), name) + if err := archive.NewDefaultArchiver().CopyWithTar(src, dst); err != nil { + logrus.Debugf("Can't import '%s' from checkpoint image", name) + } + } + + return c.generateContainerSpec() +} + +func (c *Container) importCheckpointTar(input string) error { + if err := crutils.CRImportCheckpointWithoutConfig(c.bundlePath(), input); err != nil { + return err + } + + return c.generateContainerSpec() +} + +func (c *Container) importPreCheckpoint(input string) error { + archiveFile, err := os.Open(input) + if err != nil { + return fmt.Errorf("failed to open pre-checkpoint archive for import: %w", err) + } + + defer archiveFile.Close() + + err = archive.Untar(archiveFile, c.bundlePath(), nil) + if err != nil { + return fmt.Errorf("unpacking of pre-checkpoint archive %s failed: %w", input, err) + } + return nil +} + +func (c *Container) restore(ctx context.Context, options ContainerCheckpointOptions) (criuStatistics *define.CRIUCheckpointRestoreStatistics, runtimeRestoreDuration int64, retErr error) { + minCriuVersion := func() int { + if options.Pod == "" { + return criu.MinCriuVersion + } + return criu.PodCriuVersion + }() + if err := c.checkpointRestoreSupported(minCriuVersion); err != nil { + return nil, 0, err + } + + if options.Pod != "" && !crutils.CRRuntimeSupportsPodCheckpointRestore(c.ociRuntime.Path()) { + return nil, 0, fmt.Errorf("runtime %s does not support pod restore", c.ociRuntime.Path()) + } + + if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateExited) { + return nil, 0, fmt.Errorf("container %s is running or paused, cannot restore: %w", c.ID(), 
define.ErrCtrStateInvalid) + } + + if options.ImportPrevious != "" { + if err := c.importPreCheckpoint(options.ImportPrevious); err != nil { + return nil, 0, err + } + } + + if options.TargetFile != "" { + if err := c.importCheckpointTar(options.TargetFile); err != nil { + return nil, 0, err + } + } else if options.CheckpointImageID != "" { + if err := c.importCheckpointImage(ctx, options.CheckpointImageID); err != nil { + return nil, 0, err + } + } + + // Let's try to stat() CRIU's inventory file. If it does not exist, it makes + // no sense to try a restore. This is a minimal check if a checkpoint exist. + if _, err := os.Stat(filepath.Join(c.CheckpointPath(), "inventory.img")); os.IsNotExist(err) { + return nil, 0, fmt.Errorf("a complete checkpoint for this container cannot be found, cannot restore: %w", err) + } + + if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "restore.log", c.MountLabel()); err != nil { + return nil, 0, err + } + + // Setting RestoreLog early in case there is a failure. + c.state.RestoreLog = path.Join(c.bundlePath(), "restore.log") + c.state.CheckpointPath = c.CheckpointPath() + + // Read network configuration from checkpoint + var netStatus map[string]types.StatusBlock + _, err := metadata.ReadJSONFile(&netStatus, c.bundlePath(), metadata.NetworkStatusFile) + if err != nil { + logrus.Infof("Failed to unmarshal network status, cannot restore the same ip/mac: %v", err) + } + // If the restored container should get a new name, the IP address of + // the container will not be restored. This assumes that if a new name is + // specified, the container is restored multiple times. + // TODO: This implicit restoring with or without IP depending on an + // unrelated restore parameter (--name) does not seem like the + // best solution. + if err == nil && options.Name == "" && (!options.IgnoreStaticIP || !options.IgnoreStaticMAC) { + // The file with the network.status does exist. 
Let's restore the + // container with the same networks settings as during checkpointing. + networkOpts, err := c.networks() + if err != nil { + return nil, 0, err + } + + netOpts := make(map[string]types.PerNetworkOptions, len(netStatus)) + for network, perNetOpts := range networkOpts { + // unset mac and ips before we start adding the ones from the status + perNetOpts.StaticMAC = nil + perNetOpts.StaticIPs = nil + for name, netInt := range netStatus[network].Interfaces { + perNetOpts.InterfaceName = name + if !options.IgnoreStaticIP { + perNetOpts.StaticMAC = netInt.MacAddress + } + if !options.IgnoreStaticIP { + for _, netAddress := range netInt.Subnets { + perNetOpts.StaticIPs = append(perNetOpts.StaticIPs, netAddress.IPNet.IP) + } + } + // Normally interfaces have a length of 1, only for some special cni configs we could get more. + // For now just use the first interface to get the ips this should be good enough for most cases. + break + } + netOpts[network] = perNetOpts + } + c.perNetworkOpts = netOpts + } + + defer func() { + if retErr != nil { + if err := c.cleanup(ctx); err != nil { + logrus.Errorf("Cleaning up container %s: %v", c.ID(), err) + } + } + }() + + if err := c.prepare(); err != nil { + return nil, 0, err + } + + // Read config + jsonPath := filepath.Join(c.bundlePath(), "config.json") + logrus.Debugf("generate.NewFromFile at %v", jsonPath) + g, err := generate.NewFromFile(jsonPath) + if err != nil { + logrus.Debugf("generate.NewFromFile failed with %v", err) + return nil, 0, err + } + + // Restoring from an import means that we are doing migration + if options.TargetFile != "" || options.CheckpointImageID != "" { + g.SetRootPath(c.state.Mountpoint) + } + + // We want to have the same network namespace as before. + if err := c.addNetworkNamespace(&g); err != nil { + return nil, 0, err + } + + if options.Pod != "" { + // Running in a Pod means that we have to change all namespace settings to + // the ones from the infrastructure container. 
+ pod, err := c.runtime.LookupPod(options.Pod) + if err != nil { + return nil, 0, fmt.Errorf("pod %q cannot be retrieved: %w", options.Pod, err) + } + + infraContainer, err := pod.InfraContainer() + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieved infra container from pod %q: %w", options.Pod, err) + } + + infraContainer.lock.Lock() + if err := infraContainer.syncContainer(); err != nil { + infraContainer.lock.Unlock() + return nil, 0, fmt.Errorf("error syncing infrastructure container %s status: %w", infraContainer.ID(), err) + } + if infraContainer.state.State != define.ContainerStateRunning { + if err := infraContainer.initAndStart(ctx); err != nil { + infraContainer.lock.Unlock() + return nil, 0, fmt.Errorf("error starting infrastructure container %s status: %w", infraContainer.ID(), err) + } + } + infraContainer.lock.Unlock() + + if c.config.IPCNsCtr != "" { + nsPath, err := infraContainer.namespacePath(IPCNS) + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieve IPC namespace path for Pod %q: %w", options.Pod, err) + } + if err := g.AddOrReplaceLinuxNamespace(string(spec.IPCNamespace), nsPath); err != nil { + return nil, 0, err + } + } + + if c.config.NetNsCtr != "" { + nsPath, err := infraContainer.namespacePath(NetNS) + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieve network namespace path for Pod %q: %w", options.Pod, err) + } + if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), nsPath); err != nil { + return nil, 0, err + } + } + + if c.config.PIDNsCtr != "" { + nsPath, err := infraContainer.namespacePath(PIDNS) + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieve PID namespace path for Pod %q: %w", options.Pod, err) + } + if err := g.AddOrReplaceLinuxNamespace(string(spec.PIDNamespace), nsPath); err != nil { + return nil, 0, err + } + } + + if c.config.UTSNsCtr != "" { + nsPath, err := infraContainer.namespacePath(UTSNS) + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieve UTS 
namespace path for Pod %q: %w", options.Pod, err) + } + if err := g.AddOrReplaceLinuxNamespace(string(spec.UTSNamespace), nsPath); err != nil { + return nil, 0, err + } + } + + if c.config.CgroupNsCtr != "" { + nsPath, err := infraContainer.namespacePath(CgroupNS) + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieve Cgroup namespace path for Pod %q: %w", options.Pod, err) + } + if err := g.AddOrReplaceLinuxNamespace(string(spec.CgroupNamespace), nsPath); err != nil { + return nil, 0, err + } + } + } + + if err := c.makeBindMounts(); err != nil { + return nil, 0, err + } + + if options.TargetFile != "" || options.CheckpointImageID != "" { + for dstPath, srcPath := range c.state.BindMounts { + newMount := spec.Mount{ + Type: "bind", + Source: srcPath, + Destination: dstPath, + Options: []string{"bind", "private"}, + } + if c.IsReadOnly() && dstPath != "/dev/shm" { + newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") + } + if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { + newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") + } + if !MountExists(g.Mounts(), dstPath) { + g.AddMount(newMount) + } + } + } + + // Restore /dev/shm content + if c.config.ShmDir != "" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { + shmDirTarFileFullPath := filepath.Join(c.bundlePath(), metadata.DevShmCheckpointTar) + if _, err := os.Stat(shmDirTarFileFullPath); err != nil { + logrus.Debug("Container checkpoint doesn't contain dev/shm: ", err.Error()) + } else { + shmDirTarFile, err := os.Open(shmDirTarFileFullPath) + if err != nil { + return nil, 0, err + } + defer shmDirTarFile.Close() + + if err := archive.UntarUncompressed(shmDirTarFile, c.config.ShmDir, nil); err != nil { + return nil, 0, err + } + } + } + + // Cleanup for a working restore. 
+ if err := c.removeConmonFiles(); err != nil { + return nil, 0, err + } + + // Save the OCI spec to disk + if err := c.saveSpec(g.Config); err != nil { + return nil, 0, err + } + + // When restoring from an imported archive, allow restoring the content of volumes. + // Volumes are created in setupContainer() + if !options.IgnoreVolumes && (options.TargetFile != "" || options.CheckpointImageID != "") { + for _, v := range c.config.NamedVolumes { + volumeFilePath := filepath.Join(c.bundlePath(), metadata.CheckpointVolumesDirectory, v.Name+".tar") + + volumeFile, err := os.Open(volumeFilePath) + if err != nil { + return nil, 0, fmt.Errorf("failed to open volume file %s: %w", volumeFilePath, err) + } + defer volumeFile.Close() + + volume, err := c.runtime.GetVolume(v.Name) + if err != nil { + return nil, 0, fmt.Errorf("failed to retrieve volume %s: %w", v.Name, err) + } + + mountPoint, err := volume.MountPoint() + if err != nil { + return nil, 0, err + } + if mountPoint == "" { + return nil, 0, fmt.Errorf("unable to import volume %s as it is not mounted: %w", volume.Name(), err) + } + if err := archive.UntarUncompressed(volumeFile, mountPoint, nil); err != nil { + return nil, 0, fmt.Errorf("failed to extract volume %s to %s: %w", volumeFilePath, mountPoint, err) + } + } + } + + // Before actually restarting the container, apply the root file-system changes + if !options.IgnoreRootfs { + if err := crutils.CRApplyRootFsDiffTar(c.bundlePath(), c.state.Mountpoint); err != nil { + return nil, 0, err + } + + if err := crutils.CRRemoveDeletedFiles(c.ID(), c.bundlePath(), c.state.Mountpoint); err != nil { + return nil, 0, err + } + } + + runtimeRestoreDuration, err = c.ociRuntime.CreateContainer(c, &options) + if err != nil { + return nil, 0, err + } + + criuStatistics, err = func() (*define.CRIUCheckpointRestoreStatistics, error) { + if !options.PrintStats { + return nil, nil + } + statsDirectory, err := os.Open(c.bundlePath()) + if err != nil { + return nil, fmt.Errorf("not 
able to open %q: %w", c.bundlePath(), err) + } + + restoreStatistics, err := stats.CriuGetRestoreStats(statsDirectory) + if err != nil { + return nil, fmt.Errorf("displaying restore statistics not possible: %w", err) + } + + return &define.CRIUCheckpointRestoreStatistics{ + PagesCompared: restoreStatistics.GetPagesCompared(), + PagesSkippedCow: restoreStatistics.GetPagesSkippedCow(), + ForkingTime: restoreStatistics.GetForkingTime(), + RestoreTime: restoreStatistics.GetRestoreTime(), + PagesRestored: restoreStatistics.GetPagesRestored(), + }, nil + }() + if err != nil { + return nil, 0, err + } + + logrus.Debugf("Restored container %s", c.ID()) + + c.state.State = define.ContainerStateRunning + c.state.Checkpointed = false + c.state.Restored = true + c.state.CheckpointedTime = time.Time{} + c.state.RestoredTime = time.Now() + + if !options.Keep { + // Delete all checkpoint related files. At this point, in theory, all files + // should exist. Still ignoring errors for now as the container should be + // restored and running. Not erroring out just because some cleanup operation + // failed. 
Starting with the checkpoint directory + err = os.RemoveAll(c.CheckpointPath()) + if err != nil { + logrus.Debugf("Non-fatal: removal of checkpoint directory (%s) failed: %v", c.CheckpointPath(), err) + } + c.state.CheckpointPath = "" + err = os.RemoveAll(c.PreCheckPointPath()) + if err != nil { + logrus.Debugf("Non-fatal: removal of pre-checkpoint directory (%s) failed: %v", c.PreCheckPointPath(), err) + } + err = os.RemoveAll(c.CheckpointVolumesPath()) + if err != nil { + logrus.Debugf("Non-fatal: removal of checkpoint volumes directory (%s) failed: %v", c.CheckpointVolumesPath(), err) + } + cleanup := [...]string{ + "restore.log", + "dump.log", + stats.StatsDump, + stats.StatsRestore, + metadata.DevShmCheckpointTar, + metadata.NetworkStatusFile, + metadata.RootFsDiffTar, + metadata.DeletedFilesFile, + } + for _, del := range cleanup { + file := filepath.Join(c.bundlePath(), del) + err = os.Remove(file) + if err != nil { + logrus.Debugf("Non-fatal: removal of checkpoint file (%s) failed: %v", file, err) + } + } + c.state.CheckpointLog = "" + c.state.RestoreLog = "" + } + + return criuStatistics, runtimeRestoreDuration, c.save() +} + +// Retrieves a container's "root" net namespace container dependency. 
+func (c *Container) getRootNetNsDepCtr() (depCtr *Container, err error) { + containersVisited := map[string]int{c.config.ID: 1} + nextCtr := c.config.NetNsCtr + for nextCtr != "" { + // Make sure we aren't in a loop + if _, visited := containersVisited[nextCtr]; visited { + return nil, errors.New("loop encountered while determining net namespace container") + } + containersVisited[nextCtr] = 1 + + depCtr, err = c.runtime.state.Container(nextCtr) + if err != nil { + return nil, fmt.Errorf("error fetching dependency %s of container %s: %w", c.config.NetNsCtr, c.ID(), err) + } + // This should never happen without an error + if depCtr == nil { + break + } + nextCtr = depCtr.config.NetNsCtr + } + + if depCtr == nil { + return nil, errors.New("unexpected error depCtr is nil without reported error from runtime state") + } + return depCtr, nil +} + +// Ensure standard bind mounts are mounted into all root directories (including chroot directories) +func (c *Container) mountIntoRootDirs(mountName string, mountPath string) error { + c.state.BindMounts[mountName] = mountPath + + for _, chrootDir := range c.config.ChrootDirs { + c.state.BindMounts[filepath.Join(chrootDir, mountName)] = mountPath + } + + return nil +} + +// Make standard bind mounts to include in the container +func (c *Container) makeBindMounts() error { + if err := os.Chown(c.state.RunDir, c.RootUID(), c.RootGID()); err != nil { + return fmt.Errorf("cannot chown run directory: %w", err) + } + + if c.state.BindMounts == nil { + c.state.BindMounts = make(map[string]string) + } + netDisabled, err := c.NetworkDisabled() + if err != nil { + return err + } + + if !netDisabled { + // If /etc/resolv.conf and /etc/hosts exist, delete them so we + // will recreate. Only do this if we aren't sharing them with + // another container. 
+ if c.config.NetNsCtr == "" { + if resolvePath, ok := c.state.BindMounts["/etc/resolv.conf"]; ok { + if err := os.Remove(resolvePath); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("container %s: %w", c.ID(), err) + } + delete(c.state.BindMounts, "/etc/resolv.conf") + } + if hostsPath, ok := c.state.BindMounts["/etc/hosts"]; ok { + if err := os.Remove(hostsPath); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("container %s: %w", c.ID(), err) + } + delete(c.state.BindMounts, "/etc/hosts") + } + } + + if c.config.NetNsCtr != "" && (!c.config.UseImageResolvConf || !c.config.UseImageHosts) { + // We share a net namespace. + // We want /etc/resolv.conf and /etc/hosts from the + // other container. Unless we're not creating both of + // them. + depCtr, err := c.getRootNetNsDepCtr() + if err != nil { + return fmt.Errorf("error fetching network namespace dependency container for container %s: %w", c.ID(), err) + } + + // We need that container's bind mounts + bindMounts, err := depCtr.BindMounts() + if err != nil { + return fmt.Errorf("error fetching bind mounts from dependency %s of container %s: %w", depCtr.ID(), c.ID(), err) + } + + // The other container may not have a resolv.conf or /etc/hosts + // If it doesn't, don't copy them + resolvPath, exists := bindMounts["/etc/resolv.conf"] + if !c.config.UseImageResolvConf && exists { + err := c.mountIntoRootDirs("/etc/resolv.conf", resolvPath) + + if err != nil { + return fmt.Errorf("error assigning mounts to container %s: %w", c.ID(), err) + } + } + + // check if dependency container has an /etc/hosts file. + // It may not have one, so only use it if it does. 
+ hostsPath, exists := bindMounts[config.DefaultHostsFile] + if !c.config.UseImageHosts && exists { + // we cannot use the dependency container lock due ABBA deadlocks in cleanup() + lock, err := lockfile.GetLockfile(hostsPath) + if err != nil { + return fmt.Errorf("failed to lock hosts file: %w", err) + } + lock.Lock() + + // add the newly added container to the hosts file + // we always use 127.0.0.1 as ip since they have the same netns + err = etchosts.Add(hostsPath, getLocalhostHostEntry(c)) + lock.Unlock() + if err != nil { + return fmt.Errorf("error creating hosts file for container %s which depends on container %s: %w", c.ID(), depCtr.ID(), err) + } + + // finally, save it in the new container + err = c.mountIntoRootDirs(config.DefaultHostsFile, hostsPath) + if err != nil { + return fmt.Errorf("error assigning mounts to container %s: %w", c.ID(), err) + } + } + + if !hasCurrentUserMapped(c) { + if err := makeAccessible(resolvPath, c.RootUID(), c.RootGID()); err != nil { + return err + } + if err := makeAccessible(hostsPath, c.RootUID(), c.RootGID()); err != nil { + return err + } + } + } else { + if !c.config.UseImageResolvConf { + if err := c.generateResolvConf(); err != nil { + return fmt.Errorf("error creating resolv.conf for container %s: %w", c.ID(), err) + } + } + + if !c.config.UseImageHosts { + if err := c.createHosts(); err != nil { + return fmt.Errorf("error creating hosts file for container %s: %w", c.ID(), err) + } + } + } + + if c.state.BindMounts["/etc/hosts"] != "" { + if err := c.relabel(c.state.BindMounts["/etc/hosts"], c.config.MountLabel, true); err != nil { + return err + } + } + + if c.state.BindMounts["/etc/resolv.conf"] != "" { + if err := c.relabel(c.state.BindMounts["/etc/resolv.conf"], c.config.MountLabel, true); err != nil { + return err + } + } + } else if !c.config.UseImageHosts && c.state.BindMounts["/etc/hosts"] == "" { + if err := c.createHosts(); err != nil { + return fmt.Errorf("error creating hosts file for container %s: 
%w", c.ID(), err) + } + } + + if c.config.ShmDir != "" { + // If ShmDir has a value SHM is always added when we mount the container + c.state.BindMounts["/dev/shm"] = c.config.ShmDir + } + + if c.config.Passwd == nil || *c.config.Passwd { + newPasswd, newGroup, err := c.generatePasswdAndGroup() + if err != nil { + return fmt.Errorf("error creating temporary passwd file for container %s: %w", c.ID(), err) + } + if newPasswd != "" { + // Make /etc/passwd + // If it already exists, delete so we can recreate + delete(c.state.BindMounts, "/etc/passwd") + c.state.BindMounts["/etc/passwd"] = newPasswd + } + if newGroup != "" { + // Make /etc/group + // If it already exists, delete so we can recreate + delete(c.state.BindMounts, "/etc/group") + c.state.BindMounts["/etc/group"] = newGroup + } + } + + // Make /etc/hostname + // This should never change, so no need to recreate if it exists + if _, ok := c.state.BindMounts["/etc/hostname"]; !ok { + hostnamePath, err := c.writeStringToRundir("hostname", c.Hostname()) + if err != nil { + return fmt.Errorf("error creating hostname file for container %s: %w", c.ID(), err) + } + c.state.BindMounts["/etc/hostname"] = hostnamePath + } + + // Make /etc/localtime + ctrTimezone := c.Timezone() + if ctrTimezone != "" { + // validate the format of the timezone specified if it's not "local" + if ctrTimezone != "local" { + _, err = time.LoadLocation(ctrTimezone) + if err != nil { + return fmt.Errorf("error finding timezone for container %s: %w", c.ID(), err) + } + } + if _, ok := c.state.BindMounts["/etc/localtime"]; !ok { + var zonePath string + if ctrTimezone == "local" { + zonePath, err = filepath.EvalSymlinks("/etc/localtime") + if err != nil { + return fmt.Errorf("error finding local timezone for container %s: %w", c.ID(), err) + } + } else { + zone := filepath.Join("/usr/share/zoneinfo", ctrTimezone) + zonePath, err = filepath.EvalSymlinks(zone) + if err != nil { + return fmt.Errorf("error setting timezone for container %s: %w", 
c.ID(), err) + } + } + localtimePath, err := c.copyTimezoneFile(zonePath) + if err != nil { + return fmt.Errorf("error setting timezone for container %s: %w", c.ID(), err) + } + c.state.BindMounts["/etc/localtime"] = localtimePath + } + } + + _, hasRunContainerenv := c.state.BindMounts["/run/.containerenv"] + if !hasRunContainerenv { + // check in the spec mounts + for _, m := range c.config.Spec.Mounts { + if m.Destination == "/run/.containerenv" || m.Destination == "/run" { + hasRunContainerenv = true + break + } + } + } + + // Make .containerenv if it does not exist + if !hasRunContainerenv { + containerenv := c.runtime.graphRootMountedFlag(c.config.Spec.Mounts) + isRootless := 0 + if rootless.IsRootless() { + isRootless = 1 + } + imageID, imageName := c.Image() + + if c.Privileged() { + // Populate the .containerenv with container information + containerenv = fmt.Sprintf(`engine="podman-%s" +name=%q +id=%q +image=%q +imageid=%q +rootless=%d +%s`, version.Version.String(), c.Name(), c.ID(), imageName, imageID, isRootless, containerenv) + } + containerenvPath, err := c.writeStringToRundir(".containerenv", containerenv) + if err != nil { + return fmt.Errorf("error creating containerenv file for container %s: %w", c.ID(), err) + } + c.state.BindMounts["/run/.containerenv"] = containerenvPath + } + + // Add Subscription Mounts + subscriptionMounts := subscriptions.MountsWithUIDGID(c.config.MountLabel, c.state.RunDir, c.runtime.config.Containers.DefaultMountsFile, c.state.Mountpoint, c.RootUID(), c.RootGID(), rootless.IsRootless(), false) + for _, mount := range subscriptionMounts { + if _, ok := c.state.BindMounts[mount.Destination]; !ok { + c.state.BindMounts[mount.Destination] = mount.Source + } + } + + // Secrets are mounted by getting the secret data from the secrets manager, + // copying the data into the container's static dir, + // then mounting the copied dir into /run/secrets. 
+ // The secrets mounting must come after subscription mounts, since subscription mounts + // creates the /run/secrets dir in the container where we mount as well. + if len(c.Secrets()) > 0 { + // create /run/secrets if subscriptions did not create + if err := c.createSecretMountDir(); err != nil { + return fmt.Errorf("error creating secrets mount: %w", err) + } + for _, secret := range c.Secrets() { + secretFileName := secret.Name + base := "/run/secrets" + if secret.Target != "" { + secretFileName = secret.Target + // If absolute path for target given remove base. + if filepath.IsAbs(secretFileName) { + base = "" + } + } + src := filepath.Join(c.config.SecretsPath, secret.Name) + dest := filepath.Join(base, secretFileName) + c.state.BindMounts[dest] = src + } + } + + return nil +} + +// generateResolvConf generates a containers resolv.conf +func (c *Container) generateResolvConf() error { + var ( + networkNameServers []string + networkSearchDomains []string + ) + + netStatus := c.getNetworkStatus() + for _, status := range netStatus { + if status.DNSServerIPs != nil { + for _, nsIP := range status.DNSServerIPs { + networkNameServers = append(networkNameServers, nsIP.String()) + } + logrus.Debugf("Adding nameserver(s) from network status of '%q'", status.DNSServerIPs) + } + if status.DNSSearchDomains != nil { + networkSearchDomains = append(networkSearchDomains, status.DNSSearchDomains...) + logrus.Debugf("Adding search domain(s) from network status of '%q'", status.DNSSearchDomains) + } + } + + ipv6, err := c.checkForIPv6(netStatus) + if err != nil { + return err + } + + nameservers := make([]string, 0, len(c.runtime.config.Containers.DNSServers)+len(c.config.DNSServer)) + nameservers = append(nameservers, c.runtime.config.Containers.DNSServers...) 
+ for _, ip := range c.config.DNSServer { + nameservers = append(nameservers, ip.String()) + } + // If the user provided dns, it trumps all; then dns masq; then resolv.conf + var search []string + keepHostServers := false + if len(nameservers) == 0 { + keepHostServers = true + // first add the nameservers from the networks status + nameservers = networkNameServers + // when we add network dns server we also have to add the search domains + search = networkSearchDomains + // slirp4netns has a built in DNS forwarder. + nameservers = c.addSlirp4netnsDNS(nameservers) + } + + if len(c.config.DNSSearch) > 0 || len(c.runtime.config.Containers.DNSSearches) > 0 { + customSearch := make([]string, 0, len(c.config.DNSSearch)+len(c.runtime.config.Containers.DNSSearches)) + customSearch = append(customSearch, c.runtime.config.Containers.DNSSearches...) + customSearch = append(customSearch, c.config.DNSSearch...) + search = customSearch + } + + options := make([]string, 0, len(c.config.DNSOption)+len(c.runtime.config.Containers.DNSOptions)) + options = append(options, c.runtime.config.Containers.DNSOptions...) + options = append(options, c.config.DNSOption...) + + destPath := filepath.Join(c.state.RunDir, "resolv.conf") + + if err := resolvconf.New(&resolvconf.Params{ + IPv6Enabled: ipv6, + KeepHostServers: keepHostServers, + Nameservers: nameservers, + Namespaces: c.config.Spec.Linux.Namespaces, + Options: options, + Path: destPath, + Searches: search, + }); err != nil { + return fmt.Errorf("error building resolv.conf for container %s: %w", c.ID(), err) + } + + return c.bindMountRootFile(destPath, resolvconf.DefaultResolvConf) +} + +// Check if a container uses IPv6. 
+func (c *Container) checkForIPv6(netStatus map[string]types.StatusBlock) (bool, error) { + for _, status := range netStatus { + for _, netInt := range status.Interfaces { + for _, netAddress := range netInt.Subnets { + // Note: only using To16() does not work since it also returns a valid ip for ipv4 + if netAddress.IPNet.IP.To4() == nil && netAddress.IPNet.IP.To16() != nil { + return true, nil + } + } + } + } + + return c.isSlirp4netnsIPv6() +} + +// Add a new nameserver to the container's resolv.conf, ensuring that it is the +// first nameserver present. +// Usable only with running containers. +func (c *Container) addNameserver(ips []string) error { + // Take no action if container is not running. + if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { + return nil + } + + // Do we have a resolv.conf at all? + path, ok := c.state.BindMounts[resolvconf.DefaultResolvConf] + if !ok { + return nil + } + + if err := resolvconf.Add(path, ips); err != nil { + return fmt.Errorf("adding new nameserver to container %s resolv.conf: %w", c.ID(), err) + } + + return nil +} + +// Remove an entry from the existing resolv.conf of the container. +// Usable only with running containers. +func (c *Container) removeNameserver(ips []string) error { + // Take no action if container is not running. + if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { + return nil + } + + // Do we have a resolv.conf at all? 
+ path, ok := c.state.BindMounts[resolvconf.DefaultResolvConf] + if !ok { + return nil + } + + if err := resolvconf.Remove(path, ips); err != nil { + return fmt.Errorf("removing nameservers from container %s resolv.conf: %w", c.ID(), err) + } + + return nil +} + +func getLocalhostHostEntry(c *Container) etchosts.HostEntries { + return etchosts.HostEntries{{IP: "127.0.0.1", Names: []string{c.Hostname(), c.config.Name}}} +} + +// getHostsEntries returns the container ip host entries for the correct netmode +func (c *Container) getHostsEntries() (etchosts.HostEntries, error) { + var entries etchosts.HostEntries + names := []string{c.Hostname(), c.config.Name} + switch { + case c.config.NetMode.IsBridge(): + entries = etchosts.GetNetworkHostEntries(c.state.NetworkStatus, names...) + case c.config.NetMode.IsSlirp4netns(): + ip, err := GetSlirp4netnsIP(c.slirp4netnsSubnet) + if err != nil { + return nil, err + } + entries = etchosts.HostEntries{{IP: ip.String(), Names: names}} + default: + if c.hasNetNone() { + entries = etchosts.HostEntries{{IP: "127.0.0.1", Names: names}} + } + } + return entries, nil +} + +func (c *Container) createHosts() error { + var containerIPsEntries etchosts.HostEntries + var err error + // if we configure the netns after the container create we should not add + // the hosts here since we have no information about the actual ips + // instead we will add them in c.completeNetworkSetup() + if !c.config.PostConfigureNetNS { + containerIPsEntries, err = c.getHostsEntries() + if err != nil { + return fmt.Errorf("failed to get container ip host entries: %w", err) + } + } + baseHostFile, err := etchosts.GetBaseHostFile(c.runtime.config.Containers.BaseHostsFile, c.state.Mountpoint) + if err != nil { + return err + } + + targetFile := filepath.Join(c.state.RunDir, "hosts") + err = etchosts.New(&etchosts.Params{ + BaseFile: baseHostFile, + ExtraHosts: c.config.HostAdd, + ContainerIPs: containerIPsEntries, + HostContainersInternalIP: 
etchosts.GetHostContainersInternalIP(c.runtime.config, c.state.NetworkStatus, c.runtime.network), + TargetFile: targetFile, + }) + if err != nil { + return err + } + + return c.bindMountRootFile(targetFile, config.DefaultHostsFile) +} + +// bindMountRootFile will chown and relabel the source file to make it usable in the container. +// It will also add the path to the container bind mount map. +// source is the path on the host, dest is the path in the container. +func (c *Container) bindMountRootFile(source, dest string) error { + if err := os.Chown(source, c.RootUID(), c.RootGID()); err != nil { + return err + } + if err := label.Relabel(source, c.MountLabel(), false); err != nil { + return err + } + + return c.mountIntoRootDirs(dest, source) +} + +// generateGroupEntry generates an entry or entries into /etc/group as +// required by container configuration. +// Generally speaking, we will make an entry under two circumstances: +// 1. The container is started as a specific user:group, and that group is both +// numeric, and does not already exist in /etc/group. +// 2. It is requested that Libpod add the group that launched Podman to +// /etc/group via AddCurrentUserPasswdEntry (though this does not trigger if +// the group in question already exists in /etc/passwd). +// +// Returns group entry (as a string that can be appended to /etc/group) and any +// error that occurred. +func (c *Container) generateGroupEntry() (string, error) { + groupString := "" + + // Things we *can't* handle: adding the user we added in + // generatePasswdEntry to any *existing* groups. 
+ addedGID := 0 + if c.config.AddCurrentUserPasswdEntry { + entry, gid, err := c.generateCurrentUserGroupEntry() + if err != nil { + return "", err + } + groupString += entry + addedGID = gid + } + if c.config.User != "" { + entry, err := c.generateUserGroupEntry(addedGID) + if err != nil { + return "", err + } + groupString += entry + } + + return groupString, nil +} + +// Make an entry in /etc/group for the group of the user running podman iff we +// are rootless. +func (c *Container) generateCurrentUserGroupEntry() (string, int, error) { + gid := rootless.GetRootlessGID() + if gid == 0 { + return "", 0, nil + } + + g, err := user.LookupGroupId(strconv.Itoa(gid)) + if err != nil { + return "", 0, fmt.Errorf("failed to get current group: %w", err) + } + + // Look up group name to see if it exists in the image. + _, err = lookup.GetGroup(c.state.Mountpoint, g.Name) + if err != runcuser.ErrNoGroupEntries { + return "", 0, err + } + + // Look up GID to see if it exists in the image. + _, err = lookup.GetGroup(c.state.Mountpoint, g.Gid) + if err != runcuser.ErrNoGroupEntries { + return "", 0, err + } + + // We need to get the username of the rootless user so we can add it to + // the group. + username := "" + uid := rootless.GetRootlessUID() + if uid != 0 { + u, err := user.LookupId(strconv.Itoa(uid)) + if err != nil { + return "", 0, fmt.Errorf("failed to get current user to make group entry: %w", err) + } + username = u.Username + } + + // Make the entry. + return fmt.Sprintf("%s:x:%s:%s\n", g.Name, g.Gid, username), gid, nil +} + +// Make an entry in /etc/group for the group the container was specified to run +// as. 
+func (c *Container) generateUserGroupEntry(addedGID int) (string, error) { + if c.config.User == "" { + return "", nil + } + + splitUser := strings.SplitN(c.config.User, ":", 2) + group := splitUser[0] + if len(splitUser) > 1 { + group = splitUser[1] + } + + gid, err := strconv.ParseUint(group, 10, 32) + if err != nil { + return "", nil //nolint: nilerr + } + + if addedGID != 0 && addedGID == int(gid) { + return "", nil + } + + // Check if the group already exists + _, err = lookup.GetGroup(c.state.Mountpoint, group) + if err != runcuser.ErrNoGroupEntries { + return "", err + } + + return fmt.Sprintf("%d:x:%d:%s\n", gid, gid, splitUser[0]), nil +} + +// generatePasswdEntry generates an entry or entries into /etc/passwd as +// required by container configuration. +// Generally speaking, we will make an entry under two circumstances: +// 1. The container is started as a specific user who is not in /etc/passwd. +// This only triggers if the user is given as a *numeric* ID. +// 2. It is requested that Libpod add the user that launched Podman to +// /etc/passwd via AddCurrentUserPasswdEntry (though this does not trigger if +// the user in question already exists in /etc/passwd) or the UID to be added +// is 0). +// 3. The user specified additional host user accounts to add the the /etc/passwd file +// +// Returns password entry (as a string that can be appended to /etc/passwd) and +// any error that occurred. 
+func (c *Container) generatePasswdEntry() (string, error) { + passwdString := "" + + addedUID := 0 + for _, userid := range c.config.HostUsers { + // Look up User on host + u, err := util.LookupUser(userid) + if err != nil { + return "", err + } + entry, err := c.userPasswdEntry(u) + if err != nil { + return "", err + } + passwdString += entry + } + if c.config.AddCurrentUserPasswdEntry { + entry, uid, _, err := c.generateCurrentUserPasswdEntry() + if err != nil { + return "", err + } + passwdString += entry + addedUID = uid + } + if c.config.User != "" { + entry, err := c.generateUserPasswdEntry(addedUID) + if err != nil { + return "", err + } + passwdString += entry + } + + return passwdString, nil +} + +// generateCurrentUserPasswdEntry generates an /etc/passwd entry for the user +// running the container engine. +// Returns a passwd entry for the user, and the UID and GID of the added entry. +func (c *Container) generateCurrentUserPasswdEntry() (string, int, int, error) { + uid := rootless.GetRootlessUID() + if uid == 0 { + return "", 0, 0, nil + } + + u, err := user.LookupId(strconv.Itoa(uid)) + if err != nil { + return "", 0, 0, fmt.Errorf("failed to get current user: %w", err) + } + pwd, err := c.userPasswdEntry(u) + if err != nil { + return "", 0, 0, err + } + + return pwd, uid, rootless.GetRootlessGID(), nil +} + +func (c *Container) userPasswdEntry(u *user.User) (string, error) { + // Look up the user to see if it exists in the container image. + _, err := lookup.GetUser(c.state.Mountpoint, u.Username) + if err != runcuser.ErrNoPasswdEntries { + return "", err + } + + // Look up the UID to see if it exists in the container image. + _, err = lookup.GetUser(c.state.Mountpoint, u.Uid) + if err != runcuser.ErrNoPasswdEntries { + return "", err + } + + // If the user's actual home directory exists, or was mounted in - use + // that. 
+ homeDir := c.WorkingDir() + hDir := u.HomeDir + for hDir != "/" { + if MountExists(c.config.Spec.Mounts, hDir) { + homeDir = u.HomeDir + break + } + hDir = filepath.Dir(hDir) + } + if homeDir != u.HomeDir { + for _, hDir := range c.UserVolumes() { + if hDir == u.HomeDir { + homeDir = u.HomeDir + break + } + } + } + // Set HOME environment if not already set + hasHomeSet := false + for _, s := range c.config.Spec.Process.Env { + if strings.HasPrefix(s, "HOME=") { + hasHomeSet = true + break + } + } + if !hasHomeSet { + c.config.Spec.Process.Env = append(c.config.Spec.Process.Env, fmt.Sprintf("HOME=%s", homeDir)) + } + if c.config.PasswdEntry != "" { + return c.passwdEntry(u.Username, u.Uid, u.Gid, u.Name, homeDir), nil + } + + return fmt.Sprintf("%s:*:%s:%s:%s:%s:/bin/sh\n", u.Username, u.Uid, u.Gid, u.Name, homeDir), nil +} + +// generateUserPasswdEntry generates an /etc/passwd entry for the container user +// to run in the container. +// The UID and GID of the added entry will also be returned. +// Accepts one argument, that being any UID that has already been added to the +// passwd file by other functions; if it matches the UID we were given, we don't +// need to do anything. 
+func (c *Container) generateUserPasswdEntry(addedUID int) (string, error) { + var ( + groupspec string + gid int + ) + if c.config.User == "" { + return "", nil + } + splitSpec := strings.SplitN(c.config.User, ":", 2) + userspec := splitSpec[0] + if len(splitSpec) > 1 { + groupspec = splitSpec[1] + } + // If a non numeric User, then don't generate passwd + uid, err := strconv.ParseUint(userspec, 10, 32) + if err != nil { + return "", nil //nolint: nilerr + } + + if addedUID != 0 && int(uid) == addedUID { + return "", nil + } + + // Look up the user to see if it exists in the container image + _, err = lookup.GetUser(c.state.Mountpoint, userspec) + if err != runcuser.ErrNoPasswdEntries { + return "", err + } + + if groupspec != "" { + ugid, err := strconv.ParseUint(groupspec, 10, 32) + if err == nil { + gid = int(ugid) + } else { + group, err := lookup.GetGroup(c.state.Mountpoint, groupspec) + if err != nil { + return "", fmt.Errorf("unable to get gid %s from group file: %w", groupspec, err) + } + gid = group.Gid + } + } + + if c.config.PasswdEntry != "" { + entry := c.passwdEntry(fmt.Sprintf("%d", uid), fmt.Sprintf("%d", uid), fmt.Sprintf("%d", gid), "container user", c.WorkingDir()) + return entry, nil + } + + return fmt.Sprintf("%d:*:%d:%d:container user:%s:/bin/sh\n", uid, uid, gid, c.WorkingDir()), nil +} + +func (c *Container) passwdEntry(username string, uid, gid, name, homeDir string) string { + s := c.config.PasswdEntry + s = strings.ReplaceAll(s, "$USERNAME", username) + s = strings.ReplaceAll(s, "$UID", uid) + s = strings.ReplaceAll(s, "$GID", gid) + s = strings.ReplaceAll(s, "$NAME", name) + s = strings.ReplaceAll(s, "$HOME", homeDir) + return s + "\n" +} + +// generatePasswdAndGroup generates container-specific passwd and group files +// iff g.config.User is a number or we are configured to make a passwd entry for +// the current user or the user specified HostsUsers +// Returns path to file to mount at /etc/passwd, path to file to mount at +// 
/etc/group, and any error that occurred. If no passwd/group file were +// required, the empty string will be returned for those path (this may occur +// even if no error happened). +// This may modify the mounted container's /etc/passwd and /etc/group instead of +// making copies to bind-mount in, so we don't break useradd (it wants to make a +// copy of /etc/passwd and rename the copy to /etc/passwd, which is impossible +// with a bind mount). This is done in cases where the container is *not* +// read-only. In this case, the function will return nothing ("", "", nil). +func (c *Container) generatePasswdAndGroup() (string, string, error) { + if !c.config.AddCurrentUserPasswdEntry && c.config.User == "" && + len(c.config.HostUsers) == 0 { + return "", "", nil + } + + needPasswd := true + needGroup := true + + // First, check if there's a mount at /etc/passwd or group, we don't + // want to interfere with user mounts. + if MountExists(c.config.Spec.Mounts, "/etc/passwd") { + needPasswd = false + } + if MountExists(c.config.Spec.Mounts, "/etc/group") { + needGroup = false + } + + // Next, check if we already made the files. If we didn't, don't need to + // do anything more. + if needPasswd { + passwdPath := filepath.Join(c.config.StaticDir, "passwd") + if _, err := os.Stat(passwdPath); err == nil { + needPasswd = false + } + } + if needGroup { + groupPath := filepath.Join(c.config.StaticDir, "group") + if _, err := os.Stat(groupPath); err == nil { + needGroup = false + } + } + + // If we don't need a /etc/passwd or /etc/group at this point we can + // just return. 
+ if !needPasswd && !needGroup { + return "", "", nil + } + + passwdPath := "" + groupPath := "" + + ro := c.IsReadOnly() + + if needPasswd { + passwdEntry, err := c.generatePasswdEntry() + if err != nil { + return "", "", err + } + + needsWrite := passwdEntry != "" + switch { + case ro && needsWrite: + logrus.Debugf("Making /etc/passwd for container %s", c.ID()) + originPasswdFile, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/passwd") + if err != nil { + return "", "", fmt.Errorf("error creating path to container %s /etc/passwd: %w", c.ID(), err) + } + orig, err := ioutil.ReadFile(originPasswdFile) + if err != nil && !os.IsNotExist(err) { + return "", "", err + } + passwdFile, err := c.writeStringToStaticDir("passwd", string(orig)+passwdEntry) + if err != nil { + return "", "", fmt.Errorf("failed to create temporary passwd file: %w", err) + } + if err := os.Chmod(passwdFile, 0644); err != nil { + return "", "", err + } + passwdPath = passwdFile + case !ro && needsWrite: + logrus.Debugf("Modifying container %s /etc/passwd", c.ID()) + containerPasswd, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/passwd") + if err != nil { + return "", "", fmt.Errorf("error looking up location of container %s /etc/passwd: %w", c.ID(), err) + } + + f, err := os.OpenFile(containerPasswd, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) + if err != nil { + return "", "", fmt.Errorf("container %s: %w", c.ID(), err) + } + defer f.Close() + + if _, err := f.WriteString(passwdEntry); err != nil { + return "", "", fmt.Errorf("unable to append to container %s /etc/passwd: %w", c.ID(), err) + } + default: + logrus.Debugf("Not modifying container %s /etc/passwd", c.ID()) + } + } + if needGroup { + groupEntry, err := c.generateGroupEntry() + if err != nil { + return "", "", err + } + + needsWrite := groupEntry != "" + switch { + case ro && needsWrite: + logrus.Debugf("Making /etc/group for container %s", c.ID()) + originGroupFile, err := securejoin.SecureJoin(c.state.Mountpoint, 
"/etc/group") + if err != nil { + return "", "", fmt.Errorf("error creating path to container %s /etc/group: %w", c.ID(), err) + } + orig, err := ioutil.ReadFile(originGroupFile) + if err != nil && !os.IsNotExist(err) { + return "", "", err + } + groupFile, err := c.writeStringToStaticDir("group", string(orig)+groupEntry) + if err != nil { + return "", "", fmt.Errorf("failed to create temporary group file: %w", err) + } + if err := os.Chmod(groupFile, 0644); err != nil { + return "", "", err + } + groupPath = groupFile + case !ro && needsWrite: + logrus.Debugf("Modifying container %s /etc/group", c.ID()) + containerGroup, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/group") + if err != nil { + return "", "", fmt.Errorf("error looking up location of container %s /etc/group: %w", c.ID(), err) + } + + f, err := os.OpenFile(containerGroup, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) + if err != nil { + return "", "", fmt.Errorf("container %s: %w", c.ID(), err) + } + defer f.Close() + + if _, err := f.WriteString(groupEntry); err != nil { + return "", "", fmt.Errorf("unable to append to container %s /etc/group: %w", c.ID(), err) + } + default: + logrus.Debugf("Not modifying container %s /etc/group", c.ID()) + } + } + + return passwdPath, groupPath, nil +} + +func (c *Container) copyTimezoneFile(zonePath string) (string, error) { + localtimeCopy := filepath.Join(c.state.RunDir, "localtime") + file, err := os.Stat(zonePath) + if err != nil { + return "", err + } + if file.IsDir() { + return "", errors.New("invalid timezone: is a directory") + } + src, err := os.Open(zonePath) + if err != nil { + return "", err + } + defer src.Close() + dest, err := os.Create(localtimeCopy) + if err != nil { + return "", err + } + defer dest.Close() + _, err = io.Copy(dest, src) + if err != nil { + return "", err + } + if err := c.relabel(localtimeCopy, c.config.MountLabel, false); err != nil { + return "", err + } + if err := dest.Chown(c.RootUID(), c.RootGID()); err != nil { + 
return "", err + } + return localtimeCopy, err +} + +func (c *Container) cleanupOverlayMounts() error { + return overlay.CleanupContent(c.config.StaticDir) +} + +// Creates and mounts an empty dir to mount secrets into, if it does not already exist +func (c *Container) createSecretMountDir() error { + src := filepath.Join(c.state.RunDir, "/run/secrets") + _, err := os.Stat(src) + if os.IsNotExist(err) { + oldUmask := umask.Set(0) + defer umask.Set(oldUmask) + + if err := os.MkdirAll(src, 0755); err != nil { + return err + } + if err := label.Relabel(src, c.config.MountLabel, false); err != nil { + return err + } + if err := os.Chown(src, c.RootUID(), c.RootGID()); err != nil { + return err + } + c.state.BindMounts["/run/secrets"] = src + return nil + } + + return err +} + +// Fix ownership and permissions of the specified volume if necessary. +func (c *Container) fixVolumePermissions(v *ContainerNamedVolume) error { + vol, err := c.runtime.state.Volume(v.Name) + if err != nil { + return fmt.Errorf("error retrieving named volume %s for container %s: %w", v.Name, c.ID(), err) + } + + vol.lock.Lock() + defer vol.lock.Unlock() + + // The volume may need a copy-up. Check the state. + if err := vol.update(); err != nil { + return err + } + + // Volumes owned by a volume driver are not chowned - we don't want to + // mess with a mount not managed by us. 
+ if vol.state.NeedsChown && !vol.UsesVolumeDriver() { + vol.state.NeedsChown = false + + uid := int(c.config.Spec.Process.User.UID) + gid := int(c.config.Spec.Process.User.GID) + + if c.config.IDMappings.UIDMap != nil { + p := idtools.IDPair{ + UID: uid, + GID: gid, + } + mappings := idtools.NewIDMappingsFromMaps(c.config.IDMappings.UIDMap, c.config.IDMappings.GIDMap) + newPair, err := mappings.ToHost(p) + if err != nil { + return fmt.Errorf("error mapping user %d:%d: %w", uid, gid, err) + } + uid = newPair.UID + gid = newPair.GID + } + + vol.state.UIDChowned = uid + vol.state.GIDChowned = gid + + if err := vol.save(); err != nil { + return err + } + + mountPoint, err := vol.MountPoint() + if err != nil { + return err + } + + if err := os.Lchown(mountPoint, uid, gid); err != nil { + return err + } + + // Make sure the new volume matches the permissions of the target directory. + // https://github.com/containers/podman/issues/10188 + st, err := os.Lstat(filepath.Join(c.state.Mountpoint, v.Dest)) + if err == nil { + if stat, ok := st.Sys().(*syscall.Stat_t); ok { + if err := os.Lchown(mountPoint, int(stat.Uid), int(stat.Gid)); err != nil { + return err + } + } + if err := os.Chmod(mountPoint, st.Mode()); err != nil { + return err + } + if err := setVolumeAtime(mountPoint, st); err != nil { + return err + } + } else if !os.IsNotExist(err) { + return err + } + } + return nil +} + +func (c *Container) relabel(src, mountLabel string, recurse bool) error { + if !selinux.GetEnabled() || mountLabel == "" { + return nil + } + // only relabel on initial creation of container + if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateUnknown) { + label, err := label.FileLabel(src) + if err != nil { + return err + } + // If labels are different, might be on a tmpfs + if label == mountLabel { + return nil + } + } + return label.Relabel(src, mountLabel, recurse) +} + +func (c *Container) ChangeHostPathOwnership(src string, recurse bool, uid, gid int) error { + // 
only chown on initial creation of container + if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateUnknown) { + st, err := os.Stat(src) + if err != nil { + return err + } + + // If labels are different, might be on a tmpfs + if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { + return nil + } + } + return chown.ChangeHostPathOwnership(src, recurse, uid, gid) +} diff --git a/libpod/container_internal_freebsd.go b/libpod/container_internal_freebsd.go new file mode 100644 index 000000000..40c6c5ebf --- /dev/null +++ b/libpod/container_internal_freebsd.go @@ -0,0 +1,285 @@ +//go:build freebsd +// +build freebsd + +package libpod + +import ( + "errors" + "fmt" + "os" + "strings" + "sync" + "syscall" + "time" + + "github.com/containers/common/libnetwork/types" + "github.com/containers/podman/v4/pkg/rootless" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/runtime-tools/generate" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +var ( + bindOptions = []string{} +) + +// Network stubs to decouple container_internal_freebsd.go from +// networking_freebsd.go so they can be reviewed separately. 
+func (r *Runtime) createNetNS(ctr *Container) (netJail string, q map[string]types.StatusBlock, retErr error) { + return "", nil, errors.New("not implemented (*Runtime) createNetNS") +} + +func (r *Runtime) teardownNetNS(ctr *Container) error { + return errors.New("not implemented (*Runtime) teardownNetNS") +} + +func (r *Runtime) reloadContainerNetwork(ctr *Container) (map[string]types.StatusBlock, error) { + return nil, errors.New("not implemented (*Runtime) reloadContainerNetwork") +} + +func (c *Container) mountSHM(shmOptions string) error { + return nil +} + +func (c *Container) unmountSHM(path string) error { + return nil +} + +// prepare mounts the container and sets up other required resources like net +// namespaces +func (c *Container) prepare() error { + var ( + wg sync.WaitGroup + jailName string + networkStatus map[string]types.StatusBlock + createNetNSErr, mountStorageErr error + mountPoint string + tmpStateLock sync.Mutex + ) + + wg.Add(2) + + go func() { + defer wg.Done() + // Set up network namespace if not already set up + noNetNS := c.state.NetworkJail == "" + if c.config.CreateNetNS && noNetNS && !c.config.PostConfigureNetNS { + jailName, networkStatus, createNetNSErr = c.runtime.createNetNS(c) + if createNetNSErr != nil { + return + } + + tmpStateLock.Lock() + defer tmpStateLock.Unlock() + + // Assign NetNS attributes to container + c.state.NetworkJail = jailName + c.state.NetworkStatus = networkStatus + } + }() + // Mount storage if not mounted + go func() { + defer wg.Done() + mountPoint, mountStorageErr = c.mountStorage() + + if mountStorageErr != nil { + return + } + + tmpStateLock.Lock() + defer tmpStateLock.Unlock() + + // Finish up mountStorage + c.state.Mounted = true + c.state.Mountpoint = mountPoint + + logrus.Debugf("Created root filesystem for container %s at %s", c.ID(), c.state.Mountpoint) + }() + + wg.Wait() + + var createErr error + if mountStorageErr != nil { + if createErr != nil { + logrus.Errorf("Preparing container %s: %v", 
c.ID(), createErr) + } + createErr = mountStorageErr + } + + if createErr != nil { + return createErr + } + + // Save changes to container state + if err := c.save(); err != nil { + return err + } + + return nil +} + +// cleanupNetwork unmounts and cleans up the container's network +func (c *Container) cleanupNetwork() error { + if c.config.NetNsCtr != "" { + return nil + } + netDisabled, err := c.NetworkDisabled() + if err != nil { + return err + } + if netDisabled { + return nil + } + + // Stop the container's network namespace (if it has one) + if err := c.runtime.teardownNetNS(c); err != nil { + logrus.Errorf("Unable to cleanup network for container %s: %q", c.ID(), err) + } + + if c.valid { + return c.save() + } + + return nil +} + +// reloadNetwork reloads the network for the given container, recreating +// firewall rules. +func (c *Container) reloadNetwork() error { + result, err := c.runtime.reloadContainerNetwork(c) + if err != nil { + return err + } + + c.state.NetworkStatus = result + + return c.save() +} + +// Add an existing container's network jail +func (c *Container) addNetworkContainer(g *generate.Generator, ctr string) error { + nsCtr, err := c.runtime.state.Container(ctr) + c.runtime.state.UpdateContainer(nsCtr) + if err != nil { + return fmt.Errorf("error retrieving dependency %s of container %s from state: %w", ctr, c.ID(), err) + } + g.AddAnnotation("org.freebsd.parentJail", nsCtr.state.NetworkJail) + return nil +} + +func isRootlessCgroupSet(cgroup string) bool { + return false +} + +func (c *Container) expectPodCgroup() (bool, error) { + return false, nil +} + +func (c *Container) getOCICgroupPath() (string, error) { + return "", nil +} + +func openDirectory(path string) (fd int, err error) { + const O_PATH = 0x00400000 + return unix.Open(path, unix.O_RDONLY|O_PATH, 0) +} + +func (c *Container) addNetworkNamespace(g *generate.Generator) error { + if c.config.CreateNetNS { + g.AddAnnotation("org.freebsd.parentJail", c.state.NetworkJail) + } + 
return nil +} + +func (c *Container) addSystemdMounts(g *generate.Generator) error { + return nil +} + +func (c *Container) addSharedNamespaces(g *generate.Generator) error { + if c.config.NetNsCtr != "" { + if err := c.addNetworkContainer(g, c.config.NetNsCtr); err != nil { + return err + } + } + + availableUIDs, availableGIDs, err := rootless.GetAvailableIDMaps() + if err != nil { + if os.IsNotExist(err) { + // The kernel-provided files only exist if user namespaces are supported + logrus.Debugf("User or group ID mappings not available: %s", err) + } else { + return err + } + } else { + g.Config.Linux.UIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.UIDMappings, availableUIDs) + g.Config.Linux.GIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.GIDMappings, availableGIDs) + } + + // Hostname handling: + // If we have a UTS namespace, set Hostname in the OCI spec. + // Set the HOSTNAME environment variable unless explicitly overridden by + // the user (already present in OCI spec). If we don't have a UTS ns, + // set it to the host's hostname instead. 
+ hostname := c.Hostname() + foundUTS := false + + // TODO: make this optional, needs progress on adding FreeBSD section to the spec + foundUTS = true + g.SetHostname(hostname) + + if !foundUTS { + tmpHostname, err := os.Hostname() + if err != nil { + return err + } + hostname = tmpHostname + } + needEnv := true + for _, checkEnv := range g.Config.Process.Env { + if strings.SplitN(checkEnv, "=", 2)[0] == "HOSTNAME" { + needEnv = false + break + } + } + if needEnv { + g.AddProcessEnv("HOSTNAME", hostname) + } + return nil +} + +func (c *Container) addRootPropagation(g *generate.Generator, mounts []spec.Mount) error { + return nil +} + +func (c *Container) setProcessLabel(g *generate.Generator) { +} + +func (c *Container) setMountLabel(g *generate.Generator) { +} + +func (c *Container) setCgroupsPath(g *generate.Generator) error { + return nil +} + +func (c *Container) addSlirp4netnsDNS(nameservers []string) []string { + return nameservers +} + +func (c *Container) isSlirp4netnsIPv6() (bool, error) { + return false, nil +} + +// check for net=none +func (c *Container) hasNetNone() bool { + return c.state.NetworkJail == "" +} + +func setVolumeAtime(mountPoint string, st os.FileInfo) error { + stat := st.Sys().(*syscall.Stat_t) + atime := time.Unix(int64(stat.Atimespec.Sec), int64(stat.Atimespec.Nsec)) //nolint: unconvert + if err := os.Chtimes(mountPoint, atime, st.ModTime()); err != nil { + return err + } + return nil +} diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index 5c5fd471b..9b05a2d61 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -4,64 +4,34 @@ package libpod import ( - "context" "errors" "fmt" - "io" - "io/ioutil" - "math" "os" - "os/user" "path" "path/filepath" - "strconv" "strings" "sync" "syscall" "time" - metadata "github.com/checkpoint-restore/checkpointctl/lib" - "github.com/checkpoint-restore/go-criu/v5/stats" - cdi 
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/containernetworking/plugins/pkg/ns" - "github.com/containers/buildah" - "github.com/containers/buildah/pkg/chrootuser" - "github.com/containers/buildah/pkg/overlay" - butil "github.com/containers/buildah/util" - "github.com/containers/common/libnetwork/etchosts" - "github.com/containers/common/libnetwork/resolvconf" "github.com/containers/common/libnetwork/types" - "github.com/containers/common/pkg/apparmor" "github.com/containers/common/pkg/cgroups" - "github.com/containers/common/pkg/chown" "github.com/containers/common/pkg/config" - "github.com/containers/common/pkg/subscriptions" - "github.com/containers/common/pkg/umask" - cutil "github.com/containers/common/pkg/util" - is "github.com/containers/image/v5/storage" "github.com/containers/podman/v4/libpod/define" - "github.com/containers/podman/v4/libpod/events" - "github.com/containers/podman/v4/pkg/annotations" - "github.com/containers/podman/v4/pkg/checkpoint/crutils" - "github.com/containers/podman/v4/pkg/criu" - "github.com/containers/podman/v4/pkg/lookup" "github.com/containers/podman/v4/pkg/rootless" - "github.com/containers/podman/v4/pkg/util" "github.com/containers/podman/v4/utils" - "github.com/containers/podman/v4/version" - "github.com/containers/storage/pkg/archive" - "github.com/containers/storage/pkg/idtools" - "github.com/containers/storage/pkg/lockfile" - securejoin "github.com/cyphar/filepath-securejoin" - runcuser "github.com/opencontainers/runc/libcontainer/user" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-tools/generate" - "github.com/opencontainers/selinux/go-selinux" "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) +var ( + bindOptions = []string{"bind", "rprivate"} +) + func (c *Container) mountSHM(shmOptions string) error { if err := unix.Mount("shm", c.config.ShmDir, "tmpfs", 
unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, label.FormatMountLabel(shmOptions, c.config.MountLabel)); err != nil { @@ -177,118 +147,6 @@ func (c *Container) prepare() error { return nil } -// isWorkDirSymlink returns true if resolved workdir is symlink or a chain of symlinks, -// and final resolved target is present either on volume, mount or inside of container -// otherwise it returns false. Following function is meant for internal use only and -// can change at any point of time. -func (c *Container) isWorkDirSymlink(resolvedPath string) bool { - // We cannot create workdir since explicit --workdir is - // set in config but workdir could also be a symlink. - // If it's a symlink, check if the resolved target is present in the container. - // If so, that's a valid use case: return nil. - - maxSymLinks := 0 - for { - // Linux only supports a chain of 40 links. - // Reference: https://github.com/torvalds/linux/blob/master/include/linux/namei.h#L13 - if maxSymLinks > 40 { - break - } - resolvedSymlink, err := os.Readlink(resolvedPath) - if err != nil { - // End sym-link resolution loop. - break - } - if resolvedSymlink != "" { - _, resolvedSymlinkWorkdir, err := c.resolvePath(c.state.Mountpoint, resolvedSymlink) - if isPathOnVolume(c, resolvedSymlinkWorkdir) || isPathOnBindMount(c, resolvedSymlinkWorkdir) { - // Resolved symlink exists on external volume or mount - return true - } - if err != nil { - // Could not resolve path so end sym-link resolution loop. - break - } - if resolvedSymlinkWorkdir != "" { - resolvedPath = resolvedSymlinkWorkdir - _, err := os.Stat(resolvedSymlinkWorkdir) - if err == nil { - // Symlink resolved successfully and resolved path exists on container, - // this is a valid use-case so return nil. 
- logrus.Debugf("Workdir is a symlink with target to %q and resolved symlink exists on container", resolvedSymlink) - return true - } - } - } - maxSymLinks++ - } - return false -} - -// resolveWorkDir resolves the container's workdir and, depending on the -// configuration, will create it, or error out if it does not exist. -// Note that the container must be mounted before. -func (c *Container) resolveWorkDir() error { - workdir := c.WorkingDir() - - // If the specified workdir is a subdir of a volume or mount, - // we don't need to do anything. The runtime is taking care of - // that. - if isPathOnVolume(c, workdir) || isPathOnBindMount(c, workdir) { - logrus.Debugf("Workdir %q resolved to a volume or mount", workdir) - return nil - } - - _, resolvedWorkdir, err := c.resolvePath(c.state.Mountpoint, workdir) - if err != nil { - return err - } - logrus.Debugf("Workdir %q resolved to host path %q", workdir, resolvedWorkdir) - - st, err := os.Stat(resolvedWorkdir) - if err == nil { - if !st.IsDir() { - return fmt.Errorf("workdir %q exists on container %s, but is not a directory", workdir, c.ID()) - } - return nil - } - if !c.config.CreateWorkingDir { - // No need to create it (e.g., `--workdir=/foo`), so let's make sure - // the path exists on the container. - if err != nil { - if os.IsNotExist(err) { - // If resolved Workdir path gets marked as a valid symlink, - // return nil cause this is valid use-case. - if c.isWorkDirSymlink(resolvedWorkdir) { - return nil - } - return fmt.Errorf("workdir %q does not exist on container %s", workdir, c.ID()) - } - // This might be a serious error (e.g., permission), so - // we need to return the full error. 
- return fmt.Errorf("error detecting workdir %q on container %s: %w", workdir, c.ID(), err) - } - return nil - } - if err := os.MkdirAll(resolvedWorkdir, 0755); err != nil { - if os.IsExist(err) { - return nil - } - return fmt.Errorf("error creating container %s workdir: %w", c.ID(), err) - } - - // Ensure container entrypoint is created (if required). - uid, gid, _, err := chrootuser.GetUser(c.state.Mountpoint, c.User()) - if err != nil { - return fmt.Errorf("error looking up %s inside of the container %s: %w", c.User(), c.ID(), err) - } - if err := os.Chown(resolvedWorkdir, int(uid), int(gid)); err != nil { - return fmt.Errorf("error chowning container %s workdir to container root: %w", c.ID(), err) - } - - return nil -} - // cleanupNetwork unmounts and cleans up the container's network func (c *Container) cleanupNetwork() error { if c.config.NetNsCtr != "" { @@ -335,670 +193,6 @@ func (c *Container) reloadNetwork() error { return c.save() } -func (c *Container) getUserOverrides() *lookup.Overrides { - var hasPasswdFile, hasGroupFile bool - overrides := lookup.Overrides{} - for _, m := range c.config.Spec.Mounts { - if m.Destination == "/etc/passwd" { - overrides.ContainerEtcPasswdPath = m.Source - hasPasswdFile = true - } - if m.Destination == "/etc/group" { - overrides.ContainerEtcGroupPath = m.Source - hasGroupFile = true - } - if m.Destination == "/etc" { - if !hasPasswdFile { - overrides.ContainerEtcPasswdPath = filepath.Join(m.Source, "passwd") - } - if !hasGroupFile { - overrides.ContainerEtcGroupPath = filepath.Join(m.Source, "group") - } - } - } - if path, ok := c.state.BindMounts["/etc/passwd"]; ok { - overrides.ContainerEtcPasswdPath = path - } - return &overrides -} - -func lookupHostUser(name string) (*runcuser.ExecUser, error) { - var execUser runcuser.ExecUser - // Look up User on host - u, err := util.LookupUser(name) - if err != nil { - return &execUser, err - } - uid, err := strconv.ParseUint(u.Uid, 8, 32) - if err != nil { - return &execUser, 
err - } - - gid, err := strconv.ParseUint(u.Gid, 8, 32) - if err != nil { - return &execUser, err - } - execUser.Uid = int(uid) - execUser.Gid = int(gid) - execUser.Home = u.HomeDir - return &execUser, nil -} - -// Internal only function which returns upper and work dir from -// overlay options. -func getOverlayUpperAndWorkDir(options []string) (string, string, error) { - upperDir := "" - workDir := "" - for _, o := range options { - if strings.HasPrefix(o, "upperdir") { - splitOpt := strings.SplitN(o, "=", 2) - if len(splitOpt) > 1 { - upperDir = splitOpt[1] - if upperDir == "" { - return "", "", errors.New("cannot accept empty value for upperdir") - } - } - } - if strings.HasPrefix(o, "workdir") { - splitOpt := strings.SplitN(o, "=", 2) - if len(splitOpt) > 1 { - workDir = splitOpt[1] - if workDir == "" { - return "", "", errors.New("cannot accept empty value for workdir") - } - } - } - } - if (upperDir != "" && workDir == "") || (upperDir == "" && workDir != "") { - return "", "", errors.New("must specify both upperdir and workdir") - } - return upperDir, workDir, nil -} - -// Generate spec for a container -// Accepts a map of the container's dependencies -func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { - overrides := c.getUserOverrides() - execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, c.config.User, overrides) - if err != nil { - if cutil.StringInSlice(c.config.User, c.config.HostUsers) { - execUser, err = lookupHostUser(c.config.User) - } - if err != nil { - return nil, err - } - } - - // NewFromSpec() is deprecated according to its comment - // however the recommended replace just causes a nil map panic - //nolint:staticcheck - g := generate.NewFromSpec(c.config.Spec) - - // If the flag to mount all devices is set for a privileged container, add - // all the devices from the host's machine into the container - if c.config.MountAllDevices { - if err := util.AddPrivilegedDevices(&g); err != nil { - return nil, err - } 
- } - - // If network namespace was requested, add it now - if c.config.CreateNetNS { - if c.config.PostConfigureNetNS { - if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), ""); err != nil { - return nil, err - } - } else { - if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), c.state.NetNS.Path()); err != nil { - return nil, err - } - } - } - - // Apply AppArmor checks and load the default profile if needed. - if len(c.config.Spec.Process.ApparmorProfile) > 0 { - updatedProfile, err := apparmor.CheckProfileAndLoadDefault(c.config.Spec.Process.ApparmorProfile) - if err != nil { - return nil, err - } - g.SetProcessApparmorProfile(updatedProfile) - } - - if err := c.makeBindMounts(); err != nil { - return nil, err - } - - if err := c.mountNotifySocket(g); err != nil { - return nil, err - } - - // Get host UID and GID based on the container process UID and GID. - hostUID, hostGID, err := butil.GetHostIDs(util.IDtoolsToRuntimeSpec(c.config.IDMappings.UIDMap), util.IDtoolsToRuntimeSpec(c.config.IDMappings.GIDMap), uint32(execUser.Uid), uint32(execUser.Gid)) - if err != nil { - return nil, err - } - - // Add named volumes - for _, namedVol := range c.config.NamedVolumes { - volume, err := c.runtime.GetVolume(namedVol.Name) - if err != nil { - return nil, fmt.Errorf("error retrieving volume %s to add to container %s: %w", namedVol.Name, c.ID(), err) - } - mountPoint, err := volume.MountPoint() - if err != nil { - return nil, err - } - - overlayFlag := false - upperDir := "" - workDir := "" - for _, o := range namedVol.Options { - if o == "O" { - overlayFlag = true - upperDir, workDir, err = getOverlayUpperAndWorkDir(namedVol.Options) - if err != nil { - return nil, err - } - } - } - - if overlayFlag { - var overlayMount spec.Mount - var overlayOpts *overlay.Options - contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) - if err != nil { - return nil, err - } - - overlayOpts = &overlay.Options{RootUID: 
c.RootUID(), - RootGID: c.RootGID(), - UpperDirOptionFragment: upperDir, - WorkDirOptionFragment: workDir, - GraphOpts: c.runtime.store.GraphOptions(), - } - - overlayMount, err = overlay.MountWithOptions(contentDir, mountPoint, namedVol.Dest, overlayOpts) - if err != nil { - return nil, fmt.Errorf("mounting overlay failed %q: %w", mountPoint, err) - } - - for _, o := range namedVol.Options { - if o == "U" { - if err := c.ChangeHostPathOwnership(mountPoint, true, int(hostUID), int(hostGID)); err != nil { - return nil, err - } - - if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { - return nil, err - } - } - } - g.AddMount(overlayMount) - } else { - volMount := spec.Mount{ - Type: "bind", - Source: mountPoint, - Destination: namedVol.Dest, - Options: namedVol.Options, - } - g.AddMount(volMount) - } - } - - // Check if the spec file mounts contain the options z, Z or U. - // If they have z or Z, relabel the source directory and then remove the option. - // If they have U, chown the source directory and them remove the option. - for i := range g.Config.Mounts { - m := &g.Config.Mounts[i] - var options []string - for _, o := range m.Options { - switch o { - case "U": - if m.Type == "tmpfs" { - options = append(options, []string{fmt.Sprintf("uid=%d", execUser.Uid), fmt.Sprintf("gid=%d", execUser.Gid)}...) 
- } else { - // only chown on initial creation of container - if err := c.ChangeHostPathOwnership(m.Source, true, int(hostUID), int(hostGID)); err != nil { - return nil, err - } - } - case "z": - fallthrough - case "Z": - if err := c.relabel(m.Source, c.MountLabel(), label.IsShared(o)); err != nil { - return nil, err - } - - default: - options = append(options, o) - } - } - m.Options = options - } - - g.SetProcessSelinuxLabel(c.ProcessLabel()) - g.SetLinuxMountLabel(c.MountLabel()) - - // Add bind mounts to container - for dstPath, srcPath := range c.state.BindMounts { - newMount := spec.Mount{ - Type: "bind", - Source: srcPath, - Destination: dstPath, - Options: []string{"bind", "rprivate"}, - } - if c.IsReadOnly() && dstPath != "/dev/shm" { - newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") - } - if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { - newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") - } - if !MountExists(g.Mounts(), dstPath) { - g.AddMount(newMount) - } else { - logrus.Infof("User mount overriding libpod mount at %q", dstPath) - } - } - - // Add overlay volumes - for _, overlayVol := range c.config.OverlayVolumes { - upperDir, workDir, err := getOverlayUpperAndWorkDir(overlayVol.Options) - if err != nil { - return nil, err - } - contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) - if err != nil { - return nil, err - } - overlayOpts := &overlay.Options{RootUID: c.RootUID(), - RootGID: c.RootGID(), - UpperDirOptionFragment: upperDir, - WorkDirOptionFragment: workDir, - GraphOpts: c.runtime.store.GraphOptions(), - } - - overlayMount, err := overlay.MountWithOptions(contentDir, overlayVol.Source, overlayVol.Dest, overlayOpts) - if err != nil { - return nil, fmt.Errorf("mounting overlay failed %q: %w", overlayVol.Source, err) - } - - // Check overlay volume options - for _, o := range overlayVol.Options { - if o == "U" { - if err := 
c.ChangeHostPathOwnership(overlayVol.Source, true, int(hostUID), int(hostGID)); err != nil { - return nil, err - } - - if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { - return nil, err - } - } - } - - g.AddMount(overlayMount) - } - - // Add image volumes as overlay mounts - for _, volume := range c.config.ImageVolumes { - // Mount the specified image. - img, _, err := c.runtime.LibimageRuntime().LookupImage(volume.Source, nil) - if err != nil { - return nil, fmt.Errorf("error creating image volume %q:%q: %w", volume.Source, volume.Dest, err) - } - mountPoint, err := img.Mount(ctx, nil, "") - if err != nil { - return nil, fmt.Errorf("error mounting image volume %q:%q: %w", volume.Source, volume.Dest, err) - } - - contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) - if err != nil { - return nil, fmt.Errorf("failed to create TempDir in the %s directory: %w", c.config.StaticDir, err) - } - - var overlayMount spec.Mount - if volume.ReadWrite { - overlayMount, err = overlay.Mount(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) - } else { - overlayMount, err = overlay.MountReadOnly(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) - } - if err != nil { - return nil, fmt.Errorf("creating overlay mount for image %q failed: %w", volume.Source, err) - } - g.AddMount(overlayMount) - } - - hasHomeSet := false - for _, s := range c.config.Spec.Process.Env { - if strings.HasPrefix(s, "HOME=") { - hasHomeSet = true - break - } - } - if !hasHomeSet && execUser.Home != "" { - c.config.Spec.Process.Env = append(c.config.Spec.Process.Env, fmt.Sprintf("HOME=%s", execUser.Home)) - } - - if c.config.User != "" { - // User and Group must go together - g.SetProcessUID(uint32(execUser.Uid)) - g.SetProcessGID(uint32(execUser.Gid)) - } - - if c.config.Umask != "" { - decVal, err := strconv.ParseUint(c.config.Umask, 8, 32) - 
if err != nil { - return nil, fmt.Errorf("invalid Umask Value: %w", err) - } - umask := uint32(decVal) - g.Config.Process.User.Umask = &umask - } - - // Add addition groups if c.config.GroupAdd is not empty - if len(c.config.Groups) > 0 { - gids, err := lookup.GetContainerGroups(c.config.Groups, c.state.Mountpoint, overrides) - if err != nil { - return nil, fmt.Errorf("error looking up supplemental groups for container %s: %w", c.ID(), err) - } - for _, gid := range gids { - g.AddProcessAdditionalGid(gid) - } - } - - if c.Systemd() { - if err := c.setupSystemd(g.Mounts(), g); err != nil { - return nil, fmt.Errorf("error adding systemd-specific mounts: %w", err) - } - } - - // Look up and add groups the user belongs to, if a group wasn't directly specified - if !strings.Contains(c.config.User, ":") { - // the gidMappings that are present inside the container user namespace - var gidMappings []idtools.IDMap - - switch { - case len(c.config.IDMappings.GIDMap) > 0: - gidMappings = c.config.IDMappings.GIDMap - case rootless.IsRootless(): - // Check whether the current user namespace has enough gids available. 
- availableGids, err := rootless.GetAvailableGids() - if err != nil { - return nil, fmt.Errorf("cannot read number of available GIDs: %w", err) - } - gidMappings = []idtools.IDMap{{ - ContainerID: 0, - HostID: 0, - Size: int(availableGids), - }} - default: - gidMappings = []idtools.IDMap{{ - ContainerID: 0, - HostID: 0, - Size: math.MaxInt32, - }} - } - for _, gid := range execUser.Sgids { - isGIDAvailable := false - for _, m := range gidMappings { - if gid >= m.ContainerID && gid < m.ContainerID+m.Size { - isGIDAvailable = true - break - } - } - if isGIDAvailable { - g.AddProcessAdditionalGid(uint32(gid)) - } else { - logrus.Warnf("Additional gid=%d is not present in the user namespace, skip setting it", gid) - } - } - } - - // Add shared namespaces from other containers - if c.config.IPCNsCtr != "" { - if err := c.addNamespaceContainer(&g, IPCNS, c.config.IPCNsCtr, spec.IPCNamespace); err != nil { - return nil, err - } - } - if c.config.MountNsCtr != "" { - if err := c.addNamespaceContainer(&g, MountNS, c.config.MountNsCtr, spec.MountNamespace); err != nil { - return nil, err - } - } - if c.config.NetNsCtr != "" { - if err := c.addNamespaceContainer(&g, NetNS, c.config.NetNsCtr, spec.NetworkNamespace); err != nil { - return nil, err - } - } - if c.config.PIDNsCtr != "" { - if err := c.addNamespaceContainer(&g, PIDNS, c.config.PIDNsCtr, spec.PIDNamespace); err != nil { - return nil, err - } - } - if c.config.UserNsCtr != "" { - if err := c.addNamespaceContainer(&g, UserNS, c.config.UserNsCtr, spec.UserNamespace); err != nil { - return nil, err - } - if len(g.Config.Linux.UIDMappings) == 0 { - // runc complains if no mapping is specified, even if we join another ns. 
So provide a dummy mapping - g.AddLinuxUIDMapping(uint32(0), uint32(0), uint32(1)) - g.AddLinuxGIDMapping(uint32(0), uint32(0), uint32(1)) - } - } - - availableUIDs, availableGIDs, err := rootless.GetAvailableIDMaps() - if err != nil { - if os.IsNotExist(err) { - // The kernel-provided files only exist if user namespaces are supported - logrus.Debugf("User or group ID mappings not available: %s", err) - } else { - return nil, err - } - } else { - g.Config.Linux.UIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.UIDMappings, availableUIDs) - g.Config.Linux.GIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.GIDMappings, availableGIDs) - } - - // Hostname handling: - // If we have a UTS namespace, set Hostname in the OCI spec. - // Set the HOSTNAME environment variable unless explicitly overridden by - // the user (already present in OCI spec). If we don't have a UTS ns, - // set it to the host's hostname instead. - hostname := c.Hostname() - foundUTS := false - - for _, i := range c.config.Spec.Linux.Namespaces { - if i.Type == spec.UTSNamespace && i.Path == "" { - foundUTS = true - g.SetHostname(hostname) - break - } - } - if !foundUTS { - tmpHostname, err := os.Hostname() - if err != nil { - return nil, err - } - hostname = tmpHostname - } - needEnv := true - for _, checkEnv := range g.Config.Process.Env { - if strings.SplitN(checkEnv, "=", 2)[0] == "HOSTNAME" { - needEnv = false - break - } - } - if needEnv { - g.AddProcessEnv("HOSTNAME", hostname) - } - - if c.config.UTSNsCtr != "" { - if err := c.addNamespaceContainer(&g, UTSNS, c.config.UTSNsCtr, spec.UTSNamespace); err != nil { - return nil, err - } - } - if c.config.CgroupNsCtr != "" { - if err := c.addNamespaceContainer(&g, CgroupNS, c.config.CgroupNsCtr, spec.CgroupNamespace); err != nil { - return nil, err - } - } - - if c.config.UserNsCtr == "" && c.config.IDMappings.AutoUserNs { - if err := g.AddOrReplaceLinuxNamespace(string(spec.UserNamespace), ""); err != nil { - return nil, err - } - 
g.ClearLinuxUIDMappings() - for _, uidmap := range c.config.IDMappings.UIDMap { - g.AddLinuxUIDMapping(uint32(uidmap.HostID), uint32(uidmap.ContainerID), uint32(uidmap.Size)) - } - g.ClearLinuxGIDMappings() - for _, gidmap := range c.config.IDMappings.GIDMap { - g.AddLinuxGIDMapping(uint32(gidmap.HostID), uint32(gidmap.ContainerID), uint32(gidmap.Size)) - } - } - - g.SetRootPath(c.state.Mountpoint) - g.AddAnnotation(annotations.Created, c.config.CreatedTime.Format(time.RFC3339Nano)) - g.AddAnnotation("org.opencontainers.image.stopSignal", fmt.Sprintf("%d", c.config.StopSignal)) - - if _, exists := g.Config.Annotations[annotations.ContainerManager]; !exists { - g.AddAnnotation(annotations.ContainerManager, annotations.ContainerManagerLibpod) - } - - cgroupPath, err := c.getOCICgroupPath() - if err != nil { - return nil, err - } - - g.SetLinuxCgroupsPath(cgroupPath) - - // Warning: CDI may alter g.Config in place. - if len(c.config.CDIDevices) > 0 { - registry := cdi.GetRegistry( - cdi.WithAutoRefresh(false), - ) - if err := registry.Refresh(); err != nil { - logrus.Debugf("The following error was triggered when refreshing the CDI registry: %v", err) - } - _, err := registry.InjectDevices(g.Config, c.config.CDIDevices...) - if err != nil { - return nil, fmt.Errorf("error setting up CDI devices: %w", err) - } - } - - // Mounts need to be sorted so paths will not cover other paths - mounts := sortMounts(g.Mounts()) - g.ClearMounts() - - // Determine property of RootPropagation based on volume properties. If - // a volume is shared, then keep root propagation shared. This should - // work for slave and private volumes too. - // - // For slave volumes, it can be either [r]shared/[r]slave. - // - // For private volumes any root propagation value should work. - rootPropagation := "" - for _, m := range mounts { - // We need to remove all symlinks from tmpfs mounts. - // Runc and other runtimes may choke on them. 
- // Easy solution: use securejoin to do a scoped evaluation of - // the links, then trim off the mount prefix. - if m.Type == "tmpfs" { - finalPath, err := securejoin.SecureJoin(c.state.Mountpoint, m.Destination) - if err != nil { - return nil, fmt.Errorf("error resolving symlinks for mount destination %s: %w", m.Destination, err) - } - trimmedPath := strings.TrimPrefix(finalPath, strings.TrimSuffix(c.state.Mountpoint, "/")) - m.Destination = trimmedPath - } - g.AddMount(m) - for _, opt := range m.Options { - switch opt { - case MountShared, MountRShared: - if rootPropagation != MountShared && rootPropagation != MountRShared { - rootPropagation = MountShared - } - case MountSlave, MountRSlave: - if rootPropagation != MountShared && rootPropagation != MountRShared && rootPropagation != MountSlave && rootPropagation != MountRSlave { - rootPropagation = MountRSlave - } - } - } - } - - if rootPropagation != "" { - logrus.Debugf("Set root propagation to %q", rootPropagation) - if err := g.SetLinuxRootPropagation(rootPropagation); err != nil { - return nil, err - } - } - - // Warning: precreate hooks may alter g.Config in place. - if c.state.ExtensionStageHooks, err = c.setupOCIHooks(ctx, g.Config); err != nil { - return nil, fmt.Errorf("error setting up OCI Hooks: %w", err) - } - if len(c.config.EnvSecrets) > 0 { - manager, err := c.runtime.SecretsManager() - if err != nil { - return nil, err - } - if err != nil { - return nil, err - } - for name, secr := range c.config.EnvSecrets { - _, data, err := manager.LookupSecretData(secr.Name) - if err != nil { - return nil, err - } - g.AddProcessEnv(name, string(data)) - } - } - - // Pass down the LISTEN_* environment (see #10443). - for _, key := range []string{"LISTEN_PID", "LISTEN_FDS", "LISTEN_FDNAMES"} { - if val, ok := os.LookupEnv(key); ok { - // Force the PID to `1` since we cannot rely on (all - // versions of) all runtimes to do it for us. 
- if key == "LISTEN_PID" { - val = "1" - } - g.AddProcessEnv(key, val) - } - } - - return g.Config, nil -} - -// mountNotifySocket mounts the NOTIFY_SOCKET into the container if it's set -// and if the sdnotify mode is set to container. It also sets c.notifySocket -// to avoid redundantly looking up the env variable. -func (c *Container) mountNotifySocket(g generate.Generator) error { - if c.config.SdNotifySocket == "" { - return nil - } - if c.config.SdNotifyMode != define.SdNotifyModeContainer { - return nil - } - - notifyDir := filepath.Join(c.bundlePath(), "notify") - logrus.Debugf("Checking notify %q dir", notifyDir) - if err := os.MkdirAll(notifyDir, 0755); err != nil { - if !os.IsExist(err) { - return fmt.Errorf("unable to create notify %q dir: %w", notifyDir, err) - } - } - if err := label.Relabel(notifyDir, c.MountLabel(), true); err != nil { - return fmt.Errorf("relabel failed %q: %w", notifyDir, err) - } - logrus.Debugf("Add bindmount notify %q dir", notifyDir) - if _, ok := c.state.BindMounts["/run/notify"]; !ok { - c.state.BindMounts["/run/notify"] = notifyDir - } - - // Set the container's notify socket to the proxy socket created by conmon - g.AddProcessEnv("NOTIFY_SOCKET", "/run/notify/notify.sock") - - return nil -} - // systemd expects to have /run, /run/lock and /tmp on tmpfs // It also expects to be able to write to /sys/fs/cgroup/systemd and /var/log/journal func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) error { @@ -1073,9 +267,15 @@ func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) erro g.AddMount(systemdMnt) } else { mountOptions := []string{"bind", "rprivate"} + skipMount := false var statfs unix.Statfs_t if err := unix.Statfs("/sys/fs/cgroup/systemd", &statfs); err != nil { + if errors.Is(err, os.ErrNotExist) { + // If the mount is missing on the host, we cannot bind mount it so + // just skip it. 
+ skipMount = true + } mountOptions = append(mountOptions, "nodev", "noexec", "nosuid") } else { if statfs.Flags&unix.MS_NODEV == unix.MS_NODEV { @@ -1091,15 +291,16 @@ func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) erro mountOptions = append(mountOptions, "ro") } } - - systemdMnt := spec.Mount{ - Destination: "/sys/fs/cgroup/systemd", - Type: "bind", - Source: "/sys/fs/cgroup/systemd", - Options: mountOptions, + if !skipMount { + systemdMnt := spec.Mount{ + Destination: "/sys/fs/cgroup/systemd", + Type: "bind", + Source: "/sys/fs/cgroup/systemd", + Options: mountOptions, + } + g.AddMount(systemdMnt) + g.AddLinuxMaskedPaths("/sys/fs/cgroup/systemd/release_agent") } - g.AddMount(systemdMnt) - g.AddLinuxMaskedPaths("/sys/fs/cgroup/systemd/release_agent") } return nil @@ -1131,1867 +332,6 @@ func (c *Container) addNamespaceContainer(g *generate.Generator, ns LinuxNS, ctr return nil } -func (c *Container) addCheckpointImageMetadata(importBuilder *buildah.Builder) error { - // Get information about host environment - hostInfo, err := c.Runtime().hostInfo() - if err != nil { - return fmt.Errorf("getting host info: %v", err) - } - - criuVersion, err := criu.GetCriuVersion() - if err != nil { - return fmt.Errorf("getting criu version: %v", err) - } - - rootfsImageID, rootfsImageName := c.Image() - - // Add image annotations with information about the container and the host. 
- // This information is useful to check compatibility before restoring the checkpoint - - checkpointImageAnnotations := map[string]string{ - define.CheckpointAnnotationName: c.config.Name, - define.CheckpointAnnotationRawImageName: c.config.RawImageName, - define.CheckpointAnnotationRootfsImageID: rootfsImageID, - define.CheckpointAnnotationRootfsImageName: rootfsImageName, - define.CheckpointAnnotationPodmanVersion: version.Version.String(), - define.CheckpointAnnotationCriuVersion: strconv.Itoa(criuVersion), - define.CheckpointAnnotationRuntimeName: hostInfo.OCIRuntime.Name, - define.CheckpointAnnotationRuntimeVersion: hostInfo.OCIRuntime.Version, - define.CheckpointAnnotationConmonVersion: hostInfo.Conmon.Version, - define.CheckpointAnnotationHostArch: hostInfo.Arch, - define.CheckpointAnnotationHostKernel: hostInfo.Kernel, - define.CheckpointAnnotationCgroupVersion: hostInfo.CgroupsVersion, - define.CheckpointAnnotationDistributionVersion: hostInfo.Distribution.Version, - define.CheckpointAnnotationDistributionName: hostInfo.Distribution.Distribution, - } - - for key, value := range checkpointImageAnnotations { - importBuilder.SetAnnotation(key, value) - } - - return nil -} - -func (c *Container) resolveCheckpointImageName(options *ContainerCheckpointOptions) error { - if options.CreateImage == "" { - return nil - } - - // Resolve image name - resolvedImageName, err := c.runtime.LibimageRuntime().ResolveName(options.CreateImage) - if err != nil { - return err - } - - options.CreateImage = resolvedImageName - return nil -} - -func (c *Container) createCheckpointImage(ctx context.Context, options ContainerCheckpointOptions) error { - if options.CreateImage == "" { - return nil - } - logrus.Debugf("Create checkpoint image %s", options.CreateImage) - - // Create storage reference - imageRef, err := is.Transport.ParseStoreReference(c.runtime.store, options.CreateImage) - if err != nil { - return errors.New("failed to parse image name") - } - - // Build an image 
scratch - builderOptions := buildah.BuilderOptions{ - FromImage: "scratch", - } - importBuilder, err := buildah.NewBuilder(ctx, c.runtime.store, builderOptions) - if err != nil { - return err - } - // Clean up buildah working container - defer func() { - if err := importBuilder.Delete(); err != nil { - logrus.Errorf("Image builder delete failed: %v", err) - } - }() - - if err := c.prepareCheckpointExport(); err != nil { - return err - } - - // Export checkpoint into temporary tar file - tmpDir, err := ioutil.TempDir("", "checkpoint_image_") - if err != nil { - return err - } - defer os.RemoveAll(tmpDir) - - options.TargetFile = path.Join(tmpDir, "checkpoint.tar") - - if err := c.exportCheckpoint(options); err != nil { - return err - } - - // Copy checkpoint from temporary tar file in the image - addAndCopyOptions := buildah.AddAndCopyOptions{} - if err := importBuilder.Add("", true, addAndCopyOptions, options.TargetFile); err != nil { - return err - } - - if err := c.addCheckpointImageMetadata(importBuilder); err != nil { - return err - } - - commitOptions := buildah.CommitOptions{ - Squash: true, - SystemContext: c.runtime.imageContext, - } - - // Create checkpoint image - id, _, _, err := importBuilder.Commit(ctx, imageRef, commitOptions) - if err != nil { - return err - } - logrus.Debugf("Created checkpoint image: %s", id) - return nil -} - -func (c *Container) exportCheckpoint(options ContainerCheckpointOptions) error { - if len(c.Dependencies()) == 1 { - // Check if the dependency is an infra container. If it is we can checkpoint - // the container out of the Pod. 
- if c.config.Pod == "" { - return errors.New("cannot export checkpoints of containers with dependencies") - } - - pod, err := c.runtime.state.Pod(c.config.Pod) - if err != nil { - return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), c.config.Pod, err) - } - infraID, err := pod.InfraContainerID() - if err != nil { - return fmt.Errorf("cannot retrieve infra container ID for pod %s: %w", c.config.Pod, err) - } - if c.Dependencies()[0] != infraID { - return errors.New("cannot export checkpoints of containers with dependencies") - } - } - if len(c.Dependencies()) > 1 { - return errors.New("cannot export checkpoints of containers with dependencies") - } - logrus.Debugf("Exporting checkpoint image of container %q to %q", c.ID(), options.TargetFile) - - includeFiles := []string{ - "artifacts", - metadata.DevShmCheckpointTar, - metadata.ConfigDumpFile, - metadata.SpecDumpFile, - metadata.NetworkStatusFile, - stats.StatsDump, - } - - if c.LogDriver() == define.KubernetesLogging || - c.LogDriver() == define.JSONLogging { - includeFiles = append(includeFiles, "ctr.log") - } - if options.PreCheckPoint { - includeFiles = append(includeFiles, preCheckpointDir) - } else { - includeFiles = append(includeFiles, metadata.CheckpointDirectory) - } - // Get root file-system changes included in the checkpoint archive - var addToTarFiles []string - if !options.IgnoreRootfs { - // To correctly track deleted files, let's go through the output of 'podman diff' - rootFsChanges, err := c.runtime.GetDiff("", c.ID(), define.DiffContainer) - if err != nil { - return fmt.Errorf("error exporting root file-system diff for %q: %w", c.ID(), err) - } - - addToTarFiles, err := crutils.CRCreateRootFsDiffTar(&rootFsChanges, c.state.Mountpoint, c.bundlePath()) - if err != nil { - return err - } - - includeFiles = append(includeFiles, addToTarFiles...) 
- } - - // Folder containing archived volumes that will be included in the export - expVolDir := filepath.Join(c.bundlePath(), metadata.CheckpointVolumesDirectory) - - // Create an archive for each volume associated with the container - if !options.IgnoreVolumes { - if err := os.MkdirAll(expVolDir, 0700); err != nil { - return fmt.Errorf("error creating volumes export directory %q: %w", expVolDir, err) - } - - for _, v := range c.config.NamedVolumes { - volumeTarFilePath := filepath.Join(metadata.CheckpointVolumesDirectory, v.Name+".tar") - volumeTarFileFullPath := filepath.Join(c.bundlePath(), volumeTarFilePath) - - volumeTarFile, err := os.Create(volumeTarFileFullPath) - if err != nil { - return fmt.Errorf("error creating %q: %w", volumeTarFileFullPath, err) - } - - volume, err := c.runtime.GetVolume(v.Name) - if err != nil { - return err - } - - mp, err := volume.MountPoint() - if err != nil { - return err - } - if mp == "" { - return fmt.Errorf("volume %s is not mounted, cannot export: %w", volume.Name(), define.ErrInternal) - } - - input, err := archive.TarWithOptions(mp, &archive.TarOptions{ - Compression: archive.Uncompressed, - IncludeSourceDir: true, - }) - if err != nil { - return fmt.Errorf("error reading volume directory %q: %w", v.Dest, err) - } - - _, err = io.Copy(volumeTarFile, input) - if err != nil { - return err - } - volumeTarFile.Close() - - includeFiles = append(includeFiles, volumeTarFilePath) - } - } - - input, err := archive.TarWithOptions(c.bundlePath(), &archive.TarOptions{ - Compression: options.Compression, - IncludeSourceDir: true, - IncludeFiles: includeFiles, - }) - - if err != nil { - return fmt.Errorf("error reading checkpoint directory %q: %w", c.ID(), err) - } - - outFile, err := os.Create(options.TargetFile) - if err != nil { - return fmt.Errorf("error creating checkpoint export file %q: %w", options.TargetFile, err) - } - defer outFile.Close() - - if err := os.Chmod(options.TargetFile, 0600); err != nil { - return err - } - - 
_, err = io.Copy(outFile, input) - if err != nil { - return err - } - - for _, file := range addToTarFiles { - os.Remove(filepath.Join(c.bundlePath(), file)) - } - - if !options.IgnoreVolumes { - os.RemoveAll(expVolDir) - } - - return nil -} - -func (c *Container) checkpointRestoreSupported(version int) error { - if !criu.CheckForCriu(version) { - return fmt.Errorf("checkpoint/restore requires at least CRIU %d", version) - } - if !c.ociRuntime.SupportsCheckpoint() { - return errors.New("configured runtime does not support checkpoint/restore") - } - return nil -} - -func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointOptions) (*define.CRIUCheckpointRestoreStatistics, int64, error) { - if err := c.checkpointRestoreSupported(criu.MinCriuVersion); err != nil { - return nil, 0, err - } - - if c.state.State != define.ContainerStateRunning { - return nil, 0, fmt.Errorf("%q is not running, cannot checkpoint: %w", c.state.State, define.ErrCtrStateInvalid) - } - - if c.AutoRemove() && options.TargetFile == "" { - return nil, 0, errors.New("cannot checkpoint containers that have been started with '--rm' unless '--export' is used") - } - - if err := c.resolveCheckpointImageName(&options); err != nil { - return nil, 0, err - } - - if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "dump.log", c.MountLabel()); err != nil { - return nil, 0, err - } - - // Setting CheckpointLog early in case there is a failure. 
- c.state.CheckpointLog = path.Join(c.bundlePath(), "dump.log") - c.state.CheckpointPath = c.CheckpointPath() - - runtimeCheckpointDuration, err := c.ociRuntime.CheckpointContainer(c, options) - if err != nil { - return nil, 0, err - } - - // Keep the content of /dev/shm directory - if c.config.ShmDir != "" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { - shmDirTarFileFullPath := filepath.Join(c.bundlePath(), metadata.DevShmCheckpointTar) - - shmDirTarFile, err := os.Create(shmDirTarFileFullPath) - if err != nil { - return nil, 0, err - } - defer shmDirTarFile.Close() - - input, err := archive.TarWithOptions(c.config.ShmDir, &archive.TarOptions{ - Compression: archive.Uncompressed, - IncludeSourceDir: true, - }) - if err != nil { - return nil, 0, err - } - - if _, err = io.Copy(shmDirTarFile, input); err != nil { - return nil, 0, err - } - } - - // Save network.status. This is needed to restore the container with - // the same IP. Currently limited to one IP address in a container - // with one interface. - // FIXME: will this break something? 
- if _, err := metadata.WriteJSONFile(c.getNetworkStatus(), c.bundlePath(), metadata.NetworkStatusFile); err != nil { - return nil, 0, err - } - - defer c.newContainerEvent(events.Checkpoint) - - // There is a bug from criu: https://github.com/checkpoint-restore/criu/issues/116 - // We have to change the symbolic link from absolute path to relative path - if options.WithPrevious { - os.Remove(path.Join(c.CheckpointPath(), "parent")) - if err := os.Symlink("../pre-checkpoint", path.Join(c.CheckpointPath(), "parent")); err != nil { - return nil, 0, err - } - } - - if options.TargetFile != "" { - if err := c.exportCheckpoint(options); err != nil { - return nil, 0, err - } - } else { - if err := c.createCheckpointImage(ctx, options); err != nil { - return nil, 0, err - } - } - - logrus.Debugf("Checkpointed container %s", c.ID()) - - if !options.KeepRunning && !options.PreCheckPoint { - c.state.State = define.ContainerStateStopped - c.state.Checkpointed = true - c.state.CheckpointedTime = time.Now() - c.state.Restored = false - c.state.RestoredTime = time.Time{} - - // Clean up Storage and Network - if err := c.cleanup(ctx); err != nil { - return nil, 0, err - } - } - - criuStatistics, err := func() (*define.CRIUCheckpointRestoreStatistics, error) { - if !options.PrintStats { - return nil, nil - } - statsDirectory, err := os.Open(c.bundlePath()) - if err != nil { - return nil, fmt.Errorf("not able to open %q: %w", c.bundlePath(), err) - } - - dumpStatistics, err := stats.CriuGetDumpStats(statsDirectory) - if err != nil { - return nil, fmt.Errorf("displaying checkpointing statistics not possible: %w", err) - } - - return &define.CRIUCheckpointRestoreStatistics{ - FreezingTime: dumpStatistics.GetFreezingTime(), - FrozenTime: dumpStatistics.GetFrozenTime(), - MemdumpTime: dumpStatistics.GetMemdumpTime(), - MemwriteTime: dumpStatistics.GetMemwriteTime(), - PagesScanned: dumpStatistics.GetPagesScanned(), - PagesWritten: dumpStatistics.GetPagesWritten(), - }, nil - }() - if 
err != nil { - return nil, 0, err - } - - if !options.Keep && !options.PreCheckPoint { - cleanup := []string{ - "dump.log", - stats.StatsDump, - metadata.ConfigDumpFile, - metadata.SpecDumpFile, - } - for _, del := range cleanup { - file := filepath.Join(c.bundlePath(), del) - if err := os.Remove(file); err != nil { - logrus.Debugf("Unable to remove file %s", file) - } - } - // The file has been deleted. Do not mention it. - c.state.CheckpointLog = "" - } - - c.state.FinishedTime = time.Now() - return criuStatistics, runtimeCheckpointDuration, c.save() -} - -func (c *Container) generateContainerSpec() error { - // Make sure the newly created config.json exists on disk - - // NewFromSpec() is deprecated according to its comment - // however the recommended replace just causes a nil map panic - //nolint:staticcheck - g := generate.NewFromSpec(c.config.Spec) - - if err := c.saveSpec(g.Config); err != nil { - return fmt.Errorf("saving imported container specification for restore failed: %w", err) - } - - return nil -} - -func (c *Container) importCheckpointImage(ctx context.Context, imageID string) error { - img, _, err := c.Runtime().LibimageRuntime().LookupImage(imageID, nil) - if err != nil { - return err - } - - mountPoint, err := img.Mount(ctx, nil, "") - defer func() { - if err := c.unmount(true); err != nil { - logrus.Errorf("Failed to unmount container: %v", err) - } - }() - if err != nil { - return err - } - - // Import all checkpoint files except ConfigDumpFile and SpecDumpFile. We - // generate new container config files to enable to specifying a new - // container name. 
- checkpoint := []string{ - "artifacts", - metadata.CheckpointDirectory, - metadata.CheckpointVolumesDirectory, - metadata.DevShmCheckpointTar, - metadata.RootFsDiffTar, - metadata.DeletedFilesFile, - metadata.PodOptionsFile, - metadata.PodDumpFile, - } - - for _, name := range checkpoint { - src := filepath.Join(mountPoint, name) - dst := filepath.Join(c.bundlePath(), name) - if err := archive.NewDefaultArchiver().CopyWithTar(src, dst); err != nil { - logrus.Debugf("Can't import '%s' from checkpoint image", name) - } - } - - return c.generateContainerSpec() -} - -func (c *Container) importCheckpointTar(input string) error { - if err := crutils.CRImportCheckpointWithoutConfig(c.bundlePath(), input); err != nil { - return err - } - - return c.generateContainerSpec() -} - -func (c *Container) importPreCheckpoint(input string) error { - archiveFile, err := os.Open(input) - if err != nil { - return fmt.Errorf("failed to open pre-checkpoint archive for import: %w", err) - } - - defer archiveFile.Close() - - err = archive.Untar(archiveFile, c.bundlePath(), nil) - if err != nil { - return fmt.Errorf("unpacking of pre-checkpoint archive %s failed: %w", input, err) - } - return nil -} - -func (c *Container) restore(ctx context.Context, options ContainerCheckpointOptions) (criuStatistics *define.CRIUCheckpointRestoreStatistics, runtimeRestoreDuration int64, retErr error) { - minCriuVersion := func() int { - if options.Pod == "" { - return criu.MinCriuVersion - } - return criu.PodCriuVersion - }() - if err := c.checkpointRestoreSupported(minCriuVersion); err != nil { - return nil, 0, err - } - - if options.Pod != "" && !crutils.CRRuntimeSupportsPodCheckpointRestore(c.ociRuntime.Path()) { - return nil, 0, fmt.Errorf("runtime %s does not support pod restore", c.ociRuntime.Path()) - } - - if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateExited) { - return nil, 0, fmt.Errorf("container %s is running or paused, cannot restore: %w", c.ID(), 
define.ErrCtrStateInvalid) - } - - if options.ImportPrevious != "" { - if err := c.importPreCheckpoint(options.ImportPrevious); err != nil { - return nil, 0, err - } - } - - if options.TargetFile != "" { - if err := c.importCheckpointTar(options.TargetFile); err != nil { - return nil, 0, err - } - } else if options.CheckpointImageID != "" { - if err := c.importCheckpointImage(ctx, options.CheckpointImageID); err != nil { - return nil, 0, err - } - } - - // Let's try to stat() CRIU's inventory file. If it does not exist, it makes - // no sense to try a restore. This is a minimal check if a checkpoint exist. - if _, err := os.Stat(filepath.Join(c.CheckpointPath(), "inventory.img")); os.IsNotExist(err) { - return nil, 0, fmt.Errorf("a complete checkpoint for this container cannot be found, cannot restore: %w", err) - } - - if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "restore.log", c.MountLabel()); err != nil { - return nil, 0, err - } - - // Setting RestoreLog early in case there is a failure. - c.state.RestoreLog = path.Join(c.bundlePath(), "restore.log") - c.state.CheckpointPath = c.CheckpointPath() - - // Read network configuration from checkpoint - var netStatus map[string]types.StatusBlock - _, err := metadata.ReadJSONFile(&netStatus, c.bundlePath(), metadata.NetworkStatusFile) - if err != nil { - logrus.Infof("Failed to unmarshal network status, cannot restore the same ip/mac: %v", err) - } - // If the restored container should get a new name, the IP address of - // the container will not be restored. This assumes that if a new name is - // specified, the container is restored multiple times. - // TODO: This implicit restoring with or without IP depending on an - // unrelated restore parameter (--name) does not seem like the - // best solution. - if err == nil && options.Name == "" && (!options.IgnoreStaticIP || !options.IgnoreStaticMAC) { - // The file with the network.status does exist. 
Let's restore the - // container with the same networks settings as during checkpointing. - networkOpts, err := c.networks() - if err != nil { - return nil, 0, err - } - - netOpts := make(map[string]types.PerNetworkOptions, len(netStatus)) - for network, perNetOpts := range networkOpts { - // unset mac and ips before we start adding the ones from the status - perNetOpts.StaticMAC = nil - perNetOpts.StaticIPs = nil - for name, netInt := range netStatus[network].Interfaces { - perNetOpts.InterfaceName = name - if !options.IgnoreStaticIP { - perNetOpts.StaticMAC = netInt.MacAddress - } - if !options.IgnoreStaticIP { - for _, netAddress := range netInt.Subnets { - perNetOpts.StaticIPs = append(perNetOpts.StaticIPs, netAddress.IPNet.IP) - } - } - // Normally interfaces have a length of 1, only for some special cni configs we could get more. - // For now just use the first interface to get the ips this should be good enough for most cases. - break - } - netOpts[network] = perNetOpts - } - c.perNetworkOpts = netOpts - } - - defer func() { - if retErr != nil { - if err := c.cleanup(ctx); err != nil { - logrus.Errorf("Cleaning up container %s: %v", c.ID(), err) - } - } - }() - - if err := c.prepare(); err != nil { - return nil, 0, err - } - - // Read config - jsonPath := filepath.Join(c.bundlePath(), "config.json") - logrus.Debugf("generate.NewFromFile at %v", jsonPath) - g, err := generate.NewFromFile(jsonPath) - if err != nil { - logrus.Debugf("generate.NewFromFile failed with %v", err) - return nil, 0, err - } - - // Restoring from an import means that we are doing migration - if options.TargetFile != "" || options.CheckpointImageID != "" { - g.SetRootPath(c.state.Mountpoint) - } - - // We want to have the same network namespace as before. 
- if c.config.CreateNetNS { - netNSPath := "" - if !c.config.PostConfigureNetNS { - netNSPath = c.state.NetNS.Path() - } - - if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), netNSPath); err != nil { - return nil, 0, err - } - } - - if options.Pod != "" { - // Running in a Pod means that we have to change all namespace settings to - // the ones from the infrastructure container. - pod, err := c.runtime.LookupPod(options.Pod) - if err != nil { - return nil, 0, fmt.Errorf("pod %q cannot be retrieved: %w", options.Pod, err) - } - - infraContainer, err := pod.InfraContainer() - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieved infra container from pod %q: %w", options.Pod, err) - } - - infraContainer.lock.Lock() - if err := infraContainer.syncContainer(); err != nil { - infraContainer.lock.Unlock() - return nil, 0, fmt.Errorf("error syncing infrastructure container %s status: %w", infraContainer.ID(), err) - } - if infraContainer.state.State != define.ContainerStateRunning { - if err := infraContainer.initAndStart(ctx); err != nil { - infraContainer.lock.Unlock() - return nil, 0, fmt.Errorf("error starting infrastructure container %s status: %w", infraContainer.ID(), err) - } - } - infraContainer.lock.Unlock() - - if c.config.IPCNsCtr != "" { - nsPath, err := infraContainer.namespacePath(IPCNS) - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieve IPC namespace path for Pod %q: %w", options.Pod, err) - } - if err := g.AddOrReplaceLinuxNamespace(string(spec.IPCNamespace), nsPath); err != nil { - return nil, 0, err - } - } - - if c.config.NetNsCtr != "" { - nsPath, err := infraContainer.namespacePath(NetNS) - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieve network namespace path for Pod %q: %w", options.Pod, err) - } - if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), nsPath); err != nil { - return nil, 0, err - } - } - - if c.config.PIDNsCtr != "" { - nsPath, err := 
infraContainer.namespacePath(PIDNS) - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieve PID namespace path for Pod %q: %w", options.Pod, err) - } - if err := g.AddOrReplaceLinuxNamespace(string(spec.PIDNamespace), nsPath); err != nil { - return nil, 0, err - } - } - - if c.config.UTSNsCtr != "" { - nsPath, err := infraContainer.namespacePath(UTSNS) - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieve UTS namespace path for Pod %q: %w", options.Pod, err) - } - if err := g.AddOrReplaceLinuxNamespace(string(spec.UTSNamespace), nsPath); err != nil { - return nil, 0, err - } - } - - if c.config.CgroupNsCtr != "" { - nsPath, err := infraContainer.namespacePath(CgroupNS) - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieve Cgroup namespace path for Pod %q: %w", options.Pod, err) - } - if err := g.AddOrReplaceLinuxNamespace(string(spec.CgroupNamespace), nsPath); err != nil { - return nil, 0, err - } - } - } - - if err := c.makeBindMounts(); err != nil { - return nil, 0, err - } - - if options.TargetFile != "" || options.CheckpointImageID != "" { - for dstPath, srcPath := range c.state.BindMounts { - newMount := spec.Mount{ - Type: "bind", - Source: srcPath, - Destination: dstPath, - Options: []string{"bind", "private"}, - } - if c.IsReadOnly() && dstPath != "/dev/shm" { - newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") - } - if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { - newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") - } - if !MountExists(g.Mounts(), dstPath) { - g.AddMount(newMount) - } - } - } - - // Restore /dev/shm content - if c.config.ShmDir != "" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { - shmDirTarFileFullPath := filepath.Join(c.bundlePath(), metadata.DevShmCheckpointTar) - if _, err := os.Stat(shmDirTarFileFullPath); err != nil { - logrus.Debug("Container checkpoint doesn't contain dev/shm: ", err.Error()) - } else { - 
shmDirTarFile, err := os.Open(shmDirTarFileFullPath) - if err != nil { - return nil, 0, err - } - defer shmDirTarFile.Close() - - if err := archive.UntarUncompressed(shmDirTarFile, c.config.ShmDir, nil); err != nil { - return nil, 0, err - } - } - } - - // Cleanup for a working restore. - if err := c.removeConmonFiles(); err != nil { - return nil, 0, err - } - - // Save the OCI spec to disk - if err := c.saveSpec(g.Config); err != nil { - return nil, 0, err - } - - // When restoring from an imported archive, allow restoring the content of volumes. - // Volumes are created in setupContainer() - if !options.IgnoreVolumes && (options.TargetFile != "" || options.CheckpointImageID != "") { - for _, v := range c.config.NamedVolumes { - volumeFilePath := filepath.Join(c.bundlePath(), metadata.CheckpointVolumesDirectory, v.Name+".tar") - - volumeFile, err := os.Open(volumeFilePath) - if err != nil { - return nil, 0, fmt.Errorf("failed to open volume file %s: %w", volumeFilePath, err) - } - defer volumeFile.Close() - - volume, err := c.runtime.GetVolume(v.Name) - if err != nil { - return nil, 0, fmt.Errorf("failed to retrieve volume %s: %w", v.Name, err) - } - - mountPoint, err := volume.MountPoint() - if err != nil { - return nil, 0, err - } - if mountPoint == "" { - return nil, 0, fmt.Errorf("unable to import volume %s as it is not mounted: %w", volume.Name(), err) - } - if err := archive.UntarUncompressed(volumeFile, mountPoint, nil); err != nil { - return nil, 0, fmt.Errorf("failed to extract volume %s to %s: %w", volumeFilePath, mountPoint, err) - } - } - } - - // Before actually restarting the container, apply the root file-system changes - if !options.IgnoreRootfs { - if err := crutils.CRApplyRootFsDiffTar(c.bundlePath(), c.state.Mountpoint); err != nil { - return nil, 0, err - } - - if err := crutils.CRRemoveDeletedFiles(c.ID(), c.bundlePath(), c.state.Mountpoint); err != nil { - return nil, 0, err - } - } - - runtimeRestoreDuration, err = 
c.ociRuntime.CreateContainer(c, &options) - if err != nil { - return nil, 0, err - } - - criuStatistics, err = func() (*define.CRIUCheckpointRestoreStatistics, error) { - if !options.PrintStats { - return nil, nil - } - statsDirectory, err := os.Open(c.bundlePath()) - if err != nil { - return nil, fmt.Errorf("not able to open %q: %w", c.bundlePath(), err) - } - - restoreStatistics, err := stats.CriuGetRestoreStats(statsDirectory) - if err != nil { - return nil, fmt.Errorf("displaying restore statistics not possible: %w", err) - } - - return &define.CRIUCheckpointRestoreStatistics{ - PagesCompared: restoreStatistics.GetPagesCompared(), - PagesSkippedCow: restoreStatistics.GetPagesSkippedCow(), - ForkingTime: restoreStatistics.GetForkingTime(), - RestoreTime: restoreStatistics.GetRestoreTime(), - PagesRestored: restoreStatistics.GetPagesRestored(), - }, nil - }() - if err != nil { - return nil, 0, err - } - - logrus.Debugf("Restored container %s", c.ID()) - - c.state.State = define.ContainerStateRunning - c.state.Checkpointed = false - c.state.Restored = true - c.state.CheckpointedTime = time.Time{} - c.state.RestoredTime = time.Now() - - if !options.Keep { - // Delete all checkpoint related files. At this point, in theory, all files - // should exist. Still ignoring errors for now as the container should be - // restored and running. Not erroring out just because some cleanup operation - // failed. 
Starting with the checkpoint directory - err = os.RemoveAll(c.CheckpointPath()) - if err != nil { - logrus.Debugf("Non-fatal: removal of checkpoint directory (%s) failed: %v", c.CheckpointPath(), err) - } - c.state.CheckpointPath = "" - err = os.RemoveAll(c.PreCheckPointPath()) - if err != nil { - logrus.Debugf("Non-fatal: removal of pre-checkpoint directory (%s) failed: %v", c.PreCheckPointPath(), err) - } - err = os.RemoveAll(c.CheckpointVolumesPath()) - if err != nil { - logrus.Debugf("Non-fatal: removal of checkpoint volumes directory (%s) failed: %v", c.CheckpointVolumesPath(), err) - } - cleanup := [...]string{ - "restore.log", - "dump.log", - stats.StatsDump, - stats.StatsRestore, - metadata.DevShmCheckpointTar, - metadata.NetworkStatusFile, - metadata.RootFsDiffTar, - metadata.DeletedFilesFile, - } - for _, del := range cleanup { - file := filepath.Join(c.bundlePath(), del) - err = os.Remove(file) - if err != nil { - logrus.Debugf("Non-fatal: removal of checkpoint file (%s) failed: %v", file, err) - } - } - c.state.CheckpointLog = "" - c.state.RestoreLog = "" - } - - return criuStatistics, runtimeRestoreDuration, c.save() -} - -// Retrieves a container's "root" net namespace container dependency. 
-func (c *Container) getRootNetNsDepCtr() (depCtr *Container, err error) { - containersVisited := map[string]int{c.config.ID: 1} - nextCtr := c.config.NetNsCtr - for nextCtr != "" { - // Make sure we aren't in a loop - if _, visited := containersVisited[nextCtr]; visited { - return nil, errors.New("loop encountered while determining net namespace container") - } - containersVisited[nextCtr] = 1 - - depCtr, err = c.runtime.state.Container(nextCtr) - if err != nil { - return nil, fmt.Errorf("error fetching dependency %s of container %s: %w", c.config.NetNsCtr, c.ID(), err) - } - // This should never happen without an error - if depCtr == nil { - break - } - nextCtr = depCtr.config.NetNsCtr - } - - if depCtr == nil { - return nil, errors.New("unexpected error depCtr is nil without reported error from runtime state") - } - return depCtr, nil -} - -// Ensure standard bind mounts are mounted into all root directories (including chroot directories) -func (c *Container) mountIntoRootDirs(mountName string, mountPath string) error { - c.state.BindMounts[mountName] = mountPath - - for _, chrootDir := range c.config.ChrootDirs { - c.state.BindMounts[filepath.Join(chrootDir, mountName)] = mountPath - } - - return nil -} - -// Make standard bind mounts to include in the container -func (c *Container) makeBindMounts() error { - if err := os.Chown(c.state.RunDir, c.RootUID(), c.RootGID()); err != nil { - return fmt.Errorf("cannot chown run directory: %w", err) - } - - if c.state.BindMounts == nil { - c.state.BindMounts = make(map[string]string) - } - netDisabled, err := c.NetworkDisabled() - if err != nil { - return err - } - - if !netDisabled { - // If /etc/resolv.conf and /etc/hosts exist, delete them so we - // will recreate. Only do this if we aren't sharing them with - // another container. 
- if c.config.NetNsCtr == "" { - if resolvePath, ok := c.state.BindMounts["/etc/resolv.conf"]; ok { - if err := os.Remove(resolvePath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("container %s: %w", c.ID(), err) - } - delete(c.state.BindMounts, "/etc/resolv.conf") - } - if hostsPath, ok := c.state.BindMounts["/etc/hosts"]; ok { - if err := os.Remove(hostsPath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("container %s: %w", c.ID(), err) - } - delete(c.state.BindMounts, "/etc/hosts") - } - } - - if c.config.NetNsCtr != "" && (!c.config.UseImageResolvConf || !c.config.UseImageHosts) { - // We share a net namespace. - // We want /etc/resolv.conf and /etc/hosts from the - // other container. Unless we're not creating both of - // them. - depCtr, err := c.getRootNetNsDepCtr() - if err != nil { - return fmt.Errorf("error fetching network namespace dependency container for container %s: %w", c.ID(), err) - } - - // We need that container's bind mounts - bindMounts, err := depCtr.BindMounts() - if err != nil { - return fmt.Errorf("error fetching bind mounts from dependency %s of container %s: %w", depCtr.ID(), c.ID(), err) - } - - // The other container may not have a resolv.conf or /etc/hosts - // If it doesn't, don't copy them - resolvPath, exists := bindMounts["/etc/resolv.conf"] - if !c.config.UseImageResolvConf && exists { - err := c.mountIntoRootDirs("/etc/resolv.conf", resolvPath) - - if err != nil { - return fmt.Errorf("error assigning mounts to container %s: %w", c.ID(), err) - } - } - - // check if dependency container has an /etc/hosts file. - // It may not have one, so only use it if it does. 
- hostsPath, exists := bindMounts[config.DefaultHostsFile] - if !c.config.UseImageHosts && exists { - // we cannot use the dependency container lock due ABBA deadlocks in cleanup() - lock, err := lockfile.GetLockfile(hostsPath) - if err != nil { - return fmt.Errorf("failed to lock hosts file: %w", err) - } - lock.Lock() - - // add the newly added container to the hosts file - // we always use 127.0.0.1 as ip since they have the same netns - err = etchosts.Add(hostsPath, getLocalhostHostEntry(c)) - lock.Unlock() - if err != nil { - return fmt.Errorf("error creating hosts file for container %s which depends on container %s: %w", c.ID(), depCtr.ID(), err) - } - - // finally, save it in the new container - err = c.mountIntoRootDirs(config.DefaultHostsFile, hostsPath) - if err != nil { - return fmt.Errorf("error assigning mounts to container %s: %w", c.ID(), err) - } - } - - if !hasCurrentUserMapped(c) { - if err := makeAccessible(resolvPath, c.RootUID(), c.RootGID()); err != nil { - return err - } - if err := makeAccessible(hostsPath, c.RootUID(), c.RootGID()); err != nil { - return err - } - } - } else { - if !c.config.UseImageResolvConf { - if err := c.generateResolvConf(); err != nil { - return fmt.Errorf("error creating resolv.conf for container %s: %w", c.ID(), err) - } - } - - if !c.config.UseImageHosts { - if err := c.createHosts(); err != nil { - return fmt.Errorf("error creating hosts file for container %s: %w", c.ID(), err) - } - } - } - - if c.state.BindMounts["/etc/hosts"] != "" { - if err := c.relabel(c.state.BindMounts["/etc/hosts"], c.config.MountLabel, true); err != nil { - return err - } - } - - if c.state.BindMounts["/etc/resolv.conf"] != "" { - if err := c.relabel(c.state.BindMounts["/etc/resolv.conf"], c.config.MountLabel, true); err != nil { - return err - } - } - } else if !c.config.UseImageHosts && c.state.BindMounts["/etc/hosts"] == "" { - if err := c.createHosts(); err != nil { - return fmt.Errorf("error creating hosts file for container %s: 
%w", c.ID(), err) - } - } - - if c.config.ShmDir != "" { - // If ShmDir has a value SHM is always added when we mount the container - c.state.BindMounts["/dev/shm"] = c.config.ShmDir - } - - if c.config.Passwd == nil || *c.config.Passwd { - newPasswd, newGroup, err := c.generatePasswdAndGroup() - if err != nil { - return fmt.Errorf("error creating temporary passwd file for container %s: %w", c.ID(), err) - } - if newPasswd != "" { - // Make /etc/passwd - // If it already exists, delete so we can recreate - delete(c.state.BindMounts, "/etc/passwd") - c.state.BindMounts["/etc/passwd"] = newPasswd - } - if newGroup != "" { - // Make /etc/group - // If it already exists, delete so we can recreate - delete(c.state.BindMounts, "/etc/group") - c.state.BindMounts["/etc/group"] = newGroup - } - } - - // Make /etc/hostname - // This should never change, so no need to recreate if it exists - if _, ok := c.state.BindMounts["/etc/hostname"]; !ok { - hostnamePath, err := c.writeStringToRundir("hostname", c.Hostname()) - if err != nil { - return fmt.Errorf("error creating hostname file for container %s: %w", c.ID(), err) - } - c.state.BindMounts["/etc/hostname"] = hostnamePath - } - - // Make /etc/localtime - ctrTimezone := c.Timezone() - if ctrTimezone != "" { - // validate the format of the timezone specified if it's not "local" - if ctrTimezone != "local" { - _, err = time.LoadLocation(ctrTimezone) - if err != nil { - return fmt.Errorf("error finding timezone for container %s: %w", c.ID(), err) - } - } - if _, ok := c.state.BindMounts["/etc/localtime"]; !ok { - var zonePath string - if ctrTimezone == "local" { - zonePath, err = filepath.EvalSymlinks("/etc/localtime") - if err != nil { - return fmt.Errorf("error finding local timezone for container %s: %w", c.ID(), err) - } - } else { - zone := filepath.Join("/usr/share/zoneinfo", ctrTimezone) - zonePath, err = filepath.EvalSymlinks(zone) - if err != nil { - return fmt.Errorf("error setting timezone for container %s: %w", 
c.ID(), err) - } - } - localtimePath, err := c.copyTimezoneFile(zonePath) - if err != nil { - return fmt.Errorf("error setting timezone for container %s: %w", c.ID(), err) - } - c.state.BindMounts["/etc/localtime"] = localtimePath - } - } - - _, hasRunContainerenv := c.state.BindMounts["/run/.containerenv"] - if !hasRunContainerenv { - // check in the spec mounts - for _, m := range c.config.Spec.Mounts { - if m.Destination == "/run/.containerenv" || m.Destination == "/run" { - hasRunContainerenv = true - break - } - } - } - - // Make .containerenv if it does not exist - if !hasRunContainerenv { - containerenv := c.runtime.graphRootMountedFlag(c.config.Spec.Mounts) - isRootless := 0 - if rootless.IsRootless() { - isRootless = 1 - } - imageID, imageName := c.Image() - - if c.Privileged() { - // Populate the .containerenv with container information - containerenv = fmt.Sprintf(`engine="podman-%s" -name=%q -id=%q -image=%q -imageid=%q -rootless=%d -%s`, version.Version.String(), c.Name(), c.ID(), imageName, imageID, isRootless, containerenv) - } - containerenvPath, err := c.writeStringToRundir(".containerenv", containerenv) - if err != nil { - return fmt.Errorf("error creating containerenv file for container %s: %w", c.ID(), err) - } - c.state.BindMounts["/run/.containerenv"] = containerenvPath - } - - // Add Subscription Mounts - subscriptionMounts := subscriptions.MountsWithUIDGID(c.config.MountLabel, c.state.RunDir, c.runtime.config.Containers.DefaultMountsFile, c.state.Mountpoint, c.RootUID(), c.RootGID(), rootless.IsRootless(), false) - for _, mount := range subscriptionMounts { - if _, ok := c.state.BindMounts[mount.Destination]; !ok { - c.state.BindMounts[mount.Destination] = mount.Source - } - } - - // Secrets are mounted by getting the secret data from the secrets manager, - // copying the data into the container's static dir, - // then mounting the copied dir into /run/secrets. 
- // The secrets mounting must come after subscription mounts, since subscription mounts - // creates the /run/secrets dir in the container where we mount as well. - if len(c.Secrets()) > 0 { - // create /run/secrets if subscriptions did not create - if err := c.createSecretMountDir(); err != nil { - return fmt.Errorf("error creating secrets mount: %w", err) - } - for _, secret := range c.Secrets() { - secretFileName := secret.Name - base := "/run/secrets" - if secret.Target != "" { - secretFileName = secret.Target - // If absolute path for target given remove base. - if filepath.IsAbs(secretFileName) { - base = "" - } - } - src := filepath.Join(c.config.SecretsPath, secret.Name) - dest := filepath.Join(base, secretFileName) - c.state.BindMounts[dest] = src - } - } - - return nil -} - -// generateResolvConf generates a containers resolv.conf -func (c *Container) generateResolvConf() error { - var ( - networkNameServers []string - networkSearchDomains []string - ) - - netStatus := c.getNetworkStatus() - for _, status := range netStatus { - if status.DNSServerIPs != nil { - for _, nsIP := range status.DNSServerIPs { - networkNameServers = append(networkNameServers, nsIP.String()) - } - logrus.Debugf("Adding nameserver(s) from network status of '%q'", status.DNSServerIPs) - } - if status.DNSSearchDomains != nil { - networkSearchDomains = append(networkSearchDomains, status.DNSSearchDomains...) - logrus.Debugf("Adding search domain(s) from network status of '%q'", status.DNSSearchDomains) - } - } - - ipv6, err := c.checkForIPv6(netStatus) - if err != nil { - return err - } - - nameservers := make([]string, 0, len(c.runtime.config.Containers.DNSServers)+len(c.config.DNSServer)) - nameservers = append(nameservers, c.runtime.config.Containers.DNSServers...) 
- for _, ip := range c.config.DNSServer { - nameservers = append(nameservers, ip.String()) - } - // If the user provided dns, it trumps all; then dns masq; then resolv.conf - var search []string - keepHostServers := false - if len(nameservers) == 0 { - keepHostServers = true - // first add the nameservers from the networks status - nameservers = networkNameServers - // when we add network dns server we also have to add the search domains - search = networkSearchDomains - // slirp4netns has a built in DNS forwarder. - if c.config.NetMode.IsSlirp4netns() { - slirp4netnsDNS, err := GetSlirp4netnsDNS(c.slirp4netnsSubnet) - if err != nil { - logrus.Warn("Failed to determine Slirp4netns DNS: ", err.Error()) - } else { - nameservers = append(nameservers, slirp4netnsDNS.String()) - } - } - } - - if len(c.config.DNSSearch) > 0 || len(c.runtime.config.Containers.DNSSearches) > 0 { - customSearch := make([]string, 0, len(c.config.DNSSearch)+len(c.runtime.config.Containers.DNSSearches)) - customSearch = append(customSearch, c.runtime.config.Containers.DNSSearches...) - customSearch = append(customSearch, c.config.DNSSearch...) - search = customSearch - } - - options := make([]string, 0, len(c.config.DNSOption)+len(c.runtime.config.Containers.DNSOptions)) - options = append(options, c.runtime.config.Containers.DNSOptions...) - options = append(options, c.config.DNSOption...) - - destPath := filepath.Join(c.state.RunDir, "resolv.conf") - - if err := resolvconf.New(&resolvconf.Params{ - IPv6Enabled: ipv6, - KeepHostServers: keepHostServers, - Nameservers: nameservers, - Namespaces: c.config.Spec.Linux.Namespaces, - Options: options, - Path: destPath, - Searches: search, - }); err != nil { - return fmt.Errorf("error building resolv.conf for container %s: %w", c.ID(), err) - } - - return c.bindMountRootFile(destPath, resolvconf.DefaultResolvConf) -} - -// Check if a container uses IPv6. 
-func (c *Container) checkForIPv6(netStatus map[string]types.StatusBlock) (bool, error) { - for _, status := range netStatus { - for _, netInt := range status.Interfaces { - for _, netAddress := range netInt.Subnets { - // Note: only using To16() does not work since it also returns a valid ip for ipv4 - if netAddress.IPNet.IP.To4() == nil && netAddress.IPNet.IP.To16() != nil { - return true, nil - } - } - } - } - - if c.config.NetMode.IsSlirp4netns() { - ctrNetworkSlipOpts := []string{} - if c.config.NetworkOptions != nil { - ctrNetworkSlipOpts = append(ctrNetworkSlipOpts, c.config.NetworkOptions["slirp4netns"]...) - } - slirpOpts, err := parseSlirp4netnsNetworkOptions(c.runtime, ctrNetworkSlipOpts) - if err != nil { - return false, err - } - return slirpOpts.enableIPv6, nil - } - - return false, nil -} - -// Add a new nameserver to the container's resolv.conf, ensuring that it is the -// first nameserver present. -// Usable only with running containers. -func (c *Container) addNameserver(ips []string) error { - // Take no action if container is not running. - if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { - return nil - } - - // Do we have a resolv.conf at all? - path, ok := c.state.BindMounts[resolvconf.DefaultResolvConf] - if !ok { - return nil - } - - if err := resolvconf.Add(path, ips); err != nil { - return fmt.Errorf("adding new nameserver to container %s resolv.conf: %w", c.ID(), err) - } - - return nil -} - -// Remove an entry from the existing resolv.conf of the container. -// Usable only with running containers. -func (c *Container) removeNameserver(ips []string) error { - // Take no action if container is not running. - if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { - return nil - } - - // Do we have a resolv.conf at all? 
- path, ok := c.state.BindMounts[resolvconf.DefaultResolvConf] - if !ok { - return nil - } - - if err := resolvconf.Remove(path, ips); err != nil { - return fmt.Errorf("removing nameservers from container %s resolv.conf: %w", c.ID(), err) - } - - return nil -} - -func getLocalhostHostEntry(c *Container) etchosts.HostEntries { - return etchosts.HostEntries{{IP: "127.0.0.1", Names: []string{c.Hostname(), c.config.Name}}} -} - -// getHostsEntries returns the container ip host entries for the correct netmode -func (c *Container) getHostsEntries() (etchosts.HostEntries, error) { - var entries etchosts.HostEntries - names := []string{c.Hostname(), c.config.Name} - switch { - case c.config.NetMode.IsBridge(): - entries = etchosts.GetNetworkHostEntries(c.state.NetworkStatus, names...) - case c.config.NetMode.IsSlirp4netns(): - ip, err := GetSlirp4netnsIP(c.slirp4netnsSubnet) - if err != nil { - return nil, err - } - entries = etchosts.HostEntries{{IP: ip.String(), Names: names}} - default: - // check for net=none - if !c.config.CreateNetNS { - for _, ns := range c.config.Spec.Linux.Namespaces { - if ns.Type == spec.NetworkNamespace { - if ns.Path == "" { - entries = etchosts.HostEntries{{IP: "127.0.0.1", Names: names}} - } - break - } - } - } - } - return entries, nil -} - -func (c *Container) createHosts() error { - var containerIPsEntries etchosts.HostEntries - var err error - // if we configure the netns after the container create we should not add - // the hosts here since we have no information about the actual ips - // instead we will add them in c.completeNetworkSetup() - if !c.config.PostConfigureNetNS { - containerIPsEntries, err = c.getHostsEntries() - if err != nil { - return fmt.Errorf("failed to get container ip host entries: %w", err) - } - } - baseHostFile, err := etchosts.GetBaseHostFile(c.runtime.config.Containers.BaseHostsFile, c.state.Mountpoint) - if err != nil { - return err - } - - targetFile := filepath.Join(c.state.RunDir, "hosts") - err = 
etchosts.New(&etchosts.Params{ - BaseFile: baseHostFile, - ExtraHosts: c.config.HostAdd, - ContainerIPs: containerIPsEntries, - HostContainersInternalIP: etchosts.GetHostContainersInternalIP(c.runtime.config, c.state.NetworkStatus, c.runtime.network), - TargetFile: targetFile, - }) - if err != nil { - return err - } - - return c.bindMountRootFile(targetFile, config.DefaultHostsFile) -} - -// bindMountRootFile will chown and relabel the source file to make it usable in the container. -// It will also add the path to the container bind mount map. -// source is the path on the host, dest is the path in the container. -func (c *Container) bindMountRootFile(source, dest string) error { - if err := os.Chown(source, c.RootUID(), c.RootGID()); err != nil { - return err - } - if err := label.Relabel(source, c.MountLabel(), false); err != nil { - return err - } - - return c.mountIntoRootDirs(dest, source) -} - -// generateGroupEntry generates an entry or entries into /etc/group as -// required by container configuration. -// Generally speaking, we will make an entry under two circumstances: -// 1. The container is started as a specific user:group, and that group is both -// numeric, and does not already exist in /etc/group. -// 2. It is requested that Libpod add the group that launched Podman to -// /etc/group via AddCurrentUserPasswdEntry (though this does not trigger if -// the group in question already exists in /etc/passwd). -// Returns group entry (as a string that can be appended to /etc/group) and any -// error that occurred. -func (c *Container) generateGroupEntry() (string, error) { - groupString := "" - - // Things we *can't* handle: adding the user we added in - // generatePasswdEntry to any *existing* groups. 
- addedGID := 0 - if c.config.AddCurrentUserPasswdEntry { - entry, gid, err := c.generateCurrentUserGroupEntry() - if err != nil { - return "", err - } - groupString += entry - addedGID = gid - } - if c.config.User != "" { - entry, err := c.generateUserGroupEntry(addedGID) - if err != nil { - return "", err - } - groupString += entry - } - - return groupString, nil -} - -// Make an entry in /etc/group for the group of the user running podman iff we -// are rootless. -func (c *Container) generateCurrentUserGroupEntry() (string, int, error) { - gid := rootless.GetRootlessGID() - if gid == 0 { - return "", 0, nil - } - - g, err := user.LookupGroupId(strconv.Itoa(gid)) - if err != nil { - return "", 0, fmt.Errorf("failed to get current group: %w", err) - } - - // Look up group name to see if it exists in the image. - _, err = lookup.GetGroup(c.state.Mountpoint, g.Name) - if err != runcuser.ErrNoGroupEntries { - return "", 0, err - } - - // Look up GID to see if it exists in the image. - _, err = lookup.GetGroup(c.state.Mountpoint, g.Gid) - if err != runcuser.ErrNoGroupEntries { - return "", 0, err - } - - // We need to get the username of the rootless user so we can add it to - // the group. - username := "" - uid := rootless.GetRootlessUID() - if uid != 0 { - u, err := user.LookupId(strconv.Itoa(uid)) - if err != nil { - return "", 0, fmt.Errorf("failed to get current user to make group entry: %w", err) - } - username = u.Username - } - - // Make the entry. - return fmt.Sprintf("%s:x:%s:%s\n", g.Name, g.Gid, username), gid, nil -} - -// Make an entry in /etc/group for the group the container was specified to run -// as. 
-func (c *Container) generateUserGroupEntry(addedGID int) (string, error) { - if c.config.User == "" { - return "", nil - } - - splitUser := strings.SplitN(c.config.User, ":", 2) - group := splitUser[0] - if len(splitUser) > 1 { - group = splitUser[1] - } - - gid, err := strconv.ParseUint(group, 10, 32) - if err != nil { - return "", nil //nolint: nilerr - } - - if addedGID != 0 && addedGID == int(gid) { - return "", nil - } - - // Check if the group already exists - _, err = lookup.GetGroup(c.state.Mountpoint, group) - if err != runcuser.ErrNoGroupEntries { - return "", err - } - - return fmt.Sprintf("%d:x:%d:%s\n", gid, gid, splitUser[0]), nil -} - -// generatePasswdEntry generates an entry or entries into /etc/passwd as -// required by container configuration. -// Generally speaking, we will make an entry under two circumstances: -// 1. The container is started as a specific user who is not in /etc/passwd. -// This only triggers if the user is given as a *numeric* ID. -// 2. It is requested that Libpod add the user that launched Podman to -// /etc/passwd via AddCurrentUserPasswdEntry (though this does not trigger if -// the user in question already exists in /etc/passwd) or the UID to be added -// is 0). -// 3. The user specified additional host user accounts to add the the /etc/passwd file -// Returns password entry (as a string that can be appended to /etc/passwd) and -// any error that occurred. 
-func (c *Container) generatePasswdEntry() (string, error) { - passwdString := "" - - addedUID := 0 - for _, userid := range c.config.HostUsers { - // Look up User on host - u, err := util.LookupUser(userid) - if err != nil { - return "", err - } - entry, err := c.userPasswdEntry(u) - if err != nil { - return "", err - } - passwdString += entry - } - if c.config.AddCurrentUserPasswdEntry { - entry, uid, _, err := c.generateCurrentUserPasswdEntry() - if err != nil { - return "", err - } - passwdString += entry - addedUID = uid - } - if c.config.User != "" { - entry, err := c.generateUserPasswdEntry(addedUID) - if err != nil { - return "", err - } - passwdString += entry - } - - return passwdString, nil -} - -// generateCurrentUserPasswdEntry generates an /etc/passwd entry for the user -// running the container engine. -// Returns a passwd entry for the user, and the UID and GID of the added entry. -func (c *Container) generateCurrentUserPasswdEntry() (string, int, int, error) { - uid := rootless.GetRootlessUID() - if uid == 0 { - return "", 0, 0, nil - } - - u, err := user.LookupId(strconv.Itoa(uid)) - if err != nil { - return "", 0, 0, fmt.Errorf("failed to get current user: %w", err) - } - pwd, err := c.userPasswdEntry(u) - if err != nil { - return "", 0, 0, err - } - - return pwd, uid, rootless.GetRootlessGID(), nil -} - -func (c *Container) userPasswdEntry(u *user.User) (string, error) { - // Look up the user to see if it exists in the container image. - _, err := lookup.GetUser(c.state.Mountpoint, u.Username) - if err != runcuser.ErrNoPasswdEntries { - return "", err - } - - // Look up the UID to see if it exists in the container image. - _, err = lookup.GetUser(c.state.Mountpoint, u.Uid) - if err != runcuser.ErrNoPasswdEntries { - return "", err - } - - // If the user's actual home directory exists, or was mounted in - use - // that. 
- homeDir := c.WorkingDir() - hDir := u.HomeDir - for hDir != "/" { - if MountExists(c.config.Spec.Mounts, hDir) { - homeDir = u.HomeDir - break - } - hDir = filepath.Dir(hDir) - } - if homeDir != u.HomeDir { - for _, hDir := range c.UserVolumes() { - if hDir == u.HomeDir { - homeDir = u.HomeDir - break - } - } - } - // Set HOME environment if not already set - hasHomeSet := false - for _, s := range c.config.Spec.Process.Env { - if strings.HasPrefix(s, "HOME=") { - hasHomeSet = true - break - } - } - if !hasHomeSet { - c.config.Spec.Process.Env = append(c.config.Spec.Process.Env, fmt.Sprintf("HOME=%s", homeDir)) - } - if c.config.PasswdEntry != "" { - return c.passwdEntry(u.Username, u.Uid, u.Gid, u.Name, homeDir), nil - } - - return fmt.Sprintf("%s:*:%s:%s:%s:%s:/bin/sh\n", u.Username, u.Uid, u.Gid, u.Name, homeDir), nil -} - -// generateUserPasswdEntry generates an /etc/passwd entry for the container user -// to run in the container. -// The UID and GID of the added entry will also be returned. -// Accepts one argument, that being any UID that has already been added to the -// passwd file by other functions; if it matches the UID we were given, we don't -// need to do anything. 
-func (c *Container) generateUserPasswdEntry(addedUID int) (string, error) { - var ( - groupspec string - gid int - ) - if c.config.User == "" { - return "", nil - } - splitSpec := strings.SplitN(c.config.User, ":", 2) - userspec := splitSpec[0] - if len(splitSpec) > 1 { - groupspec = splitSpec[1] - } - // If a non numeric User, then don't generate passwd - uid, err := strconv.ParseUint(userspec, 10, 32) - if err != nil { - return "", nil //nolint: nilerr - } - - if addedUID != 0 && int(uid) == addedUID { - return "", nil - } - - // Look up the user to see if it exists in the container image - _, err = lookup.GetUser(c.state.Mountpoint, userspec) - if err != runcuser.ErrNoPasswdEntries { - return "", err - } - - if groupspec != "" { - ugid, err := strconv.ParseUint(groupspec, 10, 32) - if err == nil { - gid = int(ugid) - } else { - group, err := lookup.GetGroup(c.state.Mountpoint, groupspec) - if err != nil { - return "", fmt.Errorf("unable to get gid %s from group file: %w", groupspec, err) - } - gid = group.Gid - } - } - - if c.config.PasswdEntry != "" { - entry := c.passwdEntry(fmt.Sprintf("%d", uid), fmt.Sprintf("%d", uid), fmt.Sprintf("%d", gid), "container user", c.WorkingDir()) - return entry, nil - } - - return fmt.Sprintf("%d:*:%d:%d:container user:%s:/bin/sh\n", uid, uid, gid, c.WorkingDir()), nil -} - -func (c *Container) passwdEntry(username string, uid, gid, name, homeDir string) string { - s := c.config.PasswdEntry - s = strings.ReplaceAll(s, "$USERNAME", username) - s = strings.ReplaceAll(s, "$UID", uid) - s = strings.ReplaceAll(s, "$GID", gid) - s = strings.ReplaceAll(s, "$NAME", name) - s = strings.ReplaceAll(s, "$HOME", homeDir) - return s + "\n" -} - -// generatePasswdAndGroup generates container-specific passwd and group files -// iff g.config.User is a number or we are configured to make a passwd entry for -// the current user or the user specified HostsUsers -// Returns path to file to mount at /etc/passwd, path to file to mount at -// 
/etc/group, and any error that occurred. If no passwd/group file were -// required, the empty string will be returned for those path (this may occur -// even if no error happened). -// This may modify the mounted container's /etc/passwd and /etc/group instead of -// making copies to bind-mount in, so we don't break useradd (it wants to make a -// copy of /etc/passwd and rename the copy to /etc/passwd, which is impossible -// with a bind mount). This is done in cases where the container is *not* -// read-only. In this case, the function will return nothing ("", "", nil). -func (c *Container) generatePasswdAndGroup() (string, string, error) { - if !c.config.AddCurrentUserPasswdEntry && c.config.User == "" && - len(c.config.HostUsers) == 0 { - return "", "", nil - } - - needPasswd := true - needGroup := true - - // First, check if there's a mount at /etc/passwd or group, we don't - // want to interfere with user mounts. - if MountExists(c.config.Spec.Mounts, "/etc/passwd") { - needPasswd = false - } - if MountExists(c.config.Spec.Mounts, "/etc/group") { - needGroup = false - } - - // Next, check if we already made the files. If we didn't, don't need to - // do anything more. - if needPasswd { - passwdPath := filepath.Join(c.config.StaticDir, "passwd") - if _, err := os.Stat(passwdPath); err == nil { - needPasswd = false - } - } - if needGroup { - groupPath := filepath.Join(c.config.StaticDir, "group") - if _, err := os.Stat(groupPath); err == nil { - needGroup = false - } - } - - // If we don't need a /etc/passwd or /etc/group at this point we can - // just return. 
- if !needPasswd && !needGroup { - return "", "", nil - } - - passwdPath := "" - groupPath := "" - - ro := c.IsReadOnly() - - if needPasswd { - passwdEntry, err := c.generatePasswdEntry() - if err != nil { - return "", "", err - } - - needsWrite := passwdEntry != "" - switch { - case ro && needsWrite: - logrus.Debugf("Making /etc/passwd for container %s", c.ID()) - originPasswdFile, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/passwd") - if err != nil { - return "", "", fmt.Errorf("error creating path to container %s /etc/passwd: %w", c.ID(), err) - } - orig, err := ioutil.ReadFile(originPasswdFile) - if err != nil && !os.IsNotExist(err) { - return "", "", err - } - passwdFile, err := c.writeStringToStaticDir("passwd", string(orig)+passwdEntry) - if err != nil { - return "", "", fmt.Errorf("failed to create temporary passwd file: %w", err) - } - if err := os.Chmod(passwdFile, 0644); err != nil { - return "", "", err - } - passwdPath = passwdFile - case !ro && needsWrite: - logrus.Debugf("Modifying container %s /etc/passwd", c.ID()) - containerPasswd, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/passwd") - if err != nil { - return "", "", fmt.Errorf("error looking up location of container %s /etc/passwd: %w", c.ID(), err) - } - - f, err := os.OpenFile(containerPasswd, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) - if err != nil { - return "", "", fmt.Errorf("container %s: %w", c.ID(), err) - } - defer f.Close() - - if _, err := f.WriteString(passwdEntry); err != nil { - return "", "", fmt.Errorf("unable to append to container %s /etc/passwd: %w", c.ID(), err) - } - default: - logrus.Debugf("Not modifying container %s /etc/passwd", c.ID()) - } - } - if needGroup { - groupEntry, err := c.generateGroupEntry() - if err != nil { - return "", "", err - } - - needsWrite := groupEntry != "" - switch { - case ro && needsWrite: - logrus.Debugf("Making /etc/group for container %s", c.ID()) - originGroupFile, err := securejoin.SecureJoin(c.state.Mountpoint, 
"/etc/group") - if err != nil { - return "", "", fmt.Errorf("error creating path to container %s /etc/group: %w", c.ID(), err) - } - orig, err := ioutil.ReadFile(originGroupFile) - if err != nil && !os.IsNotExist(err) { - return "", "", err - } - groupFile, err := c.writeStringToStaticDir("group", string(orig)+groupEntry) - if err != nil { - return "", "", fmt.Errorf("failed to create temporary group file: %w", err) - } - if err := os.Chmod(groupFile, 0644); err != nil { - return "", "", err - } - groupPath = groupFile - case !ro && needsWrite: - logrus.Debugf("Modifying container %s /etc/group", c.ID()) - containerGroup, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/group") - if err != nil { - return "", "", fmt.Errorf("error looking up location of container %s /etc/group: %w", c.ID(), err) - } - - f, err := os.OpenFile(containerGroup, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) - if err != nil { - return "", "", fmt.Errorf("container %s: %w", c.ID(), err) - } - defer f.Close() - - if _, err := f.WriteString(groupEntry); err != nil { - return "", "", fmt.Errorf("unable to append to container %s /etc/group: %w", c.ID(), err) - } - default: - logrus.Debugf("Not modifying container %s /etc/group", c.ID()) - } - } - - return passwdPath, groupPath, nil -} - func isRootlessCgroupSet(cgroup string) bool { // old versions of podman were setting the CgroupParent to CgroupfsDefaultCgroupParent // by default. 
Avoid breaking these versions and check whether the cgroup parent is @@ -3058,198 +398,257 @@ func (c *Container) getOCICgroupPath() (string, error) { } } -func (c *Container) copyTimezoneFile(zonePath string) (string, error) { - localtimeCopy := filepath.Join(c.state.RunDir, "localtime") - file, err := os.Stat(zonePath) - if err != nil { - return "", err - } - if file.IsDir() { - return "", errors.New("invalid timezone: is a directory") - } - src, err := os.Open(zonePath) - if err != nil { - return "", err - } - defer src.Close() - dest, err := os.Create(localtimeCopy) - if err != nil { - return "", err - } - defer dest.Close() - _, err = io.Copy(dest, src) - if err != nil { - return "", err - } - if err := c.relabel(localtimeCopy, c.config.MountLabel, false); err != nil { - return "", err +// If the container is rootless, set up the slirp4netns network +func (c *Container) setupRootlessNetwork() error { + // set up slirp4netns again because slirp4netns will die when conmon exits + if c.config.NetMode.IsSlirp4netns() { + err := c.runtime.setupSlirp4netns(c, c.state.NetNS) + if err != nil { + return err + } } - if err := dest.Chown(c.RootUID(), c.RootGID()); err != nil { - return "", err + + // set up rootlesskit port forwarder again since it dies when conmon exits + // we use rootlesskit port forwarder only as rootless and when bridge network is used + if rootless.IsRootless() && c.config.NetMode.IsBridge() && len(c.config.PortMappings) > 0 { + err := c.runtime.setupRootlessPortMappingViaRLK(c, c.state.NetNS.Path(), c.state.NetworkStatus) + if err != nil { + return err + } } - return localtimeCopy, err + return nil } -func (c *Container) cleanupOverlayMounts() error { - return overlay.CleanupContent(c.config.StaticDir) +func openDirectory(path string) (fd int, err error) { + return unix.Open(path, unix.O_RDONLY|unix.O_PATH, 0) } -// Creates and mounts an empty dir to mount secrets into, if it does not already exist -func (c *Container) createSecretMountDir() error 
{ - src := filepath.Join(c.state.RunDir, "/run/secrets") - _, err := os.Stat(src) - if os.IsNotExist(err) { - oldUmask := umask.Set(0) - defer umask.Set(oldUmask) +func (c *Container) addNetworkNamespace(g *generate.Generator) error { + if c.config.CreateNetNS { + if c.config.PostConfigureNetNS { + if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), ""); err != nil { + return err + } + } else { + if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), c.state.NetNS.Path()); err != nil { + return err + } + } + } + return nil +} + +func (c *Container) addSystemdMounts(g *generate.Generator) error { + if c.Systemd() { + if err := c.setupSystemd(g.Mounts(), *g); err != nil { + return fmt.Errorf("error adding systemd-specific mounts: %w", err) + } + } + return nil +} - if err := os.MkdirAll(src, 0755); err != nil { +func (c *Container) addSharedNamespaces(g *generate.Generator) error { + if c.config.IPCNsCtr != "" { + if err := c.addNamespaceContainer(g, IPCNS, c.config.IPCNsCtr, spec.IPCNamespace); err != nil { return err } - if err := label.Relabel(src, c.config.MountLabel, false); err != nil { + } + if c.config.MountNsCtr != "" { + if err := c.addNamespaceContainer(g, MountNS, c.config.MountNsCtr, spec.MountNamespace); err != nil { return err } - if err := os.Chown(src, c.RootUID(), c.RootGID()); err != nil { + } + if c.config.NetNsCtr != "" { + if err := c.addNamespaceContainer(g, NetNS, c.config.NetNsCtr, spec.NetworkNamespace); err != nil { return err } - c.state.BindMounts["/run/secrets"] = src - return nil + } + if c.config.PIDNsCtr != "" { + if err := c.addNamespaceContainer(g, PIDNS, c.config.PIDNsCtr, spec.PIDNamespace); err != nil { + return err + } + } + if c.config.UserNsCtr != "" { + if err := c.addNamespaceContainer(g, UserNS, c.config.UserNsCtr, spec.UserNamespace); err != nil { + return err + } + if len(g.Config.Linux.UIDMappings) == 0 { + // runc complains if no mapping is specified, even if we join another ns. 
So provide a dummy mapping + g.AddLinuxUIDMapping(uint32(0), uint32(0), uint32(1)) + g.AddLinuxGIDMapping(uint32(0), uint32(0), uint32(1)) + } } - return err -} - -// Fix ownership and permissions of the specified volume if necessary. -func (c *Container) fixVolumePermissions(v *ContainerNamedVolume) error { - vol, err := c.runtime.state.Volume(v.Name) + availableUIDs, availableGIDs, err := rootless.GetAvailableIDMaps() if err != nil { - return fmt.Errorf("error retrieving named volume %s for container %s: %w", v.Name, c.ID(), err) + if os.IsNotExist(err) { + // The kernel-provided files only exist if user namespaces are supported + logrus.Debugf("User or group ID mappings not available: %s", err) + } else { + return err + } + } else { + g.Config.Linux.UIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.UIDMappings, availableUIDs) + g.Config.Linux.GIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.GIDMappings, availableGIDs) } - vol.lock.Lock() - defer vol.lock.Unlock() + // Hostname handling: + // If we have a UTS namespace, set Hostname in the OCI spec. + // Set the HOSTNAME environment variable unless explicitly overridden by + // the user (already present in OCI spec). If we don't have a UTS ns, + // set it to the host's hostname instead. + hostname := c.Hostname() + foundUTS := false - // The volume may need a copy-up. Check the state. - if err := vol.update(); err != nil { - return err + for _, i := range c.config.Spec.Linux.Namespaces { + if i.Type == spec.UTSNamespace && i.Path == "" { + foundUTS = true + g.SetHostname(hostname) + break + } } - - // Volumes owned by a volume driver are not chowned - we don't want to - // mess with a mount not managed by us. 
- if vol.state.NeedsChown && !vol.UsesVolumeDriver() { - vol.state.NeedsChown = false - - uid := int(c.config.Spec.Process.User.UID) - gid := int(c.config.Spec.Process.User.GID) - - if c.config.IDMappings.UIDMap != nil { - p := idtools.IDPair{ - UID: uid, - GID: gid, - } - mappings := idtools.NewIDMappingsFromMaps(c.config.IDMappings.UIDMap, c.config.IDMappings.GIDMap) - newPair, err := mappings.ToHost(p) - if err != nil { - return fmt.Errorf("error mapping user %d:%d: %w", uid, gid, err) - } - uid = newPair.UID - gid = newPair.GID + if !foundUTS { + tmpHostname, err := os.Hostname() + if err != nil { + return err } + hostname = tmpHostname + } + needEnv := true + for _, checkEnv := range g.Config.Process.Env { + if strings.SplitN(checkEnv, "=", 2)[0] == "HOSTNAME" { + needEnv = false + break + } + } + if needEnv { + g.AddProcessEnv("HOSTNAME", hostname) + } - vol.state.UIDChowned = uid - vol.state.GIDChowned = gid - - if err := vol.save(); err != nil { + if c.config.UTSNsCtr != "" { + if err := c.addNamespaceContainer(g, UTSNS, c.config.UTSNsCtr, spec.UTSNamespace); err != nil { return err } - - mountPoint, err := vol.MountPoint() - if err != nil { + } + if c.config.CgroupNsCtr != "" { + if err := c.addNamespaceContainer(g, CgroupNS, c.config.CgroupNsCtr, spec.CgroupNamespace); err != nil { return err } + } - if err := os.Lchown(mountPoint, uid, gid); err != nil { + if c.config.UserNsCtr == "" && c.config.IDMappings.AutoUserNs { + if err := g.AddOrReplaceLinuxNamespace(string(spec.UserNamespace), ""); err != nil { return err } + g.ClearLinuxUIDMappings() + for _, uidmap := range c.config.IDMappings.UIDMap { + g.AddLinuxUIDMapping(uint32(uidmap.HostID), uint32(uidmap.ContainerID), uint32(uidmap.Size)) + } + g.ClearLinuxGIDMappings() + for _, gidmap := range c.config.IDMappings.GIDMap { + g.AddLinuxGIDMapping(uint32(gidmap.HostID), uint32(gidmap.ContainerID), uint32(gidmap.Size)) + } + } + return nil +} - // Make sure the new volume matches the permissions of the 
target directory. - // https://github.com/containers/podman/issues/10188 - st, err := os.Lstat(filepath.Join(c.state.Mountpoint, v.Dest)) - if err == nil { - if stat, ok := st.Sys().(*syscall.Stat_t); ok { - if err := os.Lchown(mountPoint, int(stat.Uid), int(stat.Gid)); err != nil { - return err +func (c *Container) addRootPropagation(g *generate.Generator, mounts []spec.Mount) error { + // Determine property of RootPropagation based on volume properties. If + // a volume is shared, then keep root propagation shared. This should + // work for slave and private volumes too. + // + // For slave volumes, it can be either [r]shared/[r]slave. + // + // For private volumes any root propagation value should work. + rootPropagation := "" + for _, m := range mounts { + for _, opt := range m.Options { + switch opt { + case MountShared, MountRShared: + if rootPropagation != MountShared && rootPropagation != MountRShared { + rootPropagation = MountShared + } + case MountSlave, MountRSlave: + if rootPropagation != MountShared && rootPropagation != MountRShared && rootPropagation != MountSlave && rootPropagation != MountRSlave { + rootPropagation = MountRSlave } } - if err := os.Chmod(mountPoint, st.Mode()); err != nil { - return err - } - stat := st.Sys().(*syscall.Stat_t) - atime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec)) //nolint: unconvert - if err := os.Chtimes(mountPoint, atime, st.ModTime()); err != nil { - return err - } - } else if !os.IsNotExist(err) { + } + } + if rootPropagation != "" { + logrus.Debugf("Set root propagation to %q", rootPropagation) + if err := g.SetLinuxRootPropagation(rootPropagation); err != nil { return err } } return nil } -func (c *Container) relabel(src, mountLabel string, recurse bool) error { - if !selinux.GetEnabled() || mountLabel == "" { - return nil - } - // only relabel on initial creation of container - if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateUnknown) { - label, err := label.FileLabel(src) 
- if err != nil { - return err - } - // If labels are different, might be on a tmpfs - if label == mountLabel { - return nil - } +func (c *Container) setProcessLabel(g *generate.Generator) { + g.SetProcessSelinuxLabel(c.ProcessLabel()) +} + +func (c *Container) setMountLabel(g *generate.Generator) { + g.SetLinuxMountLabel(c.MountLabel()) +} + +func (c *Container) setCgroupsPath(g *generate.Generator) error { + cgroupPath, err := c.getOCICgroupPath() + if err != nil { + return err } - return label.Relabel(src, mountLabel, recurse) + g.SetLinuxCgroupsPath(cgroupPath) + return nil } -func (c *Container) ChangeHostPathOwnership(src string, recurse bool, uid, gid int) error { - // only chown on initial creation of container - if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateUnknown) { - st, err := os.Stat(src) +func (c *Container) addSlirp4netnsDNS(nameservers []string) []string { + // slirp4netns has a built in DNS forwarder. + if c.config.NetMode.IsSlirp4netns() { + slirp4netnsDNS, err := GetSlirp4netnsDNS(c.slirp4netnsSubnet) if err != nil { - return err - } - - // If labels are different, might be on a tmpfs - if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { - return nil + logrus.Warn("Failed to determine Slirp4netns DNS: ", err.Error()) + } else { + nameservers = append(nameservers, slirp4netnsDNS.String()) } } - return chown.ChangeHostPathOwnership(src, recurse, uid, gid) + return nameservers } -// If the container is rootless, set up the slirp4netns network -func (c *Container) setupRootlessNetwork() error { - // set up slirp4netns again because slirp4netns will die when conmon exits +func (c *Container) isSlirp4netnsIPv6() (bool, error) { if c.config.NetMode.IsSlirp4netns() { - err := c.runtime.setupSlirp4netns(c, c.state.NetNS) + ctrNetworkSlipOpts := []string{} + if c.config.NetworkOptions != nil { + ctrNetworkSlipOpts = append(ctrNetworkSlipOpts, c.config.NetworkOptions["slirp4netns"]...) 
+ } + slirpOpts, err := parseSlirp4netnsNetworkOptions(c.runtime, ctrNetworkSlipOpts) if err != nil { - return err + return false, err } + return slirpOpts.enableIPv6, nil } - // set up rootlesskit port forwarder again since it dies when conmon exits - // we use rootlesskit port forwarder only as rootless and when bridge network is used - if rootless.IsRootless() && c.config.NetMode.IsBridge() && len(c.config.PortMappings) > 0 { - err := c.runtime.setupRootlessPortMappingViaRLK(c, c.state.NetNS.Path(), c.state.NetworkStatus) - if err != nil { - return err + return false, nil +} + +// check for net=none +func (c *Container) hasNetNone() bool { + if !c.config.CreateNetNS { + for _, ns := range c.config.Spec.Linux.Namespaces { + if ns.Type == spec.NetworkNamespace { + if ns.Path == "" { + return true + } + } } } - return nil + return false } -func openDirectory(path string) (fd int, err error) { - return unix.Open(path, unix.O_RDONLY|unix.O_PATH, 0) +func setVolumeAtime(mountPoint string, st os.FileInfo) error { + stat := st.Sys().(*syscall.Stat_t) + atime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec)) //nolint: unconvert + if err := os.Chtimes(mountPoint, atime, st.ModTime()); err != nil { + return err + } + return nil } diff --git a/libpod/container_internal_unsupported.go b/libpod/container_internal_unsupported.go index de92ff260..1967c577b 100644 --- a/libpod/container_internal_unsupported.go +++ b/libpod/container_internal_unsupported.go @@ -1,5 +1,5 @@ -//go:build !linux -// +build !linux +//go:build !linux && !freebsd +// +build !linux,!freebsd package libpod @@ -69,21 +69,21 @@ func (c *Container) restore(ctx context.Context, options ContainerCheckpointOpti // getHostsEntries returns the container ip host entries for the correct netmode func (c *Container) getHostsEntries() (etchosts.HostEntries, error) { - return nil, errors.New("unspported (*Container) getHostsEntries") + return nil, errors.New("unsupported (*Container) getHostsEntries") } // Fix 
ownership and permissions of the specified volume if necessary. func (c *Container) fixVolumePermissions(v *ContainerNamedVolume) error { - return errors.New("unspported (*Container) fixVolumePermissions") + return errors.New("unsupported (*Container) fixVolumePermissions") } func (c *Container) expectPodCgroup() (bool, error) { - return false, errors.New("unspported (*Container) expectPodCgroup") + return false, errors.New("unsupported (*Container) expectPodCgroup") } // Get cgroup path in a format suitable for the OCI spec func (c *Container) getOCICgroupPath() (string, error) { - return "", errors.New("unspported (*Container) getOCICgroupPath") + return "", errors.New("unsupported (*Container) getOCICgroupPath") } func getLocalhostHostEntry(c *Container) etchosts.HostEntries { diff --git a/libpod/container_linux.go b/libpod/container_linux.go index 8b517e69f..9c17a1966 100644 --- a/libpod/container_linux.go +++ b/libpod/container_linux.go @@ -5,6 +5,7 @@ package libpod import ( "github.com/containernetworking/plugins/pkg/ns" + spec "github.com/opencontainers/runtime-spec/specs-go" ) type containerPlatformState struct { @@ -13,3 +14,17 @@ type containerPlatformState struct { // told to join another container's network namespace NetNS ns.NetNS `json:"-"` } + +func networkDisabled(c *Container) (bool, error) { + if c.config.CreateNetNS { + return false, nil + } + if !c.config.PostConfigureNetNS { + for _, ns := range c.config.Spec.Linux.Namespaces { + if ns.Type == spec.NetworkNamespace { + return ns.Path == "", nil + } + } + } + return false, nil +} diff --git a/libpod/container_validate.go b/libpod/container_validate.go index da33f6db7..f4611ecce 100644 --- a/libpod/container_validate.go +++ b/libpod/container_validate.go @@ -137,5 +137,9 @@ func (c *Container) validate() error { if c.config.SdNotifyMode == define.SdNotifyModeIgnore && len(c.config.SdNotifySocket) > 0 { return fmt.Errorf("cannot set sd-notify socket %q with sd-notify mode %q", 
c.config.SdNotifySocket, c.config.SdNotifyMode) } + + if c.config.HealthCheckOnFailureAction != define.HealthCheckOnFailureActionNone && c.config.HealthCheckConfig == nil { + return fmt.Errorf("cannot set on-failure action to %s without a health check", c.config.HealthCheckOnFailureAction.String()) + } return nil } diff --git a/libpod/define/config.go b/libpod/define/config.go index 34c1a675d..1fad5cc9a 100644 --- a/libpod/define/config.go +++ b/libpod/define/config.go @@ -85,4 +85,4 @@ const PassthroughLogging = "passthrough" const RLimitDefaultValue = uint64(1048576) // BindMountPrefix distinguishes its annotations from others -const BindMountPrefix = "bind-mount-options:" +const BindMountPrefix = "bind-mount-options" diff --git a/libpod/define/container_inspect.go b/libpod/define/container_inspect.go index 5982d684c..da5c58f27 100644 --- a/libpod/define/container_inspect.go +++ b/libpod/define/container_inspect.go @@ -55,6 +55,8 @@ type InspectContainerConfig struct { StopSignal uint `json:"StopSignal"` // Configured healthcheck for the container Healthcheck *manifest.Schema2HealthConfig `json:"Healthcheck,omitempty"` + // HealthcheckOnFailureAction defines an action to take once the container turns unhealthy. + HealthcheckOnFailureAction string `json:"HealthcheckOnFailureAction,omitempty"` // CreateCommand is the full command plus arguments of the process the // container has been created with. 
CreateCommand []string `json:"CreateCommand,omitempty"` diff --git a/libpod/define/errors.go b/libpod/define/errors.go index fd27e89de..be471c27e 100644 --- a/libpod/define/errors.go +++ b/libpod/define/errors.go @@ -179,6 +179,9 @@ var ( // ErrNetworkInUse indicates the requested operation failed because the network was in use ErrNetworkInUse = errors.New("network is being used") + // ErrNetworkConnected indicates that the required operation failed because the container is already a network endpoint + ErrNetworkConnected = errors.New("network is already connected") + // ErrStoreNotInitialized indicates that the container storage was never // initialized. ErrStoreNotInitialized = errors.New("the container storage was never initialized") diff --git a/libpod/define/exec_codes.go b/libpod/define/exec_codes.go index 3f2da4910..a84730e72 100644 --- a/libpod/define/exec_codes.go +++ b/libpod/define/exec_codes.go @@ -11,8 +11,8 @@ const ( // ExecErrorCodeGeneric is the default error code to return from an exec session if libpod failed // prior to calling the runtime ExecErrorCodeGeneric = 125 - // ExecErrorCodeCannotInvoke is the error code to return when the runtime fails to invoke a command - // an example of this can be found by trying to execute a directory: + // ExecErrorCodeCannotInvoke is the error code to return when the runtime fails to invoke a command. 
+ // An example of this can be found by trying to execute a directory: // `podman exec -l /etc` ExecErrorCodeCannotInvoke = 126 // ExecErrorCodeNotFound is the error code to return when a command cannot be found diff --git a/libpod/define/healthchecks.go b/libpod/define/healthchecks.go index f71274350..274e02561 100644 --- a/libpod/define/healthchecks.go +++ b/libpod/define/healthchecks.go @@ -1,5 +1,10 @@ package define +import ( + "fmt" + "strings" +) + const ( // HealthCheckHealthy describes a healthy container HealthCheckHealthy string = "healthy" @@ -57,3 +62,72 @@ const ( // HealthConfigTestCmdShell runs commands with the system's default shell HealthConfigTestCmdShell = "CMD-SHELL" ) + +// HealthCheckOnFailureAction defines how Podman reacts when a container's health +// status turns unhealthy. +type HealthCheckOnFailureAction int + +// Healthcheck on-failure actions. +const ( + // HealthCheckOnFailureActionNonce instructs Podman to not react on an unhealthy status. + HealthCheckOnFailureActionNone = iota // Must be first iota for backwards compatibility + // HealthCheckOnFailureActionInvalid denotes an invalid on-failure policy. + HealthCheckOnFailureActionInvalid = iota + // HealthCheckOnFailureActionNonce instructs Podman to kill the container on an unhealthy status. + HealthCheckOnFailureActionKill = iota + // HealthCheckOnFailureActionNonce instructs Podman to restart the container on an unhealthy status. + HealthCheckOnFailureActionRestart = iota + // HealthCheckOnFailureActionNonce instructs Podman to stop the container on an unhealthy status. + HealthCheckOnFailureActionStop = iota +) + +// String representations for on-failure actions. 
+const ( + strHealthCheckOnFailureActionNone = "none" + strHealthCheckOnFailureActionInvalid = "invalid" + strHealthCheckOnFailureActionKill = "kill" + strHealthCheckOnFailureActionRestart = "restart" + strHealthCheckOnFailureActionStop = "stop" +) + +// SupportedHealthCheckOnFailureActions lists all supported healthcheck restart policies. +var SupportedHealthCheckOnFailureActions = []string{ + strHealthCheckOnFailureActionNone, + strHealthCheckOnFailureActionKill, + strHealthCheckOnFailureActionRestart, + strHealthCheckOnFailureActionStop, +} + +// String returns the string representation of the HealthCheckOnFailureAction. +func (h HealthCheckOnFailureAction) String() string { + switch h { + case HealthCheckOnFailureActionNone: + return strHealthCheckOnFailureActionNone + case HealthCheckOnFailureActionKill: + return strHealthCheckOnFailureActionKill + case HealthCheckOnFailureActionRestart: + return strHealthCheckOnFailureActionRestart + case HealthCheckOnFailureActionStop: + return strHealthCheckOnFailureActionStop + default: + return strHealthCheckOnFailureActionInvalid + } +} + +// ParseHealthCheckOnFailureAction parses the specified string into a HealthCheckOnFailureAction. +// An error is returned for an invalid input. 
+func ParseHealthCheckOnFailureAction(s string) (HealthCheckOnFailureAction, error) { + switch s { + case "", strHealthCheckOnFailureActionNone: + return HealthCheckOnFailureActionNone, nil + case strHealthCheckOnFailureActionKill: + return HealthCheckOnFailureActionKill, nil + case strHealthCheckOnFailureActionRestart: + return HealthCheckOnFailureActionRestart, nil + case strHealthCheckOnFailureActionStop: + return HealthCheckOnFailureActionStop, nil + default: + err := fmt.Errorf("invalid on-failure action %q for health check: supported actions are %s", s, strings.Join(SupportedHealthCheckOnFailureActions, ",")) + return HealthCheckOnFailureActionInvalid, err + } +} diff --git a/libpod/define/mount.go b/libpod/define/mount.go index 1b0d019c8..db444fd83 100644 --- a/libpod/define/mount.go +++ b/libpod/define/mount.go @@ -1,8 +1,6 @@ package define const ( - // TypeBind is the type for mounting host dir - TypeBind = "bind" // TypeVolume is the type for named volumes TypeVolume = "volume" // TypeTmpfs is the type for mounting tmpfs diff --git a/libpod/define/mount_freebsd.go b/libpod/define/mount_freebsd.go new file mode 100644 index 000000000..e080c9ec6 --- /dev/null +++ b/libpod/define/mount_freebsd.go @@ -0,0 +1,8 @@ +//go:build freebsd + +package define + +const ( + // TypeBind is the type for mounting host dir + TypeBind = "nullfs" +) diff --git a/libpod/define/mount_linux.go b/libpod/define/mount_linux.go new file mode 100644 index 000000000..5ef848905 --- /dev/null +++ b/libpod/define/mount_linux.go @@ -0,0 +1,8 @@ +//go:build linux + +package define + +const ( + // TypeBind is the type for mounting host dir + TypeBind = "bind" +) diff --git a/libpod/define/mount_unsupported.go b/libpod/define/mount_unsupported.go new file mode 100644 index 000000000..cb8642fe2 --- /dev/null +++ b/libpod/define/mount_unsupported.go @@ -0,0 +1,8 @@ +//go:build !linux && !freebsd + +package define + +const ( + // TypeBind is the type for mounting host dir + TypeBind = "bind" 
+) diff --git a/libpod/events.go b/libpod/events.go index c9e4c9d26..60142cb60 100644 --- a/libpod/events.go +++ b/libpod/events.go @@ -55,6 +55,12 @@ func (c *Container) newContainerExitedEvent(exitCode int32) { e.Image = c.config.RootfsImageName e.Type = events.Container e.ContainerExitCode = int(exitCode) + + e.Details = events.Details{ + ID: e.ID, + Attributes: c.Labels(), + } + if err := c.runtime.eventer.Write(e); err != nil { logrus.Errorf("Unable to write container exited event: %q", err) } @@ -70,6 +76,12 @@ func (c *Container) newExecDiedEvent(sessionID string, exitCode int) { e.ContainerExitCode = exitCode e.Attributes = make(map[string]string) e.Attributes["execID"] = sessionID + + e.Details = events.Details{ + ID: e.ID, + Attributes: c.Labels(), + } + if err := c.runtime.eventer.Write(e); err != nil { logrus.Errorf("Unable to write exec died event: %q", err) } diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 9b9d12b17..e835af9f0 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -2,6 +2,7 @@ package libpod import ( "bufio" + "context" "errors" "fmt" "io/ioutil" @@ -12,6 +13,7 @@ import ( "github.com/containers/podman/v4/libpod/define" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) const ( @@ -29,9 +31,14 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { if err != nil { return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err) } + hcStatus, err := checkHealthCheckCanBeRun(container) if err == nil { - return container.runHealthCheck() + hcStatus, err := container.runHealthCheck() + if err := container.processHealthCheckStatus(hcStatus); err != nil { + return hcStatus, err + } + return hcStatus, err } return hcStatus, err } @@ -127,13 +134,45 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) { hcResult = define.HealthCheckFailure hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", 
c.HealthCheckConfig().Timeout.String()) } + hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog) if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil { return hcResult, fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err) } + return hcResult, hcErr } +func (c *Container) processHealthCheckStatus(status define.HealthCheckStatus) error { + if status == define.HealthCheckSuccess { + return nil + } + + switch c.config.HealthCheckOnFailureAction { + case define.HealthCheckOnFailureActionNone: // Nothing to do + + case define.HealthCheckOnFailureActionKill: + if err := c.Kill(uint(unix.SIGKILL)); err != nil { + return fmt.Errorf("killing container health-check turned unhealthy: %w", err) + } + + case define.HealthCheckOnFailureActionRestart: + if err := c.RestartWithTimeout(context.Background(), c.config.StopTimeout); err != nil { + return fmt.Errorf("restarting container after health-check turned unhealthy: %w", err) + } + + case define.HealthCheckOnFailureActionStop: + if err := c.Stop(); err != nil { + return fmt.Errorf("stopping container after health-check turned unhealthy: %w", err) + } + + default: // Should not happen but better be safe than sorry + return fmt.Errorf("unsupported on-failure action %d", c.config.HealthCheckOnFailureAction) + } + + return nil +} + func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) { cstate, err := c.State() if err != nil { diff --git a/libpod/kube.go b/libpod/kube.go index 8c09a6bb5..1f4831006 100644 --- a/libpod/kube.go +++ b/libpod/kube.go @@ -62,6 +62,7 @@ func (p *Pod) GenerateForKube(ctx context.Context) (*v1.Pod, []v1.ServicePort, e extraHost := make([]v1.HostAlias, 0) hostNetwork := false + hostUsers := true if p.HasInfraContainer() { infraContainer, err := p.getInfraContainer() if err != nil { @@ -87,8 +88,9 @@ func (p *Pod) GenerateForKube(ctx context.Context) (*v1.Pod, []v1.ServicePort, e return nil, servicePorts, err } 
hostNetwork = infraContainer.NetworkMode() == string(namespaces.NetworkMode(specgen.Host)) + hostUsers = infraContainer.IDMappings().HostUIDMapping && infraContainer.IDMappings().HostGIDMapping } - pod, err := p.podWithContainers(ctx, allContainers, ports, hostNetwork) + pod, err := p.podWithContainers(ctx, allContainers, ports, hostNetwork, hostUsers) if err != nil { return nil, servicePorts, err } @@ -267,6 +269,8 @@ func GenerateKubeServiceFromV1Pod(pod *v1.Pod, servicePorts []v1.ServicePort) (Y } service.Spec = serviceSpec service.ObjectMeta = pod.ObjectMeta + // Reset the annotations for the service as the pod annotations are not needed for the service + service.ObjectMeta.Annotations = nil tm := v12.TypeMeta{ Kind: "Service", APIVersion: pod.TypeMeta.APIVersion, @@ -346,7 +350,7 @@ func containersToServicePorts(containers []v1.Container) ([]v1.ServicePort, erro return sps, nil } -func (p *Pod) podWithContainers(ctx context.Context, containers []*Container, ports []v1.ContainerPort, hostNetwork bool) (*v1.Pod, error) { +func (p *Pod) podWithContainers(ctx context.Context, containers []*Container, ports []v1.ContainerPort, hostNetwork, hostUsers bool) (*v1.Pod, error) { deDupPodVolumes := make(map[string]*v1.Volume) first := true podContainers := make([]v1.Container, 0, len(containers)) @@ -383,7 +387,7 @@ func (p *Pod) podWithContainers(ctx context.Context, containers []*Container, po return nil, err } for k, v := range annotations { - podAnnotations[define.BindMountPrefix+k] = TruncateKubeAnnotation(v) + podAnnotations[define.BindMountPrefix] = TruncateKubeAnnotation(k + ":" + v) } // Since port bindings for the pod are handled by the // infra container, wipe them here only if we are sharing the net namespace @@ -444,10 +448,11 @@ func (p *Pod) podWithContainers(ctx context.Context, containers []*Container, po podVolumes, &dnsInfo, hostNetwork, + hostUsers, hostname), nil } -func newPodObject(podName string, annotations map[string]string, initCtrs, containers 
[]v1.Container, volumes []v1.Volume, dnsOptions *v1.PodDNSConfig, hostNetwork bool, hostname string) *v1.Pod { +func newPodObject(podName string, annotations map[string]string, initCtrs, containers []v1.Container, volumes []v1.Volume, dnsOptions *v1.PodDNSConfig, hostNetwork, hostUsers bool, hostname string) *v1.Pod { tm := v12.TypeMeta{ Kind: "Pod", APIVersion: "v1", @@ -466,12 +471,21 @@ func newPodObject(podName string, annotations map[string]string, initCtrs, conta CreationTimestamp: v12.Now(), Annotations: annotations, } + // Set enableServiceLinks to false as podman doesn't use the service port environment variables + enableServiceLinks := false + // Set automountServiceAccountToken to false as podman doesn't use service account tokens + automountServiceAccountToken := false ps := v1.PodSpec{ - Containers: containers, - Hostname: hostname, - HostNetwork: hostNetwork, - InitContainers: initCtrs, - Volumes: volumes, + Containers: containers, + Hostname: hostname, + HostNetwork: hostNetwork, + InitContainers: initCtrs, + Volumes: volumes, + EnableServiceLinks: &enableServiceLinks, + AutomountServiceAccountToken: &automountServiceAccountToken, + } + if !hostUsers { + ps.HostUsers = &hostUsers } if dnsOptions != nil && (len(dnsOptions.Nameservers)+len(dnsOptions.Searches)+len(dnsOptions.Options) > 0) { ps.DNSConfig = dnsOptions @@ -490,6 +504,7 @@ func simplePodWithV1Containers(ctx context.Context, ctrs []*Container) (*v1.Pod, kubeCtrs := make([]v1.Container, 0, len(ctrs)) kubeInitCtrs := []v1.Container{} kubeVolumes := make([]v1.Volume, 0) + hostUsers := true hostNetwork := true podDNS := v1.PodDNSConfig{} kubeAnnotations := make(map[string]string) @@ -519,12 +534,15 @@ func simplePodWithV1Containers(ctx context.Context, ctrs []*Container) (*v1.Pod, if !ctr.HostNetwork() { hostNetwork = false } + if !(ctr.IDMappings().HostUIDMapping && ctr.IDMappings().HostGIDMapping) { + hostUsers = false + } kubeCtr, kubeVols, ctrDNS, annotations, err := 
containerToV1Container(ctx, ctr) if err != nil { return nil, err } for k, v := range annotations { - kubeAnnotations[define.BindMountPrefix+k] = TruncateKubeAnnotation(v) + kubeAnnotations[define.BindMountPrefix] = TruncateKubeAnnotation(k + ":" + v) } if isInit { kubeInitCtrs = append(kubeInitCtrs, kubeCtr) @@ -580,6 +598,7 @@ func simplePodWithV1Containers(ctx context.Context, ctrs []*Container) (*v1.Pod, kubeVolumes, &podDNS, hostNetwork, + hostUsers, hostname), nil } diff --git a/libpod/networking_linux.go b/libpod/networking_linux.go index c05796768..c10c3c0b2 100644 --- a/libpod/networking_linux.go +++ b/libpod/networking_linux.go @@ -1357,6 +1357,11 @@ func (c *Container) NetworkConnect(nameOrID, netName string, netOpts types.PerNe } if err := c.runtime.state.NetworkConnect(c, netName, netOpts); err != nil { + // Docker compat: treat requests to attach already attached networks as a no-op, ignoring opts + if errors.Is(err, define.ErrNetworkConnected) && c.ensureState(define.ContainerStateConfigured) { + return nil + } + return err } c.newNetworkEvent(events.NetworkConnect, netName) diff --git a/libpod/networking_unsupported.go b/libpod/networking_unsupported.go index 76ffabb5e..9429287f9 100644 --- a/libpod/networking_unsupported.go +++ b/libpod/networking_unsupported.go @@ -5,6 +5,7 @@ package libpod import ( "errors" + "net" "path/filepath" "github.com/containers/common/libnetwork/types" @@ -84,3 +85,7 @@ func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { func (c *Container) convertPortMappings() []types.PortMapping { return []types.PortMapping{} } + +func GetSlirp4netnsIP(subnet *net.IPNet) (*net.IP, error) { + return nil, errors.New("not implemented GetSlirp4netnsIP") +} diff --git a/libpod/oci.go b/libpod/oci.go index 70053db1b..e5b9a0dcd 100644 --- a/libpod/oci.go +++ b/libpod/oci.go @@ -5,6 +5,7 @@ import ( "github.com/containers/common/pkg/resize" "github.com/containers/podman/v4/libpod/define" + 
"github.com/opencontainers/runtime-spec/specs-go" ) // OCIRuntime is an implementation of an OCI runtime. @@ -148,6 +149,9 @@ type OCIRuntime interface { // RuntimeInfo returns verbose information about the runtime. RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) + + // UpdateContainer updates the given container's cgroup configuration. + UpdateContainer(ctr *Container, res *specs.LinuxResources) error } // AttachOptions are options used when attached to a container or an exec diff --git a/libpod/oci_conmon_attach_common.go b/libpod/oci_conmon_attach_common.go index a9e9b2bb5..dec749837 100644 --- a/libpod/oci_conmon_attach_common.go +++ b/libpod/oci_conmon_attach_common.go @@ -280,20 +280,20 @@ func readStdio(conn *net.UnixConn, streams *define.AttachStreams, receiveStdoutE var err error select { case err = <-receiveStdoutError: - if err := conn.CloseWrite(); err != nil { + if err := socketCloseWrite(conn); err != nil { logrus.Errorf("Failed to close stdin: %v", err) } return err case err = <-stdinDone: if err == define.ErrDetach { - if err := conn.CloseWrite(); err != nil { + if err := socketCloseWrite(conn); err != nil { logrus.Errorf("Failed to close stdin: %v", err) } return err } if err == nil { // copy stdin is done, close it - if connErr := conn.CloseWrite(); connErr != nil { + if connErr := socketCloseWrite(conn); connErr != nil { logrus.Errorf("Unable to close conn: %v", connErr) } } diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 2c7c39726..8ef8ae721 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -277,15 +277,6 @@ func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { ctr.ID(), state.Status, define.ErrInternal) } - // Only grab exit status if we were not already stopped - // If we were, it should already be in the database - if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { - if _, err := 
ctr.Wait(context.Background()); err != nil { - logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err) - } - return nil - } - // Handle ContainerStateStopping - keep it unless the container // transitioned to no longer running. if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) { @@ -316,6 +307,52 @@ func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { return nil } +// UpdateContainer updates the given container's cgroup configuration +func (r *ConmonOCIRuntime) UpdateContainer(ctr *Container, resources *spec.LinuxResources) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + if path, ok := os.LookupEnv("PATH"); ok { + env = append(env, fmt.Sprintf("PATH=%s", path)) + } + args := r.runtimeFlags + args = append(args, "update") + tempFile, additionalArgs, err := generateResourceFile(resources) + if err != nil { + return err + } + defer os.Remove(tempFile) + + args = append(args, additionalArgs...) + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(args, ctr.ID())...) +} + +func generateResourceFile(res *spec.LinuxResources) (string, []string, error) { + flags := []string{} + if res == nil { + return "", flags, nil + } + + f, err := ioutil.TempFile("", "podman") + if err != nil { + return "", nil, err + } + + j, err := json.Marshal(res) + if err != nil { + return "", nil, err + } + _, err = f.WriteString(string(j)) + if err != nil { + return "", nil, err + } + + flags = append(flags, "--resources="+f.Name()) + return f.Name(), flags, nil +} + // KillContainer sends the given signal to the given container. // If all is set, send to all PIDs in the container. // All is only supported if the container created cgroups. 
@@ -392,13 +429,11 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) } } - if err := r.KillContainer(ctr, 9, all); err != nil { + if err := r.KillContainer(ctr, uint(unix.SIGKILL), all); err != nil { // Again, check if the container is gone. If it is, exit cleanly. - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { + if aliveErr := unix.Kill(ctr.state.PID, 0); errors.Is(aliveErr, unix.ESRCH) { return nil } - return fmt.Errorf("error sending SIGKILL to container %s: %w", ctr.ID(), err) } @@ -440,6 +475,16 @@ func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error { return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "resume", ctr.ID())...) } +// This filters out ENOTCONN errors which can happen on FreeBSD if the +// other side of the connection is already closed. +func socketCloseWrite(conn *net.UnixConn) error { + err := conn.CloseWrite() + if err != nil && errors.Is(err, syscall.ENOTCONN) { + return nil + } + return err +} + // HTTPAttach performs an attach for the HTTP API. // The caller must handle closing the HTTP connection after this returns. // The cancel channel is not closed; it is up to the caller to do so after @@ -652,7 +697,7 @@ func (r *ConmonOCIRuntime) HTTPAttach(ctr *Container, req *http.Request, w http. 
return err } // copy stdin is done, close it - if connErr := conn.CloseWrite(); connErr != nil { + if connErr := socketCloseWrite(conn); connErr != nil { logrus.Errorf("Unable to close conn: %v", connErr) } case <-cancel: diff --git a/libpod/oci_conmon_exec_common.go b/libpod/oci_conmon_exec_common.go index 16cd7ef9f..735dbb9c4 100644 --- a/libpod/oci_conmon_exec_common.go +++ b/libpod/oci_conmon_exec_common.go @@ -12,7 +12,6 @@ import ( "syscall" "time" - "github.com/containers/common/pkg/capabilities" "github.com/containers/common/pkg/config" "github.com/containers/common/pkg/resize" cutil "github.com/containers/common/pkg/util" @@ -386,7 +385,7 @@ func (r *ConmonOCIRuntime) startExec(c *Container, sessionID string, options *Ex finalEnv = append(finalEnv, fmt.Sprintf("%s=%s", k, v)) } - processFile, err := prepareProcessExec(c, options, finalEnv, sessionID) + processFile, err := c.prepareProcessExec(options, finalEnv, sessionID) if err != nil { return nil, nil, err } @@ -654,7 +653,7 @@ func attachExecHTTP(c *Container, sessionID string, r *http.Request, w http.Resp return err } // copy stdin is done, close it - if connErr := conn.CloseWrite(); connErr != nil { + if connErr := socketCloseWrite(conn); connErr != nil { logrus.Errorf("Unable to close conn: %v", connErr) } case <-cancel: @@ -665,7 +664,7 @@ func attachExecHTTP(c *Container, sessionID string, r *http.Request, w http.Resp // prepareProcessExec returns the path of the process.json used in runc exec -p // caller is responsible to close the returned *os.File if needed. 
-func prepareProcessExec(c *Container, options *ExecOptions, env []string, sessionID string) (*os.File, error) { +func (c *Container) prepareProcessExec(options *ExecOptions, env []string, sessionID string) (*os.File, error) { f, err := ioutil.TempFile(c.execBundlePath(sessionID), "exec-process-") if err != nil { return nil, err @@ -745,34 +744,9 @@ func prepareProcessExec(c *Container, options *ExecOptions, env []string, sessio pspec.User = processUser } - ctrSpec, err := c.specFromState() - if err != nil { - return nil, err - } - - allCaps, err := capabilities.BoundingSet() - if err != nil { + if err := c.setProcessCapabilitiesExec(options, user, execUser, pspec); err != nil { return nil, err } - if options.Privileged { - pspec.Capabilities.Bounding = allCaps - } else { - pspec.Capabilities.Bounding = ctrSpec.Process.Capabilities.Bounding - } - - // Always unset the inheritable capabilities similarly to what the Linux kernel does - // They are used only when using capabilities with uid != 0. 
- pspec.Capabilities.Inheritable = []string{} - - if execUser.Uid == 0 { - pspec.Capabilities.Effective = pspec.Capabilities.Bounding - pspec.Capabilities.Permitted = pspec.Capabilities.Bounding - } else if user == c.config.User { - pspec.Capabilities.Effective = ctrSpec.Process.Capabilities.Effective - pspec.Capabilities.Inheritable = ctrSpec.Process.Capabilities.Effective - pspec.Capabilities.Permitted = ctrSpec.Process.Capabilities.Effective - pspec.Capabilities.Ambient = ctrSpec.Process.Capabilities.Effective - } hasHomeSet := false for _, s := range pspec.Env { diff --git a/libpod/oci_conmon_exec_freebsd.go b/libpod/oci_conmon_exec_freebsd.go new file mode 100644 index 000000000..bf30404a1 --- /dev/null +++ b/libpod/oci_conmon_exec_freebsd.go @@ -0,0 +1,10 @@ +package libpod + +import ( + "github.com/opencontainers/runc/libcontainer/user" + spec "github.com/opencontainers/runtime-spec/specs-go" +) + +func (c *Container) setProcessCapabilitiesExec(options *ExecOptions, user string, execUser *user.ExecUser, pspec *spec.Process) error { + return nil +} diff --git a/libpod/oci_conmon_exec_linux.go b/libpod/oci_conmon_exec_linux.go new file mode 100644 index 000000000..617e8d601 --- /dev/null +++ b/libpod/oci_conmon_exec_linux.go @@ -0,0 +1,39 @@ +package libpod + +import ( + "github.com/containers/common/pkg/capabilities" + "github.com/opencontainers/runc/libcontainer/user" + spec "github.com/opencontainers/runtime-spec/specs-go" +) + +func (c *Container) setProcessCapabilitiesExec(options *ExecOptions, user string, execUser *user.ExecUser, pspec *spec.Process) error { + ctrSpec, err := c.specFromState() + if err != nil { + return err + } + + allCaps, err := capabilities.BoundingSet() + if err != nil { + return err + } + if options.Privileged { + pspec.Capabilities.Bounding = allCaps + } else { + pspec.Capabilities.Bounding = ctrSpec.Process.Capabilities.Bounding + } + + // Always unset the inheritable capabilities similarly to what the Linux kernel does + // They 
are used only when using capabilities with uid != 0. + pspec.Capabilities.Inheritable = []string{} + + if execUser.Uid == 0 { + pspec.Capabilities.Effective = pspec.Capabilities.Bounding + pspec.Capabilities.Permitted = pspec.Capabilities.Bounding + } else if user == c.config.User { + pspec.Capabilities.Effective = ctrSpec.Process.Capabilities.Effective + pspec.Capabilities.Inheritable = ctrSpec.Process.Capabilities.Effective + pspec.Capabilities.Permitted = ctrSpec.Process.Capabilities.Effective + pspec.Capabilities.Ambient = ctrSpec.Process.Capabilities.Effective + } + return nil +} diff --git a/libpod/oci_conmon_freebsd.go b/libpod/oci_conmon_freebsd.go index 6f7ac7fc6..d74f2af01 100644 --- a/libpod/oci_conmon_freebsd.go +++ b/libpod/oci_conmon_freebsd.go @@ -19,6 +19,9 @@ func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func // moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup // it then signals for conmon to start by sending nonce data down the start fd func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { - // No equivalent on FreeBSD + // No equivalent to cgroup on FreeBSD, just signal conmon to start + if err := writeConmonPipeData(startFd); err != nil { + return err + } return nil } diff --git a/libpod/oci_missing.go b/libpod/oci_missing.go index 2ab2b4577..bbf2957ff 100644 --- a/libpod/oci_missing.go +++ b/libpod/oci_missing.go @@ -8,6 +8,7 @@ import ( "github.com/containers/common/pkg/resize" "github.com/containers/podman/v4/libpod/define" + spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" ) @@ -80,6 +81,11 @@ func (r *MissingRuntime) StartContainer(ctr *Container) error { return r.printError() } +// UpdateContainer is not available as the runtime is missing +func (r *MissingRuntime) UpdateContainer(ctr *Container, resources *spec.LinuxResources) error { + return r.printError() +} + 
// KillContainer is not available as the runtime is missing // TODO: We could attempt to unix.Kill() the PID as recorded in the state if we // really want to smooth things out? Won't be perfect, but if the container has diff --git a/libpod/options.go b/libpod/options.go index d31741094..71ad3d11e 100644 --- a/libpod/options.go +++ b/libpod/options.go @@ -1413,9 +1413,10 @@ func WithNamedVolumes(volumes []*ContainerNamedVolume) CtrCreateOption { } ctr.config.NamedVolumes = append(ctr.config.NamedVolumes, &ContainerNamedVolume{ - Name: vol.Name, - Dest: vol.Dest, - Options: mountOpts, + Name: vol.Name, + Dest: vol.Dest, + Options: mountOpts, + IsAnonymous: vol.IsAnonymous, }) } @@ -1472,6 +1473,17 @@ func WithHealthCheck(healthCheck *manifest.Schema2HealthConfig) CtrCreateOption } } +// WithHealthCheckOnFailureAction adds an on-failure action to health-check config +func WithHealthCheckOnFailureAction(action define.HealthCheckOnFailureAction) CtrCreateOption { + return func(ctr *Container) error { + if ctr.valid { + return define.ErrCtrFinalized + } + ctr.config.HealthCheckOnFailureAction = action + return nil + } +} + // WithPreserveFDs forwards from the process running Libpod into the container // the given number of extra FDs (starting after the standard streams) to the created container func WithPreserveFDs(fd uint) CtrCreateOption { diff --git a/libpod/runtime.go b/libpod/runtime.go index 9b97fd724..1503b2344 100644 --- a/libpod/runtime.go +++ b/libpod/runtime.go @@ -5,6 +5,7 @@ import ( "context" "errors" "fmt" + "math/rand" "os" "path/filepath" "strings" @@ -112,6 +113,13 @@ type Runtime struct { secretsManager *secrets.SecretsManager } +func init() { + // generateName calls namesgenerator.GetRandomName which uses the + // global RNG from math/rand. Seed it here to make sure we + // don't get the same name every time. + rand.Seed(time.Now().UnixNano()) +} + // SetXdgDirs ensures the XDG_RUNTIME_DIR env and XDG_CONFIG_HOME variables are set. 
// containers/image uses XDG_RUNTIME_DIR to locate the auth file, XDG_CONFIG_HOME is // use for the containers.conf configuration file. diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index 703ae5cbe..fb4f80aa6 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -171,12 +171,17 @@ func (r *Runtime) initContainerVariables(rSpec *spec.Spec, config *ContainerConf if config == nil { ctr.config.ID = stringid.GenerateNonCryptoID() size, err := units.FromHumanSize(r.config.Containers.ShmSize) - if err != nil { - return nil, fmt.Errorf("converting containers.conf ShmSize %s to an int: %w", r.config.Containers.ShmSize, err) + if useDevShm { + if err != nil { + return nil, fmt.Errorf("converting containers.conf ShmSize %s to an int: %w", r.config.Containers.ShmSize, err) + } + ctr.config.ShmSize = size + ctr.config.NoShm = false + ctr.config.NoShmShare = false + } else { + ctr.config.NoShm = true + ctr.config.NoShmShare = true } - ctr.config.ShmSize = size - ctr.config.NoShm = false - ctr.config.NoShmShare = false ctr.config.StopSignal = 15 ctr.config.StopTimeout = r.config.Engine.StopTimeout @@ -474,6 +479,11 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai return nil, fmt.Errorf("error retrieving named volume %s for new container: %w", vol.Name, err) } } + if vol.IsAnonymous { + // If IsAnonymous is true, make this an anonymous volume + // this is needed for emptyDir volumes from kube yamls + isAnonymous = true + } logrus.Debugf("Creating new volume %s for container", vol.Name) @@ -523,7 +533,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai } } - if !MountExists(ctr.config.Spec.Mounts, "/dev/shm") && ctr.config.ShmDir == "" && !ctr.config.NoShm { + if useDevShm && !MountExists(ctr.config.Spec.Mounts, "/dev/shm") && ctr.config.ShmDir == "" && !ctr.config.NoShm { ctr.config.ShmDir = filepath.Join(ctr.bundlePath(), "shm") if err := os.MkdirAll(ctr.config.ShmDir, 0700); err != nil 
{ if !os.IsExist(err) { @@ -788,7 +798,7 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo // Deallocate the container's lock if err := c.lock.Free(); err != nil { - if cleanupErr == nil { + if cleanupErr == nil && !os.IsNotExist(err) { cleanupErr = fmt.Errorf("error freeing lock for container %s: %w", c.ID(), err) } else { logrus.Errorf("Free container lock: %v", err) @@ -814,11 +824,11 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo // Ignore error, since podman will report original error volumesFrom, _ := c.volumesFrom() if len(volumesFrom) > 0 { - logrus.Debugf("Cleaning up volume not possible since volume is in use (%s)", v) + logrus.Debugf("Cleaning up volume not possible since volume is in use (%s)", v.Name) continue } } - logrus.Errorf("Cleaning up volume (%s): %v", v, err) + logrus.Errorf("Cleaning up volume (%s): %v", v.Name, err) } } } @@ -968,7 +978,7 @@ func (r *Runtime) evictContainer(ctx context.Context, idOrName string, removeVol continue } if err := r.removeVolume(ctx, volume, false, timeout, false); err != nil && err != define.ErrNoSuchVolume && err != define.ErrVolumeBeingUsed { - logrus.Errorf("Cleaning up volume (%s): %v", v, err) + logrus.Errorf("Cleaning up volume (%s): %v", v.Name, err) } } } diff --git a/libpod/runtime_ctr_freebsd.go b/libpod/runtime_ctr_freebsd.go new file mode 100644 index 000000000..a8870a38c --- /dev/null +++ b/libpod/runtime_ctr_freebsd.go @@ -0,0 +1,5 @@ +package libpod + +const ( + useDevShm = false +) diff --git a/libpod/runtime_ctr_linux.go b/libpod/runtime_ctr_linux.go new file mode 100644 index 000000000..7812d8238 --- /dev/null +++ b/libpod/runtime_ctr_linux.go @@ -0,0 +1,5 @@ +package libpod + +const ( + useDevShm = true +) diff --git a/libpod/runtime_test.go b/libpod/runtime_test.go new file mode 100644 index 000000000..2e16c7fcd --- /dev/null +++ b/libpod/runtime_test.go @@ -0,0 +1,28 @@ +package libpod + +import ( + "math/rand" + "os" + 
"testing" + + "github.com/stretchr/testify/assert" +) + +func Test_generateName(t *testing.T) { + state, path, _, err := getEmptyBoltState() + assert.NoError(t, err) + defer os.RemoveAll(path) + defer state.Close() + + r := &Runtime{ + state: state, + } + + // Test that (*Runtime).generateName returns different names + // if called twice, even if the global RNG has the default + // seed. + n1, _ := r.generateName() + rand.Seed(1) + n2, _ := r.generateName() + assert.NotEqual(t, n1, n2) +} |