//go:build linux || freebsd // +build linux freebsd package libpod import ( "context" "errors" "fmt" "io" "io/ioutil" "math" "os" "path" "path/filepath" "strconv" "strings" "time" metadata "github.com/checkpoint-restore/checkpointctl/lib" "github.com/checkpoint-restore/go-criu/v5/stats" cdi "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/containers/buildah" "github.com/containers/buildah/pkg/chrootuser" "github.com/containers/buildah/pkg/overlay" butil "github.com/containers/buildah/util" "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/apparmor" cutil "github.com/containers/common/pkg/util" is "github.com/containers/image/v5/storage" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/annotations" "github.com/containers/podman/v4/pkg/checkpoint/crutils" "github.com/containers/podman/v4/pkg/criu" "github.com/containers/podman/v4/pkg/lookup" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/util" "github.com/containers/podman/v4/version" "github.com/containers/storage/pkg/archive" "github.com/containers/storage/pkg/idtools" securejoin "github.com/cyphar/filepath-securejoin" runcuser "github.com/opencontainers/runc/libcontainer/user" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-tools/generate" "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" ) // Internal only function which returns upper and work dir from // overlay options. func getOverlayUpperAndWorkDir(options []string) (string, string, error) { upperDir := "" workDir := "" for _, o := range options { if strings.HasPrefix(o, "upperdir") { splitOpt := strings.SplitN(o, "=", 2) if len(splitOpt) > 1 { upperDir = splitOpt[1] if upperDir == "" { return "", "", errors.New("cannot accept empty value for upperdir") } } } if strings.HasPrefix(o, "workdir") { splitOpt := strings.SplitN(o, "=", 2) if len(splitOpt) > 1 { workDir = splitOpt[1] if workDir == "" { return "", "", errors.New("cannot accept empty value for workdir") } } } } if (upperDir != "" && workDir == "") || (upperDir == "" && workDir != "") { return "", "", errors.New("must specify both upperdir and workdir") } return upperDir, workDir, nil } // Generate spec for a container // Accepts a map of the container's dependencies func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { overrides := c.getUserOverrides() execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, c.config.User, overrides) if err != nil { if cutil.StringInSlice(c.config.User, c.config.HostUsers) { execUser, err = lookupHostUser(c.config.User) } if err != nil { return nil, err } } // NewFromSpec() is deprecated according to its comment // however the recommended replace just causes a nil map panic //nolint:staticcheck g := generate.NewFromSpec(c.config.Spec) // If the flag to mount all devices is set for a privileged container, add // all the devices from the host's machine into the container if c.config.MountAllDevices { if err := util.AddPrivilegedDevices(&g); err != nil { return nil, err } } // If network namespace was requested, add it now if err := c.addNetworkNamespace(&g); err != nil { return nil, err } // Apply AppArmor checks and load the default profile if needed. if len(c.config.Spec.Process.ApparmorProfile) > 0 { updatedProfile, err := apparmor.CheckProfileAndLoadDefault(c.config.Spec.Process.ApparmorProfile) if err != nil { return nil, err } g.SetProcessApparmorProfile(updatedProfile) } if err := c.makeBindMounts(); err != nil { return nil, err } if err := c.mountNotifySocket(g); err != nil { return nil, err } // Get host UID and GID based on the container process UID and GID. hostUID, hostGID, err := butil.GetHostIDs(util.IDtoolsToRuntimeSpec(c.config.IDMappings.UIDMap), util.IDtoolsToRuntimeSpec(c.config.IDMappings.GIDMap), uint32(execUser.Uid), uint32(execUser.Gid)) if err != nil { return nil, err } // Add named volumes for _, namedVol := range c.config.NamedVolumes { volume, err := c.runtime.GetVolume(namedVol.Name) if err != nil { return nil, fmt.Errorf("error retrieving volume %s to add to container %s: %w", namedVol.Name, c.ID(), err) } mountPoint, err := volume.MountPoint() if err != nil { return nil, err } overlayFlag := false upperDir := "" workDir := "" for _, o := range namedVol.Options { if o == "O" { overlayFlag = true upperDir, workDir, err = getOverlayUpperAndWorkDir(namedVol.Options) if err != nil { return nil, err } } } if overlayFlag { var overlayMount spec.Mount var overlayOpts *overlay.Options contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) if err != nil { return nil, err } overlayOpts = &overlay.Options{RootUID: c.RootUID(), RootGID: c.RootGID(), UpperDirOptionFragment: upperDir, WorkDirOptionFragment: workDir, GraphOpts: c.runtime.store.GraphOptions(), } overlayMount, err = overlay.MountWithOptions(contentDir, mountPoint, namedVol.Dest, overlayOpts) if err != nil { return nil, fmt.Errorf("mounting overlay failed %q: %w", mountPoint, err) } for _, o := range namedVol.Options { if o == "U" { if err := c.ChangeHostPathOwnership(mountPoint, true, int(hostUID), int(hostGID)); err != nil { return nil, err } if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { return nil, err } } } g.AddMount(overlayMount) } else { volMount := spec.Mount{ Type: define.TypeBind, Source: mountPoint, Destination: namedVol.Dest, Options: namedVol.Options, } g.AddMount(volMount) } } // Check if the spec file mounts contain the options z, Z or U. // If they have z or Z, relabel the source directory and then remove the option. // If they have U, chown the source directory and them remove the option. for i := range g.Config.Mounts { m := &g.Config.Mounts[i] var options []string for _, o := range m.Options { switch o { case "U": if m.Type == "tmpfs" { options = append(options, []string{fmt.Sprintf("uid=%d", execUser.Uid), fmt.Sprintf("gid=%d", execUser.Gid)}...) } else { // only chown on initial creation of container if err := c.ChangeHostPathOwnership(m.Source, true, int(hostUID), int(hostGID)); err != nil { return nil, err } } case "z": fallthrough case "Z": if err := c.relabel(m.Source, c.MountLabel(), label.IsShared(o)); err != nil { return nil, err } default: options = append(options, o) } } m.Options = options } c.setProcessLabel(&g) c.setMountLabel(&g) // Add bind mounts to container for dstPath, srcPath := range c.state.BindMounts { newMount := spec.Mount{ Type: define.TypeBind, Source: srcPath, Destination: dstPath, Options: bindOptions, } if c.IsReadOnly() && dstPath != "/dev/shm" { newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") } if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") } if !MountExists(g.Mounts(), dstPath) { g.AddMount(newMount) } else { logrus.Infof("User mount overriding libpod mount at %q", dstPath) } } // Add overlay volumes for _, overlayVol := range c.config.OverlayVolumes { upperDir, workDir, err := getOverlayUpperAndWorkDir(overlayVol.Options) if err != nil { return nil, err } contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) if err != nil { return nil, err } overlayOpts := &overlay.Options{RootUID: c.RootUID(), RootGID: c.RootGID(), UpperDirOptionFragment: upperDir, WorkDirOptionFragment: workDir, GraphOpts: c.runtime.store.GraphOptions(), } overlayMount, err := overlay.MountWithOptions(contentDir, overlayVol.Source, overlayVol.Dest, overlayOpts) if err != nil { return nil, fmt.Errorf("mounting overlay failed %q: %w", overlayVol.Source, err) } // Check overlay volume options for _, o := range overlayVol.Options { if o == "U" { if err := c.ChangeHostPathOwnership(overlayVol.Source, true, int(hostUID), int(hostGID)); err != nil { return nil, err } if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { return nil, err } } } g.AddMount(overlayMount) } // Add image volumes as overlay mounts for _, volume := range c.config.ImageVolumes { // Mount the specified image. img, _, err := c.runtime.LibimageRuntime().LookupImage(volume.Source, nil) if err != nil { return nil, fmt.Errorf("error creating image volume %q:%q: %w", volume.Source, volume.Dest, err) } mountPoint, err := img.Mount(ctx, nil, "") if err != nil { return nil, fmt.Errorf("error mounting image volume %q:%q: %w", volume.Source, volume.Dest, err) } contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) if err != nil { return nil, fmt.Errorf("failed to create TempDir in the %s directory: %w", c.config.StaticDir, err) } var overlayMount spec.Mount if volume.ReadWrite { overlayMount, err = overlay.Mount(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) } else { overlayMount, err = overlay.MountReadOnly(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) } if err != nil { return nil, fmt.Errorf("creating overlay mount for image %q failed: %w", volume.Source, err) } g.AddMount(overlayMount) } hasHomeSet := false for _, s := range c.config.Spec.Process.Env { if strings.HasPrefix(s, "HOME=") { hasHomeSet = true break } } if !hasHomeSet && execUser.Home != "" { c.config.Spec.Process.Env = append(c.config.Spec.Process.Env, fmt.Sprintf("HOME=%s", execUser.Home)) } if c.config.User != "" { // User and Group must go together g.SetProcessUID(uint32(execUser.Uid)) g.SetProcessGID(uint32(execUser.Gid)) g.AddProcessAdditionalGid(uint32(execUser.Gid)) } if c.config.Umask != "" { decVal, err := strconv.ParseUint(c.config.Umask, 8, 32) if err != nil { return nil, fmt.Errorf("invalid Umask Value: %w", err) } umask := uint32(decVal) g.Config.Process.User.Umask = &umask } // Add addition groups if c.config.GroupAdd is not empty if len(c.config.Groups) > 0 { gids, err := lookup.GetContainerGroups(c.config.Groups, c.state.Mountpoint, overrides) if err != nil { return nil, fmt.Errorf("error looking up supplemental groups for container %s: %w", c.ID(), err) } for _, gid := range gids { g.AddProcessAdditionalGid(gid) } } if err := c.addSystemdMounts(&g); err != nil { return nil, err } // Look up and add groups the user belongs to, if a group wasn't directly specified if !strings.Contains(c.config.User, ":") { // the gidMappings that are present inside the container user namespace var gidMappings []idtools.IDMap switch { case len(c.config.IDMappings.GIDMap) > 0: gidMappings = c.config.IDMappings.GIDMap case rootless.IsRootless(): // Check whether the current user namespace has enough gids available. availableGids, err := rootless.GetAvailableGids() if err != nil { return nil, fmt.Errorf("cannot read number of available GIDs: %w", err) } gidMappings = []idtools.IDMap{{ ContainerID: 0, HostID: 0, Size: int(availableGids), }} default: gidMappings = []idtools.IDMap{{ ContainerID: 0, HostID: 0, Size: math.MaxInt32, }} } for _, gid := range execUser.Sgids { isGIDAvailable := false for _, m := range gidMappings { if gid >= m.ContainerID && gid < m.ContainerID+m.Size { isGIDAvailable = true break } } if isGIDAvailable { g.AddProcessAdditionalGid(uint32(gid)) } else { logrus.Warnf("Additional gid=%d is not present in the user namespace, skip setting it", gid) } } } // Add shared namespaces from other containers if err := c.addSharedNamespaces(&g); err != nil { return nil, err } g.SetRootPath(c.state.Mountpoint) g.AddAnnotation(annotations.Created, c.config.CreatedTime.Format(time.RFC3339Nano)) g.AddAnnotation("org.opencontainers.image.stopSignal", fmt.Sprintf("%d", c.config.StopSignal)) if _, exists := g.Config.Annotations[annotations.ContainerManager]; !exists { g.AddAnnotation(annotations.ContainerManager, annotations.ContainerManagerLibpod) } if err := c.setCgroupsPath(&g); err != nil { return nil, err } // Warning: CDI may alter g.Config in place. if len(c.config.CDIDevices) > 0 { registry := cdi.GetRegistry( cdi.WithAutoRefresh(false), ) if err := registry.Refresh(); err != nil { logrus.Debugf("The following error was triggered when refreshing the CDI registry: %v", err) } _, err := registry.InjectDevices(g.Config, c.config.CDIDevices...) if err != nil { return nil, fmt.Errorf("error setting up CDI devices: %w", err) } } // Mounts need to be sorted so paths will not cover other paths mounts := sortMounts(g.Mounts()) g.ClearMounts() for _, m := range mounts { // We need to remove all symlinks from tmpfs mounts. // Runc and other runtimes may choke on them. // Easy solution: use securejoin to do a scoped evaluation of // the links, then trim off the mount prefix. if m.Type == "tmpfs" { finalPath, err := securejoin.SecureJoin(c.state.Mountpoint, m.Destination) if err != nil { return nil, fmt.Errorf("error resolving symlinks for mount destination %s: %w", m.Destination, err) } trimmedPath := strings.TrimPrefix(finalPath, strings.TrimSuffix(c.state.Mountpoint, "/")) m.Destination = trimmedPath } g.AddMount(m) } if err := c.addRootPropagation(&g, mounts); err != nil { return nil, err } // Warning: precreate hooks may alter g.Config in place. if c.state.ExtensionStageHooks, err = c.setupOCIHooks(ctx, g.Config); err != nil { return nil, fmt.Errorf("error setting up OCI Hooks: %w", err) } if len(c.config.EnvSecrets) > 0 { manager, err := c.runtime.SecretsManager() if err != nil { return nil, err } if err != nil { return nil, err } for name, secr := range c.config.EnvSecrets { _, data, err := manager.LookupSecretData(secr.Name) if err != nil { return nil, err } g.AddProcessEnv(name, string(data)) } } // Pass down the LISTEN_* environment (see #10443). for _, key := range []string{"LISTEN_PID", "LISTEN_FDS", "LISTEN_FDNAMES"} { if val, ok := os.LookupEnv(key); ok { // Force the PID to `1` since we cannot rely on (all // versions of) all runtimes to do it for us. if key == "LISTEN_PID" { val = "1" } g.AddProcessEnv(key, val) } } return g.Config, nil } // isWorkDirSymlink returns true if resolved workdir is symlink or a chain of symlinks, // and final resolved target is present either on volume, mount or inside of container // otherwise it returns false. Following function is meant for internal use only and // can change at any point of time. func (c *Container) isWorkDirSymlink(resolvedPath string) bool { // We cannot create workdir since explicit --workdir is // set in config but workdir could also be a symlink. // If it's a symlink, check if the resolved target is present in the container. // If so, that's a valid use case: return nil. maxSymLinks := 0 for { // Linux only supports a chain of 40 links. // Reference: https://github.com/torvalds/linux/blob/master/include/linux/namei.h#L13 if maxSymLinks > 40 { break } resolvedSymlink, err := os.Readlink(resolvedPath) if err != nil { // End sym-link resolution loop. break } if resolvedSymlink != "" { _, resolvedSymlinkWorkdir, err := c.resolvePath(c.state.Mountpoint, resolvedSymlink) if isPathOnVolume(c, resolvedSymlinkWorkdir) || isPathOnBindMount(c, resolvedSymlinkWorkdir) { // Resolved symlink exists on external volume or mount return true } if err != nil { // Could not resolve path so end sym-link resolution loop. break } if resolvedSymlinkWorkdir != "" { resolvedPath = resolvedSymlinkWorkdir _, err := os.Stat(resolvedSymlinkWorkdir) if err == nil { // Symlink resolved successfully and resolved path exists on container, // this is a valid use-case so return nil. logrus.Debugf("Workdir is a symlink with target to %q and resolved symlink exists on container", resolvedSymlink) return true } } } maxSymLinks++ } return false } // resolveWorkDir resolves the container's workdir and, depending on the // configuration, will create it, or error out if it does not exist. // Note that the container must be mounted before. func (c *Container) resolveWorkDir() error { workdir := c.WorkingDir() // If the specified workdir is a subdir of a volume or mount, // we don't need to do anything. The runtime is taking care of // that. if isPathOnVolume(c, workdir) || isPathOnBindMount(c, workdir) { logrus.Debugf("Workdir %q resolved to a volume or mount", workdir) return nil } _, resolvedWorkdir, err := c.resolvePath(c.state.Mountpoint, workdir) if err != nil { return err } logrus.Debugf("Workdir %q resolved to host path %q", workdir, resolvedWorkdir) st, err := os.Stat(resolvedWorkdir) if err == nil { if !st.IsDir() { return fmt.Errorf("workdir %q exists on container %s, but is not a directory", workdir, c.ID()) } return nil } if !c.config.CreateWorkingDir { // No need to create it (e.g., `--workdir=/foo`), so let's make sure // the path exists on the container. if err != nil { if os.IsNotExist(err) { // If resolved Workdir path gets marked as a valid symlink, // return nil cause this is valid use-case. if c.isWorkDirSymlink(resolvedWorkdir) { return nil } return fmt.Errorf("workdir %q does not exist on container %s", workdir, c.ID()) } // This might be a serious error (e.g., permission), so // we need to return the full error. return fmt.Errorf("error detecting workdir %q on container %s: %w", workdir, c.ID(), err) } return nil } if err := os.MkdirAll(resolvedWorkdir, 0755); err != nil { if os.IsExist(err) { return nil } return fmt.Errorf("error creating container %s workdir: %w", c.ID(), err) } // Ensure container entrypoint is created (if required). uid, gid, _, err := chrootuser.GetUser(c.state.Mountpoint, c.User()) if err != nil { return fmt.Errorf("error looking up %s inside of the container %s: %w", c.User(), c.ID(), err) } if err := os.Chown(resolvedWorkdir, int(uid), int(gid)); err != nil { return fmt.Errorf("error chowning container %s workdir to container root: %w", c.ID(), err) } return nil } func (c *Container) getUserOverrides() *lookup.Overrides { var hasPasswdFile, hasGroupFile bool overrides := lookup.Overrides{} for _, m := range c.config.Spec.Mounts { if m.Destination == "/etc/passwd" { overrides.ContainerEtcPasswdPath = m.Source hasPasswdFile = true } if m.Destination == "/etc/group" { overrides.ContainerEtcGroupPath = m.Source hasGroupFile = true } if m.Destination == "/etc" { if !hasPasswdFile { overrides.ContainerEtcPasswdPath = filepath.Join(m.Source, "passwd") } if !hasGroupFile { overrides.ContainerEtcGroupPath = filepath.Join(m.Source, "group") } } } if path, ok := c.state.BindMounts["/etc/passwd"]; ok { overrides.ContainerEtcPasswdPath = path } return &overrides } func lookupHostUser(name string) (*runcuser.ExecUser, error) { var execUser runcuser.ExecUser // Look up User on host u, err := util.LookupUser(name) if err != nil { return &execUser, err } uid, err := strconv.ParseUint(u.Uid, 8, 32) if err != nil { return &execUser, err } gid, err := strconv.ParseUint(u.Gid, 8, 32) if err != nil { return &execUser, err } execUser.Uid = int(uid) execUser.Gid = int(gid) execUser.Home = u.HomeDir return &execUser, nil } // mountNotifySocket mounts the NOTIFY_SOCKET into the container if it's set // and if the sdnotify mode is set to container. It also sets c.notifySocket // to avoid redundantly looking up the env variable. func (c *Container) mountNotifySocket(g generate.Generator) error { if c.config.SdNotifySocket == "" { return nil } if c.config.SdNotifyMode != define.SdNotifyModeContainer { return nil } notifyDir := filepath.Join(c.bundlePath(), "notify") logrus.Debugf("Checking notify %q dir", notifyDir) if err := os.MkdirAll(notifyDir, 0755); err != nil { if !os.IsExist(err) { return fmt.Errorf("unable to create notify %q dir: %w", notifyDir, err) } } if err := label.Relabel(notifyDir, c.MountLabel(), true); err != nil { return fmt.Errorf("relabel failed %q: %w", notifyDir, err) } logrus.Debugf("Add bindmount notify %q dir", notifyDir) if _, ok := c.state.BindMounts["/run/notify"]; !ok { c.state.BindMounts["/run/notify"] = notifyDir } // Set the container's notify socket to the proxy socket created by conmon g.AddProcessEnv("NOTIFY_SOCKET", "/run/notify/notify.sock") return nil } func (c *Container) addCheckpointImageMetadata(importBuilder *buildah.Builder) error { // Get information about host environment hostInfo, err := c.Runtime().hostInfo() if err != nil { return fmt.Errorf("getting host info: %v", err) } criuVersion, err := criu.GetCriuVersion() if err != nil { return fmt.Errorf("getting criu version: %v", err) } rootfsImageID, rootfsImageName := c.Image() // Add image annotations with information about the container and the host. // This information is useful to check compatibility before restoring the checkpoint checkpointImageAnnotations := map[string]string{ define.CheckpointAnnotationName: c.config.Name, define.CheckpointAnnotationRawImageName: c.config.RawImageName, define.CheckpointAnnotationRootfsImageID: rootfsImageID, define.CheckpointAnnotationRootfsImageName: rootfsImageName, define.CheckpointAnnotationPodmanVersion: version.Version.String(), define.CheckpointAnnotationCriuVersion: strconv.Itoa(criuVersion), define.CheckpointAnnotationRuntimeName: hostInfo.OCIRuntime.Name, define.CheckpointAnnotationRuntimeVersion: hostInfo.OCIRuntime.Version, define.CheckpointAnnotationConmonVersion: hostInfo.Conmon.Version, define.CheckpointAnnotationHostArch: hostInfo.Arch, define.CheckpointAnnotationHostKernel: hostInfo.Kernel, define.CheckpointAnnotationCgroupVersion: hostInfo.CgroupsVersion, define.CheckpointAnnotationDistributionVersion: hostInfo.Distribution.Version, define.CheckpointAnnotationDistributionName: hostInfo.Distribution.Distribution, } for key, value := range checkpointImageAnnotations { importBuilder.SetAnnotation(key, value) } return nil } func (c *Container) resolveCheckpointImageName(options *ContainerCheckpointOptions) error { if options.CreateImage == "" { return nil } // Resolve image name resolvedImageName, err := c.runtime.LibimageRuntime().ResolveName(options.CreateImage) if err != nil { return err } options.CreateImage = resolvedImageName return nil } func (c *Container) createCheckpointImage(ctx context.Context, options ContainerCheckpointOptions) error { if options.CreateImage == "" { return nil } logrus.Debugf("Create checkpoint image %s", options.CreateImage) // Create storage reference imageRef, err := is.Transport.ParseStoreReference(c.runtime.store, options.CreateImage) if err != nil { return errors.New("failed to parse image name") } // Build an image scratch builderOptions := buildah.BuilderOptions{ FromImage: "scratch", } importBuilder, err := buildah.NewBuilder(ctx, c.runtime.store, builderOptions) if err != nil { return err } // Clean up buildah working container defer func() { if err := importBuilder.Delete(); err != nil { logrus.Errorf("Image builder delete failed: %v", err) } }() if err := c.prepareCheckpointExport(); err != nil { return err } // Export checkpoint into temporary tar file tmpDir, err := ioutil.TempDir("", "checkpoint_image_") if err != nil { return err } defer os.RemoveAll(tmpDir) options.TargetFile = path.Join(tmpDir, "checkpoint.tar") if err := c.exportCheckpoint(options); err != nil { return err } // Copy checkpoint from temporary tar file in the image addAndCopyOptions := buildah.AddAndCopyOptions{} if err := importBuilder.Add("", true, addAndCopyOptions, options.TargetFile); err != nil { return err } if err := c.addCheckpointImageMetadata(importBuilder); err != nil { return err } commitOptions := buildah.CommitOptions{ Squash: true, SystemContext: c.runtime.imageContext, } // Create checkpoint image id, _, _, err := importBuilder.Commit(ctx, imageRef, commitOptions) if err != nil { return err } logrus.Debugf("Created checkpoint image: %s", id) return nil } func (c *Container) exportCheckpoint(options ContainerCheckpointOptions) error { if len(c.Dependencies()) == 1 { // Check if the dependency is an infra container. If it is we can checkpoint // the container out of the Pod. if c.config.Pod == "" { return errors.New("cannot export checkpoints of containers with dependencies") } pod, err := c.runtime.state.Pod(c.config.Pod) if err != nil { return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), c.config.Pod, err) } infraID, err := pod.InfraContainerID() if err != nil { return fmt.Errorf("cannot retrieve infra container ID for pod %s: %w", c.config.Pod, err) } if c.Dependencies()[0] != infraID { return errors.New("cannot export checkpoints of containers with dependencies") } } if len(c.Dependencies()) > 1 { return errors.New("cannot export checkpoints of containers with dependencies") } logrus.Debugf("Exporting checkpoint image of container %q to %q", c.ID(), options.TargetFile) includeFiles := []string{ "artifacts", metadata.DevShmCheckpointTar, metadata.ConfigDumpFile, metadata.SpecDumpFile, metadata.NetworkStatusFile, stats.StatsDump, } if c.LogDriver() == define.KubernetesLogging || c.LogDriver() == define.JSONLogging { includeFiles = append(includeFiles, "ctr.log") } if options.PreCheckPoint { includeFiles = append(includeFiles, preCheckpointDir) } else { includeFiles = append(includeFiles, metadata.CheckpointDirectory) } // Get root file-system changes included in the checkpoint archive var addToTarFiles []string if !options.IgnoreRootfs { // To correctly track deleted files, let's go through the output of 'podman diff' rootFsChanges, err := c.runtime.GetDiff("", c.ID(), define.DiffContainer) if err != nil { return fmt.Errorf("error exporting root file-system diff for %q: %w", c.ID(), err) } addToTarFiles, err := crutils.CRCreateRootFsDiffTar(&rootFsChanges, c.state.Mountpoint, c.bundlePath()) if err != nil { return err } includeFiles = append(includeFiles, addToTarFiles...) } // Folder containing archived volumes that will be included in the export expVolDir := filepath.Join(c.bundlePath(), metadata.CheckpointVolumesDirectory) // Create an archive for each volume associated with the container if !options.IgnoreVolumes { if err := os.MkdirAll(expVolDir, 0700); err != nil { return fmt.Errorf("error creating volumes export directory %q: %w", expVolDir, err) } for _, v := range c.config.NamedVolumes { volumeTarFilePath := filepath.Join(metadata.CheckpointVolumesDirectory, v.Name+".tar") volumeTarFileFullPath := filepath.Join(c.bundlePath(), volumeTarFilePath) volumeTarFile, err := os.Create(volumeTarFileFullPath) if err != nil { return fmt.Errorf("error creating %q: %w", volumeTarFileFullPath, err) } volume, err := c.runtime.GetVolume(v.Name) if err != nil { return err } mp, err := volume.MountPoint() if err != nil { return err } if mp == "" { return fmt.Errorf("volume %s is not mounted, cannot export: %w", volume.Name(), define.ErrInternal) } input, err := archive.TarWithOptions(mp, &archive.TarOptions{ Compression: archive.Uncompressed, IncludeSourceDir: true, }) if err != nil { return fmt.Errorf("error reading volume directory %q: %w", v.Dest, err) } _, err = io.Copy(volumeTarFile, input) if err != nil { return err } volumeTarFile.Close() includeFiles = append(includeFiles, volumeTarFilePath) } } input, err := archive.TarWithOptions(c.bundlePath(), &archive.TarOptions{ Compression: options.Compression, IncludeSourceDir: true, IncludeFiles: includeFiles, }) if err != nil { return fmt.Errorf("error reading checkpoint directory %q: %w", c.ID(), err) } outFile, err := os.Create(options.TargetFile) if err != nil { return fmt.Errorf("error creating checkpoint export file %q: %w", options.TargetFile, err) } defer outFile.Close() if err := os.Chmod(options.TargetFile, 0600); err != nil { return err } _, err = io.Copy(outFile, input) if err != nil { return err } for _, file := range addToTarFiles { os.Remove(filepath.Join(c.bundlePath(), file)) } if !options.IgnoreVolumes { os.RemoveAll(expVolDir) } return nil } func (c *Container) checkpointRestoreSupported(version int) error { if !criu.CheckForCriu(version) { return fmt.Errorf("checkpoint/restore requires at least CRIU %d", version) } if !c.ociRuntime.SupportsCheckpoint() { return errors.New("configured runtime does not support checkpoint/restore") } return nil } func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointOptions) (*define.CRIUCheckpointRestoreStatistics, int64, error) { if err := c.checkpointRestoreSupported(criu.MinCriuVersion); err != nil { return nil, 0, err } if c.state.State != define.ContainerStateRunning { return nil, 0, fmt.Errorf("%q is not running, cannot checkpoint: %w", c.state.State, define.ErrCtrStateInvalid) } if c.AutoRemove() && options.TargetFile == "" { return nil, 0, errors.New("cannot checkpoint containers that have been started with '--rm' unless '--export' is used") } if err := c.resolveCheckpointImageName(&options); err != nil { return nil, 0, err } if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "dump.log", c.MountLabel()); err != nil { return nil, 0, err } // Setting CheckpointLog early in case there is a failure. c.state.CheckpointLog = path.Join(c.bundlePath(), "dump.log") c.state.CheckpointPath = c.CheckpointPath() runtimeCheckpointDuration, err := c.ociRuntime.CheckpointContainer(c, options) if err != nil { return nil, 0, err } // Keep the content of /dev/shm directory if c.config.ShmDir != "" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { shmDirTarFileFullPath := filepath.Join(c.bundlePath(), metadata.DevShmCheckpointTar) shmDirTarFile, err := os.Create(shmDirTarFileFullPath) if err != nil { return nil, 0, err } defer shmDirTarFile.Close() input, err := archive.TarWithOptions(c.config.ShmDir, &archive.TarOptions{ Compression: archive.Uncompressed, IncludeSourceDir: true, }) if err != nil { return nil, 0, err } if _, err = io.Copy(shmDirTarFile, input); err != nil { return nil, 0, err } } // Save network.status. This is needed to restore the container with // the same IP. Currently limited to one IP address in a container // with one interface. // FIXME: will this break something? if _, err := metadata.WriteJSONFile(c.getNetworkStatus(), c.bundlePath(), metadata.NetworkStatusFile); err != nil { return nil, 0, err } defer c.newContainerEvent(events.Checkpoint) // There is a bug from criu: https://github.com/checkpoint-restore/criu/issues/116 // We have to change the symbolic link from absolute path to relative path if options.WithPrevious { os.Remove(path.Join(c.CheckpointPath(), "parent")) if err := os.Symlink("../pre-checkpoint", path.Join(c.CheckpointPath(), "parent")); err != nil { return nil, 0, err } } if options.TargetFile != "" { if err := c.exportCheckpoint(options); err != nil { return nil, 0, err } } else { if err := c.createCheckpointImage(ctx, options); err != nil { return nil, 0, err } } logrus.Debugf("Checkpointed container %s", c.ID()) if !options.KeepRunning && !options.PreCheckPoint { c.state.State = define.ContainerStateStopped c.state.Checkpointed = true c.state.CheckpointedTime = time.Now() c.state.Restored = false c.state.RestoredTime = time.Time{} // Clean up Storage and Network if err := c.cleanup(ctx); err != nil { return nil, 0, err } } criuStatistics, err := func() (*define.CRIUCheckpointRestoreStatistics, error) { if !options.PrintStats { return nil, nil } statsDirectory, err := os.Open(c.bundlePath()) if err != nil { return nil, fmt.Errorf("not able to open %q: %w", c.bundlePath(), err) } dumpStatistics, err := stats.CriuGetDumpStats(statsDirectory) if err != nil { return nil, fmt.Errorf("displaying checkpointing statistics not possible: %w", err) } return &define.CRIUCheckpointRestoreStatistics{ FreezingTime: dumpStatistics.GetFreezingTime(), FrozenTime: dumpStatistics.GetFrozenTime(), MemdumpTime: dumpStatistics.GetMemdumpTime(), MemwriteTime: dumpStatistics.GetMemwriteTime(), PagesScanned: dumpStatistics.GetPagesScanned(), PagesWritten: dumpStatistics.GetPagesWritten(), }, nil }() if err != nil { return nil, 0, err } if !options.Keep && !options.PreCheckPoint { cleanup := []string{ "dump.log", stats.StatsDump, metadata.ConfigDumpFile, metadata.SpecDumpFile, } for _, del := range cleanup { file := filepath.Join(c.bundlePath(), del) if err := os.Remove(file); err != nil { logrus.Debugf("Unable to remove file %s", file) } } // The file has been deleted. Do not mention it. c.state.CheckpointLog = "" } c.state.FinishedTime = time.Now() return criuStatistics, runtimeCheckpointDuration, c.save() } func (c *Container) generateContainerSpec() error { // Make sure the newly created config.json exists on disk // NewFromSpec() is deprecated according to its comment // however the recommended replace just causes a nil map panic //nolint:staticcheck g := generate.NewFromSpec(c.config.Spec) if err := c.saveSpec(g.Config); err != nil { return fmt.Errorf("saving imported container specification for restore failed: %w", err) } return nil } func (c *Container) importCheckpointImage(ctx context.Context, imageID string) error { img, _, err := c.Runtime().LibimageRuntime().LookupImage(imageID, nil) if err != nil { return err } mountPoint, err := img.Mount(ctx, nil, "") defer func() { if err := c.unmount(true); err != nil { logrus.Errorf("Failed to unmount container: %v", err) } }() if err != nil { return err } // Import all checkpoint files except ConfigDumpFile and SpecDumpFile. We // generate new container config files to enable to specifying a new // container name. checkpoint := []string{ "artifacts", metadata.CheckpointDirectory, metadata.CheckpointVolumesDirectory, metadata.DevShmCheckpointTar, metadata.RootFsDiffTar, metadata.DeletedFilesFile, metadata.PodOptionsFile, metadata.PodDumpFile, } for _, name := range checkpoint { src := filepath.Join(mountPoint, name) dst := filepath.Join(c.bundlePath(), name) if err := archive.NewDefaultArchiver().CopyWithTar(src, dst); err != nil { logrus.Debugf("Can't import '%s' from checkpoint image", name) } } return c.generateContainerSpec() } func (c *Container) importCheckpointTar(input string) error { if err := crutils.CRImportCheckpointWithoutConfig(c.bundlePath(), input); err != nil { return err } return c.generateContainerSpec() } func (c *Container) importPreCheckpoint(input string) error { archiveFile, err := os.Open(input) if err != nil { return fmt.Errorf("failed to open pre-checkpoint archive for import: %w", err) } defer archiveFile.Close() err = archive.Untar(archiveFile, c.bundlePath(), nil) if err != nil { return fmt.Errorf("unpacking of pre-checkpoint archive %s failed: %w", input, err) } return nil } func (c *Container) restore(ctx context.Context, options ContainerCheckpointOptions) (criuStatistics *define.CRIUCheckpointRestoreStatistics, runtimeRestoreDuration int64, retErr error) { minCriuVersion := func() int { if options.Pod == "" { return criu.MinCriuVersion } return criu.PodCriuVersion }() if err := c.checkpointRestoreSupported(minCriuVersion); err != nil { return nil, 0, err } if options.Pod != "" && !crutils.CRRuntimeSupportsPodCheckpointRestore(c.ociRuntime.Path()) { return nil, 0, fmt.Errorf("runtime %s does not support pod restore", c.ociRuntime.Path()) } if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateExited) { return nil, 0, fmt.Errorf("container %s is running or paused, cannot restore: %w", c.ID(), define.ErrCtrStateInvalid) } if options.ImportPrevious != "" { if err := c.importPreCheckpoint(options.ImportPrevious); err != nil { return nil, 0, err } } if options.TargetFile != "" { if err := c.importCheckpointTar(options.TargetFile); err != nil { return nil, 0, err } } else if options.CheckpointImageID != "" { if err := c.importCheckpointImage(ctx, options.CheckpointImageID); err != nil { return nil, 0, err } } // Let's try to stat() CRIU's inventory file. If it does not exist, it makes // no sense to try a restore. This is a minimal check if a checkpoint exist. if _, err := os.Stat(filepath.Join(c.CheckpointPath(), "inventory.img")); os.IsNotExist(err) { return nil, 0, fmt.Errorf("a complete checkpoint for this container cannot be found, cannot restore: %w", err) } if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "restore.log", c.MountLabel()); err != nil { return nil, 0, err } // Setting RestoreLog early in case there is a failure. c.state.RestoreLog = path.Join(c.bundlePath(), "restore.log") c.state.CheckpointPath = c.CheckpointPath() // Read network configuration from checkpoint var netStatus map[string]types.StatusBlock _, err := metadata.ReadJSONFile(&netStatus, c.bundlePath(), metadata.NetworkStatusFile) if err != nil { logrus.Infof("Failed to unmarshal network status, cannot restore the same ip/mac: %v", err) } // If the restored container should get a new name, the IP address of // the container will not be restored. This assumes that if a new name is // specified, the container is restored multiple times. // TODO: This implicit restoring with or without IP depending on an // unrelated restore parameter (--name) does not seem like the // best solution. if err == nil && options.Name == "" && (!options.IgnoreStaticIP || !options.IgnoreStaticMAC) { // The file with the network.status does exist. Let's restore the // container with the same networks settings as during checkpointing. networkOpts, err := c.networks() if err != nil { return nil, 0, err } netOpts := make(map[string]types.PerNetworkOptions, len(netStatus)) for network, perNetOpts := range networkOpts { // unset mac and ips before we start adding the ones from the status perNetOpts.StaticMAC = nil perNetOpts.StaticIPs = nil for name, netInt := range netStatus[network].Interfaces { perNetOpts.InterfaceName = name if !options.IgnoreStaticIP { perNetOpts.StaticMAC = netInt.MacAddress } if !options.IgnoreStaticIP { for _, netAddress := range netInt.Subnets { perNetOpts.StaticIPs = append(perNetOpts.StaticIPs, netAddress.IPNet.IP) } } // Normally interfaces have a length of 1, only for some special cni configs we could get more. // For now just use the first interface to get the ips this should be good enough for most cases. break } netOpts[network] = perNetOpts } c.perNetworkOpts = netOpts } defer func() { if retErr != nil { if err := c.cleanup(ctx); err != nil { logrus.Errorf("Cleaning up container %s: %v", c.ID(), err) } } }() if err := c.prepare(); err != nil { return nil, 0, err } // Read config jsonPath := filepath.Join(c.bundlePath(), "config.json") logrus.Debugf("generate.NewFromFile at %v", jsonPath) g, err := generate.NewFromFile(jsonPath) if err != nil { logrus.Debugf("generate.NewFromFile failed with %v", err) return nil, 0, err } // Restoring from an import means that we are doing migration if options.TargetFile != "" || options.CheckpointImageID != "" { g.SetRootPath(c.state.Mountpoint) } // We want to have the same network namespace as before. if c.config.CreateNetNS { netNSPath := "" if !c.config.PostConfigureNetNS { netNSPath = c.state.NetNS.Path() } if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), netNSPath); err != nil { return nil, 0, err } } if options.Pod != "" { // Running in a Pod means that we have to change all namespace settings to // the ones from the infrastructure container. pod, err := c.runtime.LookupPod(options.Pod) if err != nil { return nil, 0, fmt.Errorf("pod %q cannot be retrieved: %w", options.Pod, err) } infraContainer, err := pod.InfraContainer() if err != nil { return nil, 0, fmt.Errorf("cannot retrieved infra container from pod %q: %w", options.Pod, err) } infraContainer.lock.Lock() if err := infraContainer.syncContainer(); err != nil { infraContainer.lock.Unlock() return nil, 0, fmt.Errorf("error syncing infrastructure container %s status: %w", infraContainer.ID(), err) } if infraContainer.state.State != define.ContainerStateRunning { if err := infraContainer.initAndStart(ctx); err != nil { infraContainer.lock.Unlock() return nil, 0, fmt.Errorf("error starting infrastructure container %s status: %w", infraContainer.ID(), err) } } infraContainer.lock.Unlock() if c.config.IPCNsCtr != "" { nsPath, err := infraContainer.namespacePath(IPCNS) if err != nil { return nil, 0, fmt.Errorf("cannot retrieve IPC namespace path for Pod %q: %w", options.Pod, err) } if err := g.AddOrReplaceLinuxNamespace(string(spec.IPCNamespace), nsPath); err != nil { return nil, 0, err } } if c.config.NetNsCtr != "" { nsPath, err := infraContainer.namespacePath(NetNS) if err != nil { return nil, 0, fmt.Errorf("cannot retrieve network namespace path for Pod %q: %w", options.Pod, err) } if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), nsPath); err != nil { return nil, 0, err } } if c.config.PIDNsCtr != "" { nsPath, err := infraContainer.namespacePath(PIDNS) if err != nil { return nil, 0, fmt.Errorf("cannot retrieve PID namespace path for Pod %q: %w", options.Pod, err) } if err := g.AddOrReplaceLinuxNamespace(string(spec.PIDNamespace), nsPath); err != nil { return nil, 0, err } } if c.config.UTSNsCtr != "" { nsPath, err := infraContainer.namespacePath(UTSNS) if err != nil { return nil, 0, fmt.Errorf("cannot retrieve UTS namespace path for Pod %q: %w", options.Pod, err) } if err := g.AddOrReplaceLinuxNamespace(string(spec.UTSNamespace), nsPath); err != nil { return nil, 0, err } } if c.config.CgroupNsCtr != "" { nsPath, err := infraContainer.namespacePath(CgroupNS) if err != nil { return nil, 0, fmt.Errorf("cannot retrieve Cgroup namespace path for Pod %q: %w", options.Pod, err) } if err := g.AddOrReplaceLinuxNamespace(string(spec.CgroupNamespace), nsPath); err != nil { return nil, 0, err } } } if err := c.makeBindMounts(); err != nil { return nil, 0, err } if options.TargetFile != "" || options.CheckpointImageID != "" { for dstPath, srcPath := range c.state.BindMounts { newMount := spec.Mount{ Type: "bind", Source: srcPath, Destination: dstPath, Options: []string{"bind", "private"}, } if c.IsReadOnly() && dstPath != "/dev/shm" { newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") } if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") } if !MountExists(g.Mounts(), dstPath) { g.AddMount(newMount) } } } // Restore /dev/shm content if c.config.ShmDir != "" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { shmDirTarFileFullPath := filepath.Join(c.bundlePath(), metadata.DevShmCheckpointTar) if _, err := os.Stat(shmDirTarFileFullPath); err != nil { logrus.Debug("Container checkpoint doesn't contain dev/shm: ", err.Error()) } else { shmDirTarFile, err := os.Open(shmDirTarFileFullPath) if err != nil { return nil, 0, err } defer shmDirTarFile.Close() if err := archive.UntarUncompressed(shmDirTarFile, c.config.ShmDir, nil); err != nil { return nil, 0, err } } } // Cleanup for a working restore. if err := c.removeConmonFiles(); err != nil { return nil, 0, err } // Save the OCI spec to disk if err := c.saveSpec(g.Config); err != nil { return nil, 0, err } // When restoring from an imported archive, allow restoring the content of volumes. // Volumes are created in setupContainer() if !options.IgnoreVolumes && (options.TargetFile != "" || options.CheckpointImageID != "") { for _, v := range c.config.NamedVolumes { volumeFilePath := filepath.Join(c.bundlePath(), metadata.CheckpointVolumesDirectory, v.Name+".tar") volumeFile, err := os.Open(volumeFilePath) if err != nil { return nil, 0, fmt.Errorf("failed to open volume file %s: %w", volumeFilePath, err) } defer volumeFile.Close() volume, err := c.runtime.GetVolume(v.Name) if err != nil { return nil, 0, fmt.Errorf("failed to retrieve volume %s: %w", v.Name, err) } mountPoint, err := volume.MountPoint() if err != nil { return nil, 0, err } if mountPoint == "" { return nil, 0, fmt.Errorf("unable to import volume %s as it is not mounted: %w", volume.Name(), err) } if err := archive.UntarUncompressed(volumeFile, mountPoint, nil); err != nil { return nil, 0, fmt.Errorf("failed to extract volume %s to %s: %w", volumeFilePath, mountPoint, err) } } } // Before actually restarting the container, apply the root file-system changes if !options.IgnoreRootfs { if err := crutils.CRApplyRootFsDiffTar(c.bundlePath(), c.state.Mountpoint); err != nil { return nil, 0, err } if err := crutils.CRRemoveDeletedFiles(c.ID(), c.bundlePath(), c.state.Mountpoint); err != nil { return nil, 0, err } } runtimeRestoreDuration, err = c.ociRuntime.CreateContainer(c, &options) if err != nil { return nil, 0, err } criuStatistics, err = func() (*define.CRIUCheckpointRestoreStatistics, error) { if !options.PrintStats { return nil, nil } statsDirectory, err := os.Open(c.bundlePath()) if err != nil { return nil, fmt.Errorf("not able to open %q: %w", c.bundlePath(), err) } restoreStatistics, err := stats.CriuGetRestoreStats(statsDirectory) if err != nil { return nil, fmt.Errorf("displaying restore statistics not possible: %w", err) } return &define.CRIUCheckpointRestoreStatistics{ PagesCompared: restoreStatistics.GetPagesCompared(), PagesSkippedCow: restoreStatistics.GetPagesSkippedCow(), ForkingTime: restoreStatistics.GetForkingTime(), RestoreTime: restoreStatistics.GetRestoreTime(), PagesRestored: restoreStatistics.GetPagesRestored(), }, nil }() if err != nil { return nil, 0, err } logrus.Debugf("Restored container %s", c.ID()) c.state.State = define.ContainerStateRunning c.state.Checkpointed = false c.state.Restored = true c.state.CheckpointedTime = time.Time{} c.state.RestoredTime = time.Now() if !options.Keep { // Delete all checkpoint related files. At this point, in theory, all files // should exist. Still ignoring errors for now as the container should be // restored and running. Not erroring out just because some cleanup operation // failed. Starting with the checkpoint directory err = os.RemoveAll(c.CheckpointPath()) if err != nil { logrus.Debugf("Non-fatal: removal of checkpoint directory (%s) failed: %v", c.CheckpointPath(), err) } c.state.CheckpointPath = "" err = os.RemoveAll(c.PreCheckPointPath()) if err != nil { logrus.Debugf("Non-fatal: removal of pre-checkpoint directory (%s) failed: %v", c.PreCheckPointPath(), err) } err = os.RemoveAll(c.CheckpointVolumesPath()) if err != nil { logrus.Debugf("Non-fatal: removal of checkpoint volumes directory (%s) failed: %v", c.CheckpointVolumesPath(), err) } cleanup := [...]string{ "restore.log", "dump.log", stats.StatsDump, stats.StatsRestore, metadata.DevShmCheckpointTar, metadata.NetworkStatusFile, metadata.RootFsDiffTar, metadata.DeletedFilesFile, } for _, del := range cleanup { file := filepath.Join(c.bundlePath(), del) err = os.Remove(file) if err != nil { logrus.Debugf("Non-fatal: removal of checkpoint file (%s) failed: %v", file, err) } } c.state.CheckpointLog = "" c.state.RestoreLog = "" } return criuStatistics, runtimeRestoreDuration, c.save() }