// +build linux package libpod import ( "context" "fmt" "io" "io/ioutil" "net" "os" "path" "path/filepath" "strconv" "strings" "sync" "syscall" "time" cnitypes "github.com/containernetworking/cni/pkg/types/current" "github.com/containernetworking/plugins/pkg/ns" "github.com/containers/buildah/pkg/secrets" "github.com/containers/libpod/libpod/define" crioAnnotations "github.com/containers/libpod/pkg/annotations" "github.com/containers/libpod/pkg/apparmor" "github.com/containers/libpod/pkg/cgroups" "github.com/containers/libpod/pkg/criu" "github.com/containers/libpod/pkg/lookup" "github.com/containers/libpod/pkg/resolvconf" "github.com/containers/libpod/pkg/rootless" "github.com/containers/storage/pkg/archive" securejoin "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/user" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-tools/generate" "github.com/opencontainers/selinux/go-selinux/label" "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) func (c *Container) mountSHM(shmOptions string) error { if err := unix.Mount("shm", c.config.ShmDir, "tmpfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, label.FormatMountLabel(shmOptions, c.config.MountLabel)); err != nil { return errors.Wrapf(err, "failed to mount shm tmpfs %q", c.config.ShmDir) } return nil } func (c *Container) unmountSHM(mount string) error { if err := unix.Unmount(mount, unix.MNT_DETACH); err != nil { if err != syscall.EINVAL { logrus.Warnf("container %s failed to unmount %s : %v", c.ID(), mount, err) } else { logrus.Debugf("container %s failed to unmount %s : %v", c.ID(), mount, err) } } return nil } // prepare mounts the container and sets up other required resources like net // namespaces func (c *Container) prepare() (Err error) { var ( wg sync.WaitGroup netNS ns.NetNS networkStatus []*cnitypes.Result createNetNSErr, mountStorageErr error mountPoint string tmpStateLock sync.Mutex ) wg.Add(2) go func() { defer wg.Done() // Set up network namespace if not already set up if c.config.CreateNetNS && c.state.NetNS == nil && !c.config.PostConfigureNetNS { netNS, networkStatus, createNetNSErr = c.runtime.createNetNS(c) tmpStateLock.Lock() defer tmpStateLock.Unlock() // Assign NetNS attributes to container if createNetNSErr == nil { c.state.NetNS = netNS c.state.NetworkStatus = networkStatus } } }() // Mount storage if not mounted go func() { defer wg.Done() mountPoint, mountStorageErr = c.mountStorage() if mountStorageErr != nil { return } tmpStateLock.Lock() defer tmpStateLock.Unlock() // Finish up mountStorage c.state.Mounted = true c.state.Mountpoint = mountPoint logrus.Debugf("Created root filesystem for container %s at %s", c.ID(), c.state.Mountpoint) }() wg.Wait() var createErr error if createNetNSErr != nil { createErr = createNetNSErr } if mountStorageErr != nil { logrus.Errorf("Error preparing container %s: %v", c.ID(), createErr) createErr = mountStorageErr } // Only trigger storage cleanup if mountStorage was successful. // Otherwise, we may mess up mount counters. if createNetNSErr != nil && mountStorageErr == nil { if err := c.cleanupStorage(); err != nil { // createErr is guaranteed non-nil, so print // unconditionally logrus.Errorf("Error preparing container %s: %v", c.ID(), createErr) createErr = errors.Wrapf(err, "error unmounting storage for container %s after network create failure", c.ID()) } } // It's OK to unconditionally trigger network cleanup. If the network // isn't ready it will do nothing. if createErr != nil { if err := c.cleanupNetwork(); err != nil { logrus.Errorf("Error preparing container %s: %v", c.ID(), createErr) createErr = errors.Wrapf(err, "error cleaning up container %s network after setup failure", c.ID()) } } if createErr != nil { return createErr } // Save changes to container state return c.save() } // cleanupNetwork unmounts and cleans up the container's network func (c *Container) cleanupNetwork() error { if c.config.NetNsCtr != "" { return nil } netDisabled, err := c.NetworkDisabled() if err != nil { return err } if netDisabled { return nil } if c.state.NetNS == nil { logrus.Debugf("Network is already cleaned up, skipping...") return nil } // Stop the container's network namespace (if it has one) if err := c.runtime.teardownNetNS(c); err != nil { logrus.Errorf("unable to cleanup network for container %s: %q", c.ID(), err) } c.state.NetNS = nil c.state.NetworkStatus = nil if c.valid { return c.save() } return nil } // Generate spec for a container // Accepts a map of the container's dependencies func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { span, _ := opentracing.StartSpanFromContext(ctx, "generateSpec") span.SetTag("type", "container") defer span.Finish() execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, c.config.User, nil) if err != nil { return nil, err } g := generate.NewFromSpec(c.config.Spec) // If network namespace was requested, add it now if c.config.CreateNetNS { if c.config.PostConfigureNetNS { if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), ""); err != nil { return nil, err } } else { if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), c.state.NetNS.Path()); err != nil { return nil, err } } } // Apply AppArmor checks and load the default profile if needed. if !c.config.Privileged { updatedProfile, err := apparmor.CheckProfileAndLoadDefault(c.config.Spec.Process.ApparmorProfile) if err != nil { return nil, err } g.SetProcessApparmorProfile(updatedProfile) } if err := c.makeBindMounts(); err != nil { return nil, err } // Check if the spec file mounts contain the label Relabel flags z or Z. // If they do, relabel the source directory and then remove the option. for i := range g.Config.Mounts { m := &g.Config.Mounts[i] var options []string for _, o := range m.Options { switch o { case "z": fallthrough case "Z": if err := label.Relabel(m.Source, c.MountLabel(), label.IsShared(o)); err != nil { return nil, errors.Wrapf(err, "relabel failed %q", m.Source) } default: options = append(options, o) } } m.Options = options } g.SetProcessSelinuxLabel(c.ProcessLabel()) g.SetLinuxMountLabel(c.MountLabel()) // Add named volumes for _, namedVol := range c.config.NamedVolumes { volume, err := c.runtime.GetVolume(namedVol.Name) if err != nil { return nil, errors.Wrapf(err, "error retrieving volume %s to add to container %s", namedVol.Name, c.ID()) } mountPoint := volume.MountPoint() volMount := spec.Mount{ Type: "bind", Source: mountPoint, Destination: namedVol.Dest, Options: namedVol.Options, } g.AddMount(volMount) } // Add bind mounts to container for dstPath, srcPath := range c.state.BindMounts { newMount := spec.Mount{ Type: "bind", Source: srcPath, Destination: dstPath, Options: []string{"bind", "private"}, } if c.IsReadOnly() && dstPath != "/dev/shm" { newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") } if !MountExists(g.Mounts(), dstPath) { g.AddMount(newMount) } else { logrus.Warnf("User mount overriding libpod mount at %q", dstPath) } } if c.config.User != "" { // User and Group must go together g.SetProcessUID(uint32(execUser.Uid)) g.SetProcessGID(uint32(execUser.Gid)) } // Add addition groups if c.config.GroupAdd is not empty if len(c.config.Groups) > 0 { gids, _ := lookup.GetContainerGroups(c.config.Groups, c.state.Mountpoint, nil) for _, gid := range gids { g.AddProcessAdditionalGid(gid) } } if c.config.Systemd { if err := c.setupSystemd(g.Mounts(), g); err != nil { return nil, errors.Wrapf(err, "error adding systemd-specific mounts") } } // Look up and add groups the user belongs to, if a group wasn't directly specified if !rootless.IsRootless() && !strings.Contains(c.config.User, ":") { for _, gid := range execUser.Sgids { g.AddProcessAdditionalGid(uint32(gid)) } } // Add shared namespaces from other containers if c.config.IPCNsCtr != "" { if err := c.addNamespaceContainer(&g, IPCNS, c.config.IPCNsCtr, spec.IPCNamespace); err != nil { return nil, err } } if c.config.MountNsCtr != "" { if err := c.addNamespaceContainer(&g, MountNS, c.config.MountNsCtr, spec.MountNamespace); err != nil { return nil, err } } if c.config.NetNsCtr != "" { if err := c.addNamespaceContainer(&g, NetNS, c.config.NetNsCtr, spec.NetworkNamespace); err != nil { return nil, err } } if c.config.PIDNsCtr != "" { if err := c.addNamespaceContainer(&g, PIDNS, c.config.PIDNsCtr, spec.PIDNamespace); err != nil { return nil, err } } if c.config.UserNsCtr != "" { if err := c.addNamespaceContainer(&g, UserNS, c.config.UserNsCtr, spec.UserNamespace); err != nil { return nil, err } if len(g.Config.Linux.UIDMappings) == 0 { // runc complains if no mapping is specified, even if we join another ns. So provide a dummy mapping g.AddLinuxUIDMapping(uint32(0), uint32(0), uint32(1)) g.AddLinuxGIDMapping(uint32(0), uint32(0), uint32(1)) } } if c.config.UTSNsCtr != "" { if err := c.addNamespaceContainer(&g, UTSNS, c.config.UTSNsCtr, spec.UTSNamespace); err != nil { return nil, err } } if c.config.CgroupNsCtr != "" { if err := c.addNamespaceContainer(&g, CgroupNS, c.config.CgroupNsCtr, spec.CgroupNamespace); err != nil { return nil, err } } g.SetRootPath(c.state.Mountpoint) g.AddAnnotation(crioAnnotations.Created, c.config.CreatedTime.Format(time.RFC3339Nano)) g.AddAnnotation("org.opencontainers.image.stopSignal", fmt.Sprintf("%d", c.config.StopSignal)) for _, i := range c.config.Spec.Linux.Namespaces { if i.Type == spec.UTSNamespace { hostname := c.Hostname() g.SetHostname(hostname) g.AddProcessEnv("HOSTNAME", hostname) break } } // Only add container environment variable if not already present foundContainerEnv := false for _, env := range g.Config.Process.Env { if strings.HasPrefix(env, "container=") { foundContainerEnv = true break } } if !foundContainerEnv { g.AddProcessEnv("container", "libpod") } unified, err := cgroups.IsCgroup2UnifiedMode() if err != nil { return nil, err } if rootless.IsRootless() && !unified { g.SetLinuxCgroupsPath("") } else if c.runtime.config.CgroupManager == SystemdCgroupsManager { // When runc is set to use Systemd as a cgroup manager, it // expects cgroups to be passed as follows: // slice:prefix:name systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID()) logrus.Debugf("Setting CGroups for container %s to %s", c.ID(), systemdCgroups) g.SetLinuxCgroupsPath(systemdCgroups) } else { cgroupPath, err := c.CGroupPath() if err != nil { return nil, err } logrus.Debugf("Setting CGroup path for container %s to %s", c.ID(), cgroupPath) g.SetLinuxCgroupsPath(cgroupPath) } // Mounts need to be sorted so paths will not cover other paths mounts := sortMounts(g.Mounts()) g.ClearMounts() // Determine property of RootPropagation based on volume properties. If // a volume is shared, then keep root propagation shared. This should // work for slave and private volumes too. // // For slave volumes, it can be either [r]shared/[r]slave. // // For private volumes any root propagation value should work. rootPropagation := "" for _, m := range mounts { // We need to remove all symlinks from tmpfs mounts. // Runc and other runtimes may choke on them. // Easy solution: use securejoin to do a scoped evaluation of // the links, then trim off the mount prefix. if m.Type == "tmpfs" { finalPath, err := securejoin.SecureJoin(c.state.Mountpoint, m.Destination) if err != nil { return nil, errors.Wrapf(err, "error resolving symlinks for mount destination %s", m.Destination) } trimmedPath := strings.TrimPrefix(finalPath, strings.TrimSuffix(c.state.Mountpoint, "/")) m.Destination = trimmedPath } g.AddMount(m) for _, opt := range m.Options { switch opt { case MountShared, MountRShared: if rootPropagation != MountShared && rootPropagation != MountRShared { rootPropagation = MountShared } case MountSlave, MountRSlave: if rootPropagation != MountShared && rootPropagation != MountRShared && rootPropagation != MountSlave && rootPropagation != MountRSlave { rootPropagation = MountRSlave } } } } if rootPropagation != "" { logrus.Debugf("set root propagation to %q", rootPropagation) if err := g.SetLinuxRootPropagation(rootPropagation); err != nil { return nil, err } } // Warning: precreate hooks may alter g.Config in place. if c.state.ExtensionStageHooks, err = c.setupOCIHooks(ctx, g.Config); err != nil { return nil, errors.Wrapf(err, "error setting up OCI Hooks") } return g.Config, nil } // systemd expects to have /run, /run/lock and /tmp on tmpfs // It also expects to be able to write to /sys/fs/cgroup/systemd and /var/log/journal func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) error { options := []string{"rw", "rprivate", "noexec", "nosuid", "nodev"} for _, dest := range []string{"/run", "/run/lock"} { if MountExists(mounts, dest) { continue } tmpfsMnt := spec.Mount{ Destination: dest, Type: "tmpfs", Source: "tmpfs", Options: append(options, "tmpcopyup", "size=65536k"), } g.AddMount(tmpfsMnt) } for _, dest := range []string{"/tmp", "/var/log/journal"} { if MountExists(mounts, dest) { continue } tmpfsMnt := spec.Mount{ Destination: dest, Type: "tmpfs", Source: "tmpfs", Options: append(options, "tmpcopyup"), } g.AddMount(tmpfsMnt) } unified, err := cgroups.IsCgroup2UnifiedMode() if err != nil { return err } if unified { g.RemoveMount("/sys/fs/cgroup") sourcePath := filepath.Join("/sys/fs/cgroup") systemdMnt := spec.Mount{ Destination: "/sys/fs/cgroup", Type: "bind", Source: sourcePath, Options: []string{"bind", "private", "rw"}, } g.AddMount(systemdMnt) } else { systemdMnt := spec.Mount{ Destination: "/sys/fs/cgroup/systemd", Type: "bind", Source: "/sys/fs/cgroup/systemd", Options: []string{"bind", "nodev", "noexec", "nosuid"}, } g.AddMount(systemdMnt) } return nil } // Add an existing container's namespace to the spec func (c *Container) addNamespaceContainer(g *generate.Generator, ns LinuxNS, ctr string, specNS spec.LinuxNamespaceType) error { nsCtr, err := c.runtime.state.Container(ctr) if err != nil { return errors.Wrapf(err, "error retrieving dependency %s of container %s from state", ctr, c.ID()) } // TODO need unlocked version of this for use in pods nsPath, err := nsCtr.NamespacePath(ns) if err != nil { return err } if err := g.AddOrReplaceLinuxNamespace(string(specNS), nsPath); err != nil { return err } return nil } func (c *Container) exportCheckpoint(dest string, ignoreRootfs bool) (err error) { if (len(c.config.NamedVolumes) > 0) || (len(c.Dependencies()) > 0) { return errors.Errorf("Cannot export checkpoints of containers with named volumes or dependencies") } logrus.Debugf("Exporting checkpoint image of container %q to %q", c.ID(), dest) includeFiles := []string{ "checkpoint", "artifacts", "ctr.log", "config.dump", "spec.dump", "network.status"} // Get root file-system changes included in the checkpoint archive rootfsDiffPath := filepath.Join(c.bundlePath(), "rootfs-diff.tar") if !ignoreRootfs { rootfsDiffFile, err := os.Create(rootfsDiffPath) if err != nil { return errors.Wrapf(err, "error creating root file-system diff file %q", rootfsDiffPath) } tarStream, err := c.runtime.GetDiffTarStream("", c.ID()) if err != nil { return errors.Wrapf(err, "error exporting root file-system diff to %q", rootfsDiffPath) } _, err = io.Copy(rootfsDiffFile, tarStream) if err != nil { return errors.Wrapf(err, "error exporting root file-system diff to %q", rootfsDiffPath) } tarStream.Close() rootfsDiffFile.Close() includeFiles = append(includeFiles, "rootfs-diff.tar") } input, err := archive.TarWithOptions(c.bundlePath(), &archive.TarOptions{ Compression: archive.Gzip, IncludeSourceDir: true, IncludeFiles: includeFiles, }) if err != nil { return errors.Wrapf(err, "error reading checkpoint directory %q", c.ID()) } outFile, err := os.Create(dest) if err != nil { return errors.Wrapf(err, "error creating checkpoint export file %q", dest) } defer outFile.Close() if err := os.Chmod(dest, 0600); err != nil { return errors.Wrapf(err, "cannot chmod %q", dest) } _, err = io.Copy(outFile, input) if err != nil { return err } os.Remove(rootfsDiffPath) return nil } func (c *Container) checkpointRestoreSupported() (err error) { if !criu.CheckForCriu() { return errors.Errorf("Checkpoint/Restore requires at least CRIU %d", criu.MinCriuVersion) } if !c.ociRuntime.featureCheckCheckpointing() { return errors.Errorf("Configured runtime does not support checkpoint/restore") } return nil } func (c *Container) checkpointRestoreLabelLog(fileName string) (err error) { // Create the CRIU log file and label it dumpLog := filepath.Join(c.bundlePath(), fileName) logFile, err := os.OpenFile(dumpLog, os.O_CREATE, 0600) if err != nil { return errors.Wrapf(err, "failed to create CRIU log file %q", dumpLog) } if err := logFile.Close(); err != nil { logrus.Errorf("unable to close log file: %q", err) } if err = label.SetFileLabel(dumpLog, c.MountLabel()); err != nil { return errors.Wrapf(err, "failed to label CRIU log file %q", dumpLog) } return nil } func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointOptions) (err error) { if err := c.checkpointRestoreSupported(); err != nil { return err } if c.state.State != define.ContainerStateRunning { return errors.Wrapf(define.ErrCtrStateInvalid, "%q is not running, cannot checkpoint", c.state.State) } if err := c.checkpointRestoreLabelLog("dump.log"); err != nil { return err } if err := c.ociRuntime.checkpointContainer(c, options); err != nil { return err } // Save network.status. This is needed to restore the container with // the same IP. Currently limited to one IP address in a container // with one interface. formatJSON, err := json.MarshalIndent(c.state.NetworkStatus, "", " ") if err != nil { return err } if err := ioutil.WriteFile(filepath.Join(c.bundlePath(), "network.status"), formatJSON, 0644); err != nil { return err } if options.TargetFile != "" { if err = c.exportCheckpoint(options.TargetFile, options.IgnoreRootfs); err != nil { return err } } logrus.Debugf("Checkpointed container %s", c.ID()) if !options.KeepRunning { c.state.State = define.ContainerStateStopped // Cleanup Storage and Network if err := c.cleanup(ctx); err != nil { return err } } if !options.Keep { cleanup := []string{ "dump.log", "stats-dump", "config.dump", "spec.dump", } for _, del := range cleanup { file := filepath.Join(c.bundlePath(), del) if err := os.Remove(file); err != nil { logrus.Debugf("unable to remove file %s", file) } } } c.state.FinishedTime = time.Now() return c.save() } func (c *Container) importCheckpoint(input string) (err error) { archiveFile, err := os.Open(input) if err != nil { return errors.Wrapf(err, "Failed to open checkpoint archive %s for import", input) } defer archiveFile.Close() options := &archive.TarOptions{ ExcludePatterns: []string{ // config.dump and spec.dump are only required // container creation "config.dump", "spec.dump", }, } err = archive.Untar(archiveFile, c.bundlePath(), options) if err != nil { return errors.Wrapf(err, "Unpacking of checkpoint archive %s failed", input) } // Make sure the newly created config.json exists on disk g := generate.Generator{Config: c.config.Spec} if err = c.saveSpec(g.Config); err != nil { return errors.Wrap(err, "Saving imported container specification for restore failed") } return nil } func (c *Container) restore(ctx context.Context, options ContainerCheckpointOptions) (err error) { if err := c.checkpointRestoreSupported(); err != nil { return err } if (c.state.State != define.ContainerStateConfigured) && (c.state.State != define.ContainerStateExited) { return errors.Wrapf(define.ErrCtrStateInvalid, "container %s is running or paused, cannot restore", c.ID()) } if options.TargetFile != "" { if err = c.importCheckpoint(options.TargetFile); err != nil { return err } } // Let's try to stat() CRIU's inventory file. If it does not exist, it makes // no sense to try a restore. This is a minimal check if a checkpoint exist. if _, err := os.Stat(filepath.Join(c.CheckpointPath(), "inventory.img")); os.IsNotExist(err) { return errors.Wrapf(err, "A complete checkpoint for this container cannot be found, cannot restore") } if err := c.checkpointRestoreLabelLog("restore.log"); err != nil { return err } // If a container is restored multiple times from an exported checkpoint with // the help of '--import --name', the restore will fail if during 'podman run' // a static container IP was set with '--ip'. The user can tell the restore // process to ignore the static IP with '--ignore-static-ip' if options.IgnoreStaticIP { c.config.StaticIP = nil } // Read network configuration from checkpoint // Currently only one interface with one IP is supported. networkStatusFile, err := os.Open(filepath.Join(c.bundlePath(), "network.status")) // If the restored container should get a new name, the IP address of // the container will not be restored. This assumes that if a new name is // specified, the container is restored multiple times. // TODO: This implicit restoring with or without IP depending on an // unrelated restore parameter (--name) does not seem like the // best solution. if err == nil && options.Name == "" && !options.IgnoreStaticIP { // The file with the network.status does exist. Let's restore the // container with the same IP address as during checkpointing. defer networkStatusFile.Close() var networkStatus []*cnitypes.Result networkJSON, err := ioutil.ReadAll(networkStatusFile) if err != nil { return err } if err := json.Unmarshal(networkJSON, &networkStatus); err != nil { return err } // Take the first IP address var IP net.IP if len(networkStatus) > 0 { if len(networkStatus[0].IPs) > 0 { IP = networkStatus[0].IPs[0].Address.IP } } if IP != nil { // Tell CNI which IP address we want. c.requestedIP = IP } } defer func() { if err != nil { if err2 := c.cleanup(ctx); err2 != nil { logrus.Errorf("error cleaning up container %s: %v", c.ID(), err2) } } }() if err := c.prepare(); err != nil { return err } // Read config jsonPath := filepath.Join(c.bundlePath(), "config.json") logrus.Debugf("generate.NewFromFile at %v", jsonPath) g, err := generate.NewFromFile(jsonPath) if err != nil { logrus.Debugf("generate.NewFromFile failed with %v", err) return err } // Restoring from an import means that we are doing migration if options.TargetFile != "" { g.SetRootPath(c.state.Mountpoint) } // We want to have the same network namespace as before. if c.config.CreateNetNS { if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), c.state.NetNS.Path()); err != nil { return err } } if err := c.makeBindMounts(); err != nil { return err } if options.TargetFile != "" { for dstPath, srcPath := range c.state.BindMounts { newMount := spec.Mount{ Type: "bind", Source: srcPath, Destination: dstPath, Options: []string{"bind", "private"}, } if c.IsReadOnly() && dstPath != "/dev/shm" { newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") } if !MountExists(g.Mounts(), dstPath) { g.AddMount(newMount) } } } // Cleanup for a working restore. if err := c.removeConmonFiles(); err != nil { return err } // Save the OCI spec to disk if err := c.saveSpec(g.Config); err != nil { return err } // Before actually restarting the container, apply the root file-system changes if !options.IgnoreRootfs { rootfsDiffPath := filepath.Join(c.bundlePath(), "rootfs-diff.tar") if _, err := os.Stat(rootfsDiffPath); err == nil { // Only do this if a rootfs-diff.tar actually exists rootfsDiffFile, err := os.Open(rootfsDiffPath) if err != nil { return errors.Wrapf(err, "Failed to open root file-system diff file %s", rootfsDiffPath) } if err := c.runtime.ApplyDiffTarStream(c.ID(), rootfsDiffFile); err != nil { return errors.Wrapf(err, "Failed to apply root file-system diff file %s", rootfsDiffPath) } rootfsDiffFile.Close() } } if err := c.ociRuntime.createContainer(c, &options); err != nil { return err } logrus.Debugf("Restored container %s", c.ID()) c.state.State = define.ContainerStateRunning if !options.Keep { // Delete all checkpoint related files. At this point, in theory, all files // should exist. Still ignoring errors for now as the container should be // restored and running. Not erroring out just because some cleanup operation // failed. Starting with the checkpoint directory err = os.RemoveAll(c.CheckpointPath()) if err != nil { logrus.Debugf("Non-fatal: removal of checkpoint directory (%s) failed: %v", c.CheckpointPath(), err) } cleanup := [...]string{"restore.log", "dump.log", "stats-dump", "stats-restore", "network.status", "rootfs-diff.tar"} for _, del := range cleanup { file := filepath.Join(c.bundlePath(), del) err = os.Remove(file) if err != nil { logrus.Debugf("Non-fatal: removal of checkpoint file (%s) failed: %v", file, err) } } } return c.save() } // Make standard bind mounts to include in the container func (c *Container) makeBindMounts() error { if err := os.Chown(c.state.RunDir, c.RootUID(), c.RootGID()); err != nil { return errors.Wrapf(err, "cannot chown run directory %s", c.state.RunDir) } if c.state.BindMounts == nil { c.state.BindMounts = make(map[string]string) } netDisabled, err := c.NetworkDisabled() if err != nil { return err } if !netDisabled { // If /etc/resolv.conf and /etc/hosts exist, delete them so we // will recreate. Only do this if we aren't sharing them with // another container. if c.config.NetNsCtr == "" { if resolvePath, ok := c.state.BindMounts["/etc/resolv.conf"]; ok { if err := os.Remove(resolvePath); err != nil && !os.IsNotExist(err) { return errors.Wrapf(err, "error removing container %s resolv.conf", c.ID()) } delete(c.state.BindMounts, "/etc/resolv.conf") } if hostsPath, ok := c.state.BindMounts["/etc/hosts"]; ok { if err := os.Remove(hostsPath); err != nil && !os.IsNotExist(err) { return errors.Wrapf(err, "error removing container %s hosts", c.ID()) } delete(c.state.BindMounts, "/etc/hosts") } } if c.config.NetNsCtr != "" && (!c.config.UseImageResolvConf || !c.config.UseImageHosts) { // We share a net namespace. // We want /etc/resolv.conf and /etc/hosts from the // other container. Unless we're not creating both of // them. depCtr, err := c.runtime.state.Container(c.config.NetNsCtr) if err != nil { return errors.Wrapf(err, "error fetching dependency %s of container %s", c.config.NetNsCtr, c.ID()) } // We need that container's bind mounts bindMounts, err := depCtr.BindMounts() if err != nil { return errors.Wrapf(err, "error fetching bind mounts from dependency %s of container %s", depCtr.ID(), c.ID()) } if !c.config.UseImageResolvConf { // The other container may not have a resolv.conf or /etc/hosts // If it doesn't, don't copy them resolvPath, exists := bindMounts["/etc/resolv.conf"] if exists { c.state.BindMounts["/etc/resolv.conf"] = resolvPath } } if !c.config.UseImageHosts { // check if dependency container has an /etc/hosts file hostsPath, exists := bindMounts["/etc/hosts"] if !exists { return errors.Errorf("error finding hosts file of dependency container %s for container %s", depCtr.ID(), c.ID()) } depCtr.lock.Lock() // generate a hosts file for the dependency container, // based on either its old hosts file, or the default, // and add the relevant information from the new container (hosts and IP) hostsPath, err = depCtr.appendHosts(hostsPath, c) if err != nil { depCtr.lock.Unlock() return errors.Wrapf(err, "error creating hosts file for container %s which depends on container %s", c.ID(), depCtr.ID()) } depCtr.lock.Unlock() // finally, save it in the new container c.state.BindMounts["/etc/hosts"] = hostsPath } } else { if !c.config.UseImageResolvConf { newResolv, err := c.generateResolvConf() if err != nil { return errors.Wrapf(err, "error creating resolv.conf for container %s", c.ID()) } c.state.BindMounts["/etc/resolv.conf"] = newResolv } if !c.config.UseImageHosts { newHosts, err := c.generateHosts("/etc/hosts") if err != nil { return errors.Wrapf(err, "error creating hosts file for container %s", c.ID()) } c.state.BindMounts["/etc/hosts"] = newHosts } } if c.state.BindMounts["/etc/hosts"] != "" { if err := label.Relabel(c.state.BindMounts["/etc/hosts"], c.config.MountLabel, true); err != nil { return err } } if c.state.BindMounts["/etc/resolv.conf"] != "" { if err := label.Relabel(c.state.BindMounts["/etc/resolv.conf"], c.config.MountLabel, true); err != nil { return err } } } // SHM is always added when we mount the container c.state.BindMounts["/dev/shm"] = c.config.ShmDir newPasswd, err := c.generatePasswd() if err != nil { return errors.Wrapf(err, "error creating temporary passwd file for container %s", c.ID()) } if newPasswd != "" { // Make /etc/passwd if _, ok := c.state.BindMounts["/etc/passwd"]; ok { // If it already exists, delete so we can recreate delete(c.state.BindMounts, "/etc/passwd") } logrus.Debugf("adding entry to /etc/passwd for non existent default user") c.state.BindMounts["/etc/passwd"] = newPasswd } // Make /etc/hostname // This should never change, so no need to recreate if it exists if _, ok := c.state.BindMounts["/etc/hostname"]; !ok { hostnamePath, err := c.writeStringToRundir("hostname", c.Hostname()) if err != nil { return errors.Wrapf(err, "error creating hostname file for container %s", c.ID()) } c.state.BindMounts["/etc/hostname"] = hostnamePath } // Make .containerenv // Empty file, so no need to recreate if it exists if _, ok := c.state.BindMounts["/run/.containerenv"]; !ok { // Empty string for now, but we may consider populating this later containerenvPath, err := c.writeStringToRundir(".containerenv", "") if err != nil { return errors.Wrapf(err, "error creating containerenv file for container %s", c.ID()) } c.state.BindMounts["/run/.containerenv"] = containerenvPath } // Add Secret Mounts secretMounts := secrets.SecretMountsWithUIDGID(c.config.MountLabel, c.state.RunDir, c.runtime.config.DefaultMountsFile, c.state.RunDir, c.RootUID(), c.RootGID(), rootless.IsRootless()) for _, mount := range secretMounts { if _, ok := c.state.BindMounts[mount.Destination]; !ok { c.state.BindMounts[mount.Destination] = mount.Source } } return nil } // generateResolvConf generates a containers resolv.conf func (c *Container) generateResolvConf() (string, error) { var ( nameservers []string cniNameServers []string ) resolvConf := "/etc/resolv.conf" for _, namespace := range c.config.Spec.Linux.Namespaces { if namespace.Type == spec.NetworkNamespace { if namespace.Path != "" && !strings.HasPrefix(namespace.Path, "/proc/") { definedPath := filepath.Join("/etc/netns", filepath.Base(namespace.Path), "resolv.conf") _, err := os.Stat(definedPath) if err == nil { resolvConf = definedPath } else if !os.IsNotExist(err) { return "", errors.Wrapf(err, "failed to stat %s", definedPath) } } break } } // Determine the endpoint for resolv.conf in case it is a symlink resolvPath, err := filepath.EvalSymlinks(resolvConf) if err != nil { return "", err } contents, err := ioutil.ReadFile(resolvPath) if err != nil { return "", errors.Wrapf(err, "unable to read %s", resolvPath) } // Ensure that the container's /etc/resolv.conf is compatible with its // network configuration. // TODO: set ipv6 enable bool more sanely resolv, err := resolvconf.FilterResolvDNS(contents, true, c.config.CreateNetNS) if err != nil { return "", errors.Wrapf(err, "error parsing host resolv.conf") } // Check if CNI gave back and DNS servers for us to add in cniResponse := c.state.NetworkStatus for _, i := range cniResponse { if i.DNS.Nameservers != nil { cniNameServers = append(cniNameServers, i.DNS.Nameservers...) logrus.Debugf("adding nameserver(s) from cni response of '%q'", i.DNS.Nameservers) } } // If the user provided dns, it trumps all; then dns masq; then resolv.conf if len(c.config.DNSServer) > 0 { // We store DNS servers as net.IP, so need to convert to string for _, server := range c.config.DNSServer { nameservers = append(nameservers, server.String()) } } else if len(cniNameServers) > 0 { nameservers = append(nameservers, cniNameServers...) } else { // Make a new resolv.conf nameservers = resolvconf.GetNameservers(resolv.Content) // slirp4netns has a built in DNS server. if c.config.NetMode.IsSlirp4netns() { nameservers = append([]string{"10.0.2.3"}, nameservers...) } } search := resolvconf.GetSearchDomains(resolv.Content) if len(c.config.DNSSearch) > 0 { search = c.config.DNSSearch } options := resolvconf.GetOptions(resolv.Content) if len(c.config.DNSOption) > 0 { options = c.config.DNSOption } destPath := filepath.Join(c.state.RunDir, "resolv.conf") if err := os.Remove(destPath); err != nil && !os.IsNotExist(err) { return "", errors.Wrapf(err, "error removing resolv.conf for container %s", c.ID()) } // Build resolv.conf if _, err = resolvconf.Build(destPath, nameservers, search, options); err != nil { return "", errors.Wrapf(err, "error building resolv.conf for container %s", c.ID()) } // Relabel resolv.conf for the container if err := label.Relabel(destPath, c.config.MountLabel, true); err != nil { return "", err } return filepath.Join(c.state.RunDir, "resolv.conf"), nil } // generateHosts creates a containers hosts file func (c *Container) generateHosts(path string) (string, error) { orig, err := ioutil.ReadFile(path) if err != nil { return "", errors.Wrapf(err, "unable to read %s", path) } hosts := string(orig) hosts += c.getHosts() return c.writeStringToRundir("hosts", hosts) } // appendHosts appends a container's config and state pertaining to hosts to a container's // local hosts file. netCtr is the container from which the netNS information is // taken. // path is the basis of the hosts file, into which netCtr's netNS information will be appended. func (c *Container) appendHosts(path string, netCtr *Container) (string, error) { return c.appendStringToRundir("hosts", netCtr.getHosts()) } // getHosts finds the pertinent information for a container's host file in its config and state // and returns a string in a format that can be written to the host file func (c *Container) getHosts() string { var hosts string if len(c.config.HostAdd) > 0 { for _, host := range c.config.HostAdd { // the host format has already been verified at this point fields := strings.SplitN(host, ":", 2) hosts += fmt.Sprintf("%s %s\n", fields[1], fields[0]) } } if c.config.NetMode.IsSlirp4netns() { // When using slirp4netns, the interface gets a static IP hosts += fmt.Sprintf("# used by slirp4netns\n%s\t%s\n", "10.0.2.100", c.Hostname()) } if len(c.state.NetworkStatus) > 0 && len(c.state.NetworkStatus[0].IPs) > 0 { ipAddress := strings.Split(c.state.NetworkStatus[0].IPs[0].Address.String(), "/")[0] hosts += fmt.Sprintf("%s\t%s\n", ipAddress, c.Hostname()) } return hosts } // generatePasswd generates a container specific passwd file, // iff g.config.User is a number func (c *Container) generatePasswd() (string, error) { var ( groupspec string gid int ) if c.config.User == "" { return "", nil } splitSpec := strings.SplitN(c.config.User, ":", 2) userspec := splitSpec[0] if len(splitSpec) > 1 { groupspec = splitSpec[1] } // If a non numeric User, then don't generate passwd uid, err := strconv.ParseUint(userspec, 10, 32) if err != nil { return "", nil } // Lookup the user to see if it exists in the container image _, err = lookup.GetUser(c.state.Mountpoint, userspec) if err != nil && err != user.ErrNoPasswdEntries { return "", err } if err == nil { return "", nil } if groupspec != "" { ugid, err := strconv.ParseUint(groupspec, 10, 32) if err == nil { gid = int(ugid) } else { group, err := lookup.GetGroup(c.state.Mountpoint, groupspec) if err != nil { return "", errors.Wrapf(err, "unable to get gid %s from group file", groupspec) } gid = group.Gid } } originPasswdFile := filepath.Join(c.state.Mountpoint, "/etc/passwd") orig, err := ioutil.ReadFile(originPasswdFile) if err != nil && !os.IsNotExist(err) { return "", errors.Wrapf(err, "unable to read passwd file %s", originPasswdFile) } pwd := fmt.Sprintf("%s%d:x:%d:%d:container user:%s:/bin/sh\n", orig, uid, uid, gid, c.WorkingDir()) passwdFile, err := c.writeStringToRundir("passwd", pwd) if err != nil { return "", errors.Wrapf(err, "failed to create temporary passwd file") } if err := os.Chmod(passwdFile, 0644); err != nil { return "", err } return passwdFile, nil } func (c *Container) copyOwnerAndPerms(source, dest string) error { info, err := os.Stat(source) if err != nil { if os.IsNotExist(err) { return nil } return errors.Wrapf(err, "cannot stat `%s`", dest) } if err := os.Chmod(dest, info.Mode()); err != nil { return errors.Wrapf(err, "cannot chmod `%s`", dest) } if err := os.Chown(dest, int(info.Sys().(*syscall.Stat_t).Uid), int(info.Sys().(*syscall.Stat_t).Gid)); err != nil { return errors.Wrapf(err, "cannot chown `%s`", dest) } return nil }