From f7c8fd8a3d6f289a3abee1e2f676bfb956f7195c Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 18 Sep 2018 09:56:19 +0000 Subject: Add support to checkpoint/restore containers runc uses CRIU to support checkpoint and restore of containers. This brings an initial checkpoint/restore implementation to podman. None of the additional runc flags are yet supported and container migration optimization (pre-copy/post-copy) is also left for the future. The current status is that it is possible to checkpoint and restore a container. I am testing on RHEL-7.x and as the combination of RHEL-7 and CRIU has seccomp troubles I have to create the container without seccomp. With the following steps I am able to checkpoint and restore a container: # podman run --security-opt="seccomp=unconfined" -d registry.fedoraproject.org/f27/httpd # curl -I 10.22.0.78:8080 HTTP/1.1 403 Forbidden # <-- this is actually a good answer # podman container checkpoint # curl -I 10.22.0.78:8080 curl: (7) Failed connect to 10.22.0.78:8080; No route to host # podman container restore # curl -I 10.22.0.78:8080 HTTP/1.1 403 Forbidden I am using CRIU, runc and conmon from git. All required changes for checkpoint/restore support in podman have been merged in the corresponding projects. To have the same IP address in the restored container as before checkpointing, CNI is told which IP address to use. If the saved network configuration cannot be found during restore, the container is restored with a new IP address. For CRIU to restore established TCP connections the IP address of the network namespace used for restore needs to be the same. For TCP connections in the listening state the IP address can change. During restore only one network interface with one IP address is handled correctly. Support to restore containers with more advanced network configuration will be implemented later. 
v2: * comment typo * print debug messages during cleanup of restore files * use createContainer() instead of createOCIContainer() * introduce helper CheckpointPath() * do not try to restore a container that is paused * use existing helper functions for cleanup * restructure code flow for better readability * do not try to restore if checkpoint/inventory.img is missing * git add checkpoint.go restore.go v3: * move checkpoint/restore under 'podman container' v4: * incorporated changes from latest reviews Signed-off-by: Adrian Reber --- cmd/podman/checkpoint.go | 73 ++++++++++++++ cmd/podman/container.go | 2 + cmd/podman/restore.go | 73 ++++++++++++++ libpod/container_api.go | 30 ++++++ libpod/container_internal.go | 7 +- libpod/container_internal_linux.go | 158 +++++++++++++++++++++++++++++++ libpod/container_internal_unsupported.go | 8 ++ libpod/oci.go | 18 +++- libpod/oci_linux.go | 6 +- libpod/oci_unsupported.go | 2 +- 10 files changed, 371 insertions(+), 6 deletions(-) create mode 100644 cmd/podman/checkpoint.go create mode 100644 cmd/podman/restore.go diff --git a/cmd/podman/checkpoint.go b/cmd/podman/checkpoint.go new file mode 100644 index 000000000..cbbbcd740 --- /dev/null +++ b/cmd/podman/checkpoint.go @@ -0,0 +1,73 @@ +package main + +import ( + "context" + "fmt" + "os" + + "github.com/containers/libpod/cmd/podman/libpodruntime" + "github.com/containers/libpod/pkg/rootless" + "github.com/pkg/errors" + "github.com/urfave/cli" +) + +var ( + checkpointDescription = ` + podman container checkpoint + + Checkpoints one or more running containers. The container name or ID can be used. 
+` + checkpointFlags = []cli.Flag{ + cli.BoolFlag{ + Name: "keep, k", + Usage: "keep all temporary checkpoint files", + }, + } + checkpointCommand = cli.Command{ + Name: "checkpoint", + Usage: "Checkpoints one or more containers", + Description: checkpointDescription, + Flags: checkpointFlags, + Action: checkpointCmd, + ArgsUsage: "CONTAINER-NAME [CONTAINER-NAME ...]", + } +) + +func checkpointCmd(c *cli.Context) error { + if rootless.IsRootless() { + return errors.New("checkpointing a container requires root") + } + + runtime, err := libpodruntime.GetRuntime(c) + if err != nil { + return errors.Wrapf(err, "could not get runtime") + } + defer runtime.Shutdown(false) + + keep := c.Bool("keep") + args := c.Args() + if len(args) < 1 { + return errors.Errorf("you must provide at least one container name or id") + } + + var lastError error + for _, arg := range args { + ctr, err := runtime.LookupContainer(arg) + if err != nil { + if lastError != nil { + fmt.Fprintln(os.Stderr, lastError) + } + lastError = errors.Wrapf(err, "error looking up container %q", arg) + continue + } + if err = ctr.Checkpoint(context.TODO(), keep); err != nil { + if lastError != nil { + fmt.Fprintln(os.Stderr, lastError) + } + lastError = errors.Wrapf(err, "failed to checkpoint container %v", ctr.ID()) + } else { + fmt.Println(ctr.ID()) + } + } + return lastError +} diff --git a/cmd/podman/container.go b/cmd/podman/container.go index 82c1c824d..ff634278f 100644 --- a/cmd/podman/container.go +++ b/cmd/podman/container.go @@ -7,6 +7,7 @@ import ( var ( subCommands = []cli.Command{ attachCommand, + checkpointCommand, cleanupCommand, commitCommand, createCommand, @@ -23,6 +24,7 @@ var ( // pruneCommand, refreshCommand, restartCommand, + restoreCommand, rmCommand, runCommand, runlabelCommand, diff --git a/cmd/podman/restore.go b/cmd/podman/restore.go new file mode 100644 index 000000000..43ef87ca2 --- /dev/null +++ b/cmd/podman/restore.go @@ -0,0 +1,73 @@ +package main + +import ( + "context" + "fmt" 
+ "os" + + "github.com/containers/libpod/cmd/podman/libpodruntime" + "github.com/containers/libpod/pkg/rootless" + "github.com/pkg/errors" + "github.com/urfave/cli" +) + +var ( + restoreDescription = ` + podman container restore + + Restores a container from a checkpoint. The container name or ID can be used. +` + restoreFlags = []cli.Flag{ + cli.BoolFlag{ + Name: "keep, k", + Usage: "keep all temporary checkpoint files", + }, + } + restoreCommand = cli.Command{ + Name: "restore", + Usage: "Restores one or more containers from a checkpoint", + Description: restoreDescription, + Flags: restoreFlags, + Action: restoreCmd, + ArgsUsage: "CONTAINER-NAME [CONTAINER-NAME ...]", + } +) + +func restoreCmd(c *cli.Context) error { + if rootless.IsRootless() { + return errors.New("restoring a container requires root") + } + + runtime, err := libpodruntime.GetRuntime(c) + if err != nil { + return errors.Wrapf(err, "could not get runtime") + } + defer runtime.Shutdown(false) + + keep := c.Bool("keep") + args := c.Args() + if len(args) < 1 { + return errors.Errorf("you must provide at least one container name or id") + } + + var lastError error + for _, arg := range args { + ctr, err := runtime.LookupContainer(arg) + if err != nil { + if lastError != nil { + fmt.Fprintln(os.Stderr, lastError) + } + lastError = errors.Wrapf(err, "error looking up container %q", arg) + continue + } + if err = ctr.Restore(context.TODO(), keep); err != nil { + if lastError != nil { + fmt.Fprintln(os.Stderr, lastError) + } + lastError = errors.Wrapf(err, "failed to restore container %v", ctr.ID()) + } else { + fmt.Println(ctr.ID()) + } + } + return lastError +} diff --git a/libpod/container_api.go b/libpod/container_api.go index 192ccd347..93becb80d 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -832,3 +832,33 @@ func (c *Container) Refresh(ctx context.Context) error { return nil } + +// Checkpoint checkpoints a container +func (c *Container) Checkpoint(ctx context.Context, keep 
bool) error {
+	logrus.Debugf("Trying to checkpoint container %s", c.ID())
+	if !c.batched {
+		c.lock.Lock()
+		defer c.lock.Unlock()
+
+		if err := c.syncContainer(); err != nil {
+			return err
+		}
+	}
+
+	return c.checkpoint(ctx, keep)
+}
+
+// Restore restores a container from the checkpoint created by Checkpoint
+func (c *Container) Restore(ctx context.Context, keep bool) (err error) {
+	logrus.Debugf("Trying to restore container %s", c.ID())
+	if !c.batched {
+		c.lock.Lock()
+		defer c.lock.Unlock()
+
+		if err := c.syncContainer(); err != nil {
+			return err
+		}
+	}
+
+	return c.restore(ctx, keep)
+}
diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 033426817..c925f070b 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -129,6 +129,11 @@ func (c *Container) ControlSocketPath() string { return filepath.Join(c.bundlePath(), "ctl") } +// CheckpointPath returns the path to the directory containing the checkpoint +func (c *Container) CheckpointPath() string { + return filepath.Join(c.bundlePath(), "checkpoint") +} + // AttachSocketPath retrieves the path of the container's attach socket func (c *Container) AttachSocketPath() string { return filepath.Join(c.runtime.ociRuntime.socketsDir, c.ID(), "attach") @@ -523,7 +528,7 @@ func (c *Container) init(ctx context.Context) error { } // With the spec complete, do an OCI create - if err := c.runtime.ociRuntime.createContainer(c, c.config.CgroupParent); err != nil { + if err := c.runtime.ociRuntime.createContainer(c, c.config.CgroupParent, false); err != nil { return err } diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index b77beaf64..0353124dd 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -4,12 +4,18 @@ package libpod import ( "context" + "encoding/json" "fmt" + "io/ioutil" + "net" + "os" "path" + "path/filepath" "strings" "syscall" "time" + cnitypes "github.com/containernetworking/cni/pkg/types/current" crioAnnotations 
"github.com/containers/libpod/pkg/annotations" "github.com/containers/libpod/pkg/chrootuser" "github.com/containers/libpod/pkg/rootless"
@@ -307,3 +313,165 @@ func (c *Container) addNamespaceContainer(g *generate.Generator, ns LinuxNS, ctr
 
 	return nil
 }
+
+// checkpoint asks the OCI runtime to checkpoint the running container, saves
+// the container's network status as network.status in the bundle directory so
+// that restore() can request the same IP address again, and then cleans up the
+// container. Unless keep is set, dump.log and stats-dump are removed.
+func (c *Container) checkpoint(ctx context.Context, keep bool) (err error) {
+
+	if c.state.State != ContainerStateRunning {
+		return errors.Wrapf(ErrCtrStateInvalid, "container %s is not running (state %q), cannot checkpoint", c.ID(), c.state.State)
+	}
+	if err := c.runtime.ociRuntime.checkpointContainer(c); err != nil {
+		return err
+	}
+
+	// Save network.status. This is needed to restore the container with
+	// the same IP. Currently limited to one IP address in a container
+	// with one interface.
+	formatJSON, err := json.MarshalIndent(c.state.NetworkStatus, "", " ")
+	if err != nil {
+		return err
+	}
+	if err := ioutil.WriteFile(filepath.Join(c.bundlePath(), "network.status"), formatJSON, 0644); err != nil {
+		return err
+	}
+
+	logrus.Debugf("Checkpointed container %s", c.ID())
+
+	c.state.State = ContainerStateStopped
+
+	// Cleanup Storage and Network
+	if err := c.cleanup(ctx); err != nil {
+		return err
+	}
+
+	if !keep {
+		// Remove log file
+		os.Remove(filepath.Join(c.bundlePath(), "dump.log"))
+		// Remove statistic file
+		os.Remove(filepath.Join(c.bundlePath(), "stats-dump"))
+	}
+
+	return c.save()
+}
+
+// restore re-creates the container from the checkpoint in CheckpointPath().
+// If a saved network.status is found, its first IP address is requested from
+// CNI via CNI_ARGS; a network.status that cannot be read or parsed is an error.
+// Unless keep is set, the checkpoint and all log/statistics files are removed.
+func (c *Container) restore(ctx context.Context, keep bool) (err error) {
+
+	if (c.state.State != ContainerStateConfigured) && (c.state.State != ContainerStateExited) {
+		return errors.Wrapf(ErrCtrStateInvalid, "container %s is running or paused, cannot restore", c.ID())
+	}
+
+	// Let's try to stat() CRIU's inventory file. If it does not exist, it makes
+	// no sense to try a restore. This is a minimal check if a checkpoint exists.
+	if _, err := os.Stat(filepath.Join(c.CheckpointPath(), "inventory.img")); os.IsNotExist(err) {
+		return errors.Wrapf(err, "A complete checkpoint for this container cannot be found, cannot restore")
+	}
+
+	// Read network configuration from checkpoint
+	// Currently only one interface with one IP is supported.
+	networkStatusFile, err := os.Open(filepath.Join(c.bundlePath(), "network.status"))
+	if err == nil {
+		// The file with the network.status does exist. Let's restore the
+		// container with the same IP address as during checkpointing.
+		defer networkStatusFile.Close()
+		var networkStatus []*cnitypes.Result
+		networkJSON, err := ioutil.ReadAll(networkStatusFile)
+		if err != nil {
+			return err
+		}
+		if err := json.Unmarshal(networkJSON, &networkStatus); err != nil {
+			return err
+		}
+		// Take the first IP address
+		var IP net.IP
+		if len(networkStatus) > 0 {
+			if len(networkStatus[0].IPs) > 0 {
+				IP = networkStatus[0].IPs[0].Address.IP
+			}
+		}
+		if IP != nil {
+			env := fmt.Sprintf("IP=%s", IP)
+			// Tell CNI which IP address we want.
+			os.Setenv("CNI_ARGS", env)
+			logrus.Debugf("Restoring container with %s", env)
+		}
+	}
+
+	// TODO: use existing way to request static IPs, once it is merged in ocicni
+	// https://github.com/cri-o/ocicni/pull/23/
+
+	prepareErr := c.prepare()
+	// CNI_ARGS was used to request a certain IP address. Unconditionally remove
+	// it, also if prepare() failed, so that restoring another container from
+	// this process does not inherit the IP request.
+	os.Unsetenv("CNI_ARGS")
+	if prepareErr != nil {
+		return prepareErr
+	}
+	defer func() {
+		if err != nil {
+			if err2 := c.cleanup(ctx); err2 != nil {
+				logrus.Errorf("error cleaning up container %s: %v", c.ID(), err2)
+			}
+		}
+	}()
+
+	// Read config
+	jsonPath := filepath.Join(c.bundlePath(), "config.json")
+	logrus.Debugf("generate.NewFromFile at %v", jsonPath)
+	g, err := generate.NewFromFile(jsonPath)
+	if err != nil {
+		logrus.Debugf("generate.NewFromFile failed with %v", err)
+		return err
+	}
+
+	// We want to have the same network namespace as before.
+	if c.config.CreateNetNS {
+		g.AddOrReplaceLinuxNamespace(spec.NetworkNamespace, c.state.NetNS.Path())
+	}
+
+	// Save the OCI spec to disk
+	if err := c.saveSpec(g.Spec()); err != nil {
+		return err
+	}
+
+	if err := c.makeBindMounts(); err != nil {
+		return err
+	}
+
+	// Cleanup for a working restore.
+	c.removeConmonFiles()
+
+	if err := c.runtime.ociRuntime.createContainer(c, c.config.CgroupParent, true); err != nil {
+		return err
+	}
+
+	logrus.Debugf("Restored container %s", c.ID())
+
+	c.state.State = ContainerStateRunning
+
+	if !keep {
+		// Delete all checkpoint related files. At this point, in theory, all files
+		// should exist. Still ignoring errors for now as the container should be
+		// restored and running. Not erroring out just because some cleanup operation
+		// failed. Starting with the checkpoint directory
+		if rmErr := os.RemoveAll(c.CheckpointPath()); rmErr != nil {
+			logrus.Debugf("Non-fatal: removal of checkpoint directory (%s) failed: %v", c.CheckpointPath(), rmErr)
+		}
+		cleanup := [...]string{"restore.log", "dump.log", "stats-dump", "stats-restore", "network.status"}
+		for _, name := range cleanup {
+			file := filepath.Join(c.bundlePath(), name)
+			if rmErr := os.Remove(file); rmErr != nil {
+				logrus.Debugf("Non-fatal: removal of checkpoint file (%s) failed: %v", file, rmErr)
+			}
+		}
+	}
+
+	return c.save()
+}
diff --git a/libpod/container_internal_unsupported.go b/libpod/container_internal_unsupported.go index 45b54efab..eed0449a9 100644 --- a/libpod/container_internal_unsupported.go +++ b/libpod/container_internal_unsupported.go @@ -27,3 +27,11 @@ func (c *Container) cleanupNetwork() error { func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { return nil, ErrNotImplemented } + +func (c *Container) checkpoint(ctx context.Context, keep bool) error { + return ErrNotImplemented +} + +func (c *Container) restore(ctx context.Context, keep bool) error { + return ErrNotImplemented +} diff --git a/libpod/oci.go b/libpod/oci.go 
index e5db06540..cf2b76ab0 100644 --- a/libpod/oci.go +++ b/libpod/oci.go @@ -227,7 +227,7 @@ func bindPorts(ports []ocicni.PortMapping) ([]*os.File, error) { return files, nil } -func (r *OCIRuntime) createOCIContainer(ctr *Container, cgroupParent string) (err error) { +func (r *OCIRuntime) createOCIContainer(ctr *Container, cgroupParent string, restoreContainer bool) (err error) { var stderrBuf bytes.Buffer runtimeDir, err := GetRootlessRuntimeDir() @@ -289,6 +289,10 @@ func (r *OCIRuntime) createOCIContainer(ctr *Container, cgroupParent string) (er args = append(args, "--syslog") } + if restoreContainer { + args = append(args, "--restore", ctr.CheckpointPath()) + } + logrus.WithFields(logrus.Fields{ "args": args, }).Debugf("running conmon: %s", r.conmonPath) @@ -766,3 +770,15 @@ func (r *OCIRuntime) execStopContainer(ctr *Container, timeout uint) error { return nil } + +// checkpointContainer checkpoints the given container +func (r *OCIRuntime) checkpointContainer(ctr *Container) error { + // imagePath is used by CRIU to store the actual checkpoint files + imagePath := ctr.CheckpointPath() + // workPath will be used to store dump.log and stats-dump + workPath := ctr.bundlePath() + logrus.Debugf("Writing checkpoint to %s", imagePath) + logrus.Debugf("Writing checkpoint logs to %s", workPath) + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, "checkpoint", + "--image-path", imagePath, "--work-path", workPath, ctr.ID()) +} diff --git a/libpod/oci_linux.go b/libpod/oci_linux.go index 210ba57d1..0447670b3 100644 --- a/libpod/oci_linux.go +++ b/libpod/oci_linux.go @@ -63,10 +63,10 @@ func newPipe() (parent *os.File, child *os.File, err error) { // CreateContainer creates a container in the OCI runtime // TODO terminal support for container // Presently just ignoring conmon opts related to it -func (r *OCIRuntime) createContainer(ctr *Container, cgroupParent string) (err error) { +func (r *OCIRuntime) createContainer(ctr *Container, 
cgroupParent string, restoreContainer bool) (err error) { if ctr.state.UserNSRoot == "" { // no need of an intermediate mount ns - return r.createOCIContainer(ctr, cgroupParent) + return r.createOCIContainer(ctr, cgroupParent, restoreContainer) } var wg sync.WaitGroup wg.Add(1) @@ -103,7 +103,7 @@ func (r *OCIRuntime) createContainer(ctr *Container, cgroupParent string) (err e if err != nil { return } - err = r.createOCIContainer(ctr, cgroupParent) + err = r.createOCIContainer(ctr, cgroupParent, restoreContainer) }() wg.Wait() diff --git a/libpod/oci_unsupported.go b/libpod/oci_unsupported.go index 8cb4994d3..b133eb402 100644 --- a/libpod/oci_unsupported.go +++ b/libpod/oci_unsupported.go @@ -15,7 +15,7 @@ func newPipe() (parent *os.File, child *os.File, err error) { return nil, nil, ErrNotImplemented } -func (r *OCIRuntime) createContainer(ctr *Container, cgroupParent string) (err error) { +func (r *OCIRuntime) createContainer(ctr *Container, cgroupParent string, restoreContainer bool) (err error) { return ErrNotImplemented } -- cgit v1.2.3-54-g00ecf From e2b639a32fa2386c53d8a77fe18543bb054a95dd Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 19 Sep 2018 16:58:34 +0000 Subject: docs: add checkpoint and restore man pages This adds the podman-container-checkpoint and podman-container-restore man pages. 
Signed-off-by: Adrian Reber --- docs/podman-container-checkpoint.1.md | 30 ++++++++++++++++++++++++++++ docs/podman-container-restore.1.md | 37 +++++++++++++++++++++++++++++++++++ docs/podman-container.1.md | 2 ++ 3 files changed, 69 insertions(+) create mode 100644 docs/podman-container-checkpoint.1.md create mode 100644 docs/podman-container-restore.1.md diff --git a/docs/podman-container-checkpoint.1.md b/docs/podman-container-checkpoint.1.md new file mode 100644 index 000000000..4906e0e12 --- /dev/null +++ b/docs/podman-container-checkpoint.1.md @@ -0,0 +1,30 @@ +% podman-container-checkpoint(1) + +## NAME +podman\-container\-checkpoint - Checkpoints one or more running containers + +## SYNOPSIS +**podman container checkpoint** [*options*] *container* ... + +## DESCRIPTION +Checkpoints all the processes in one or more containers. You may use container IDs or names as input. + +## OPTIONS +**-k**, **--keep** + +Keep all temporary log and statistics files created by CRIU during checkpointing. These files +are not deleted if checkpointing fails for further debugging. If checkpointing succeeds these +files are theoretically not needed, but if these files are needed Podman can keep the files +for further analysis. + +## EXAMPLE + +podman container checkpoint mywebserver + +podman container checkpoint 860a4b23 + +## SEE ALSO +podman(1), podman-container-restore(1) + +## HISTORY +September 2018, Originally compiled by Adrian Reber diff --git a/docs/podman-container-restore.1.md b/docs/podman-container-restore.1.md new file mode 100644 index 000000000..6360bccb0 --- /dev/null +++ b/docs/podman-container-restore.1.md @@ -0,0 +1,37 @@ +% podman-container-restore(1) + +## NAME +podman\-container\-restore - Restores one or more containers from a checkpoint + +## SYNOPSIS +**podman container restore** [*options*] *container* ... + +## DESCRIPTION +Restores a container from a checkpoint. You may use container IDs or names as input. 
+ +## OPTIONS +**-k**, **--keep** + +Keep all temporary log and statistics files created by CRIU during +checkpointing as well as restoring. These files are not deleted if restoring +fails for further debugging. If restoring succeeds these files are +theoretically not needed, but if these files are needed Podman can keep the +files for further analysis. This includes the checkpoint directory with all +files created during checkpointing. The size required by the checkpoint +directory is roughly the same as the amount of memory required by the +processes in the checkpointed container. + +Without the **-k**, **--keep** option the checkpoint will be consumed and cannot be used +again. + +## EXAMPLE + +podman container restore mywebserver + +podman container restore 860a4b23 + +## SEE ALSO +podman(1), podman-container-checkpoint(1) + +## HISTORY +September 2018, Originally compiled by Adrian Reber diff --git a/docs/podman-container.1.md b/docs/podman-container.1.md index bbc325823..eac3343d5 100644 --- a/docs/podman-container.1.md +++ b/docs/podman-container.1.md @@ -14,6 +14,7 @@ The container command allows you to manage containers | Command | Man Page | Description | | ------- | --------------------------------------------------- | ---------------------------------------------------------------------------- | | attach | [podman-attach(1)](podman-attach.1.md) | Attach to a running container. | +| checkpoint | [podman-container-checkpoint(1)](podman-container-checkpoint.1.md) | Checkpoints one or more containers. | | cleanup | [podman-container-cleanup(1)](podman-container-cleanup.1.md) | Cleanup containers network and mountpoints. | | commit | [podman-commit(1)](podman-commit.1.md) | Create new image based on the changed container. | | create | [podman-create(1)](podman-create.1.md) | Create a new container. | @@ -29,6 +30,7 @@ The container command allows you to manage containers | port | [podman-port(1)](podman-port.1.md) | List port mappings for the container. 
| | refresh | [podman-refresh(1)](podman-container-refresh.1.md) | Refresh the state of all containers | | restart | [podman-restart(1)](podman-restart.1.md) | Restart one or more containers. | +| restore | [podman-container-restore(1)](podman-container-restore.1.md) | Restores one or more containers from a checkpoint. | | rm | [podman-rm(1)](podman-rm.1.md) | Remove one or more containers. | | run | [podman-run(1)](podman-run.1.md) | Run a command in a container. | | start | [podman-start(1)](podman-start.1.md) | Starts one or more containers. | -- cgit v1.2.3-54-g00ecf From 5bafafc7eb31152b12627a1bf07657013ee01027 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 19 Sep 2018 17:11:17 +0000 Subject: tutorial: add checkpoint/restore to tutorial Signed-off-by: Adrian Reber --- docs/tutorials/podman_tutorial.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/tutorials/podman_tutorial.md b/docs/tutorials/podman_tutorial.md index a866b8eed..152d65a59 100644 --- a/docs/tutorials/podman_tutorial.md +++ b/docs/tutorials/podman_tutorial.md @@ -157,6 +157,28 @@ $ sudo podman top 101 31889 31873 0 09:21 ? 00:00:00 nginx: worker process ``` +### Checkpointing the container +Checkpointing a container stops the container while writing the state of all processes in the container to disk. +With this a container can later be restored and continue running at exactly the same point in time as the +checkpoint. This capability requires CRIU 3.11 or later installed on the system. +To checkpoint the container use: +```console +$ sudo podman container checkpoint <container_id> +``` + +### Restoring the container +Restoring a container is only possible for a previously checkpointed container. The restored container will +continue to run at exactly the same point in time it was checkpointed. +To restore the container use: +```console +$ sudo podman container restore <container_id> +``` + +After being restored, the container will answer requests again as it did before checkpointing. 
+```console +# curl http://<IP_address>:8080 +``` + ### Stopping the container To stop the httpd container: ```console -- cgit v1.2.3-54-g00ecf From 5246238e7efbf9280e07f45df9ee14ecc3404f20 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 20 Sep 2018 13:46:50 +0000 Subject: tests: add checkpoint/restore test Signed-off-by: Adrian Reber --- Dockerfile | 13 ++++ test/e2e/checkpoint_test.go | 129 ++++++++++++++++++++++++++++++++++++++++++ test/e2e/libpod_suite_test.go | 37 ++++++++++++ 3 files changed, 179 insertions(+) create mode 100644 test/e2e/checkpoint_test.go diff --git a/Dockerfile b/Dockerfile index 749c5edb9..2c43cb046 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,8 @@ RUN apt-get update && apt-get install -y \ libaio-dev \ libcap-dev \ libfuse-dev \ + libnet-dev \ + libnl-3-dev \ libostree-dev \ libprotobuf-dev \ libprotobuf-c0-dev \
@@ -110,6 +112,17 @@ RUN set -x \
     && go get -u github.com/mailru/easyjson/... \
     && install -D -m 755 "$GOPATH"/bin/easyjson /usr/bin/
 
+# Install criu (pinned to CRIU_COMMIT so that image builds are reproducible)
+ENV CRIU_COMMIT 584cbe4643c3fc7dc901ff08bf923ca0fe7326f9
+RUN set -x \
+    && cd /tmp \
+    && git clone https://github.com/checkpoint-restore/criu.git \
+    && cd criu \
+    && git checkout -q "$CRIU_COMMIT" \
+    && make \
+    && install -D -m 755 criu/criu /usr/sbin/ \
+    && rm -rf /tmp/criu
+
 # Install cni config
 #RUN make install.cni
 RUN mkdir -p /etc/cni/net.d/
diff --git a/test/e2e/checkpoint_test.go b/test/e2e/checkpoint_test.go new file mode 100644 index 000000000..6c5d891a0 --- /dev/null +++ b/test/e2e/checkpoint_test.go @@ -0,0 +1,129 @@ +package integration + +import ( + "fmt" + "os" + + . "github.com/onsi/ginkgo" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Podman checkpoint", func() { + var ( + tempdir string + err error + podmanTest PodmanTest + ) + + BeforeEach(func() { + tempdir, err = CreateTempDirInTempDir() + if err != nil { + os.Exit(1) + } + podmanTest = PodmanCreate(tempdir) + podmanTest.RestoreAllArtifacts() + // At least CRIU 3.11 is needed + skip, err := podmanTest.isCriuAtLeast(31100) + if err != nil || skip { + Skip("CRIU missing or too old.") + } + }) + + AfterEach(func() { + podmanTest.Cleanup() + f := CurrentGinkgoTestDescription() + timedResult := fmt.Sprintf("Test: %s completed in %f seconds", f.TestText, f.Duration.Seconds()) + GinkgoWriter.Write([]byte(timedResult)) + }) + + It("podman checkpoint bogus container", func() { + session := podmanTest.Podman([]string{"container", "checkpoint", "foobar"}) + session.WaitWithDefaultTimeout() + Expect(session.ExitCode()).To(Not(Equal(0))) + }) + + It("podman restore bogus container", func() { + session := podmanTest.Podman([]string{"container", "restore", "foobar"}) + session.WaitWithDefaultTimeout() + Expect(session.ExitCode()).To(Not(Equal(0))) + }) + + It("podman checkpoint a running container by id", func() { + // CRIU does not work with seccomp correctly on RHEL7 + session := podmanTest.Podman([]string{"run", "-it", "--security-opt", "seccomp=unconfined", "-d", ALPINE, "top"}) + session.WaitWithDefaultTimeout() + Expect(session.ExitCode()).To(Equal(0)) + cid := session.OutputToString() + + result := podmanTest.Podman([]string{"container", "checkpoint", cid}) + result.WaitWithDefaultTimeout() + + Expect(result.ExitCode()).To(Equal(0)) + Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0)) + Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Exited")) + + result = podmanTest.Podman([]string{"container", "restore", cid}) + result.WaitWithDefaultTimeout() + + Expect(result.ExitCode()).To(Equal(0)) + Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1)) + 
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Up")) + }) + + It("podman checkpoint a running container by name", func() { + session := podmanTest.Podman([]string{"run", "-it", "--security-opt", "seccomp=unconfined", "--name", "test_name", "-d", ALPINE, "top"}) + session.WaitWithDefaultTimeout() + Expect(session.ExitCode()).To(Equal(0)) + + result := podmanTest.Podman([]string{"container", "checkpoint", "test_name"}) + result.WaitWithDefaultTimeout() + + Expect(result.ExitCode()).To(Equal(0)) + Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0)) + Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Exited")) + + result = podmanTest.Podman([]string{"container", "restore", "test_name"}) + result.WaitWithDefaultTimeout() + + Expect(result.ExitCode()).To(Equal(0)) + Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1)) + Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Up")) + }) + + It("podman pause a checkpointed container by id", func() { + session := podmanTest.Podman([]string{"run", "-it", "--security-opt", "seccomp=unconfined", "-d", ALPINE, "top"}) + session.WaitWithDefaultTimeout() + Expect(session.ExitCode()).To(Equal(0)) + cid := session.OutputToString() + + result := podmanTest.Podman([]string{"container", "checkpoint", cid}) + result.WaitWithDefaultTimeout() + + Expect(result.ExitCode()).To(Equal(0)) + Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0)) + Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Exited")) + + result = podmanTest.Podman([]string{"pause", cid}) + result.WaitWithDefaultTimeout() + + Expect(result.ExitCode()).To(Equal(125)) + Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0)) + Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Exited")) + + result = podmanTest.Podman([]string{"container", "restore", cid}) + result.WaitWithDefaultTimeout() + Expect(result.ExitCode()).To(Equal(0)) + Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1)) + + 
result = podmanTest.Podman([]string{"rm", cid}) + result.WaitWithDefaultTimeout() + Expect(result.ExitCode()).To(Equal(125)) + Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1)) + + result = podmanTest.Podman([]string{"rm", "-f", cid}) + result.WaitWithDefaultTimeout() + Expect(result.ExitCode()).To(Equal(0)) + Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0)) + + }) +}) diff --git a/test/e2e/libpod_suite_test.go b/test/e2e/libpod_suite_test.go index d521632d7..a032b0e88 100644 --- a/test/e2e/libpod_suite_test.go +++ b/test/e2e/libpod_suite_test.go @@ -2,6 +2,7 @@ package integration import ( "bufio" + "bytes" "context" "encoding/json" "fmt" @@ -64,6 +65,7 @@ type PodmanTest struct { TempDir string CgroupManager string Host HostOS + CriuBinary string } // HostOS is a simple struct for the test os @@ -164,6 +166,7 @@ func PodmanCreate(tempDir string) PodmanTest { runCBinary = "/usr/bin/runc" } + criuBinary := "/usr/sbin/criu" CNIConfigDir := "/etc/cni/net.d" p := PodmanTest{ @@ -179,6 +182,7 @@ func PodmanCreate(tempDir string) PodmanTest { TempDir: tempDir, CgroupManager: cgroupManager, Host: host, + CriuBinary: criuBinary, } // Setup registries.conf ENV variable
@@ -678,6 +682,39 @@ func (p *PodmanTest) setRegistriesConfigEnv(b []byte) {
 	ioutil.WriteFile(outfile, b, 0644)
 }
 
+// isCriuAtLeast runs "criu -V" and compares the reported version, encoded as
+// major*10000 + minor*100 + patch, with version. NOTE: despite its name the
+// result is a "skip" flag: it is false when the installed CRIU is at least
+// version and true when it is older, which is how the caller uses it.
+// An error is returned if criu cannot be run or no major.minor version can
+// be parsed from its output.
+func (p *PodmanTest) isCriuAtLeast(version int) (bool, error) {
+	cmd := exec.Command(p.CriuBinary, "-V")
+	var out bytes.Buffer
+	cmd.Stdout = &out
+	if err := cmd.Run(); err != nil {
+		return false, err
+	}
+
+	// Sscanf stops with an error if the output carries no patch level, but it
+	// has filled in major and minor by then, so only those two are required.
+	var x, y, z int
+	if n, _ := fmt.Sscanf(out.String(), "Version: %d.%d.%d", &x, &y, &z); n < 2 {
+		return false, fmt.Errorf("unable to parse CRIU version from %q", out.String())
+	}
+
+	if strings.Contains(out.String(), "GitID") {
+		// If CRIU is built from git it contains a git ID.
+		// If that is the case, increase minor by one as this
+		// could mean we are running a development version.
+		y = y + 1
+	}
+
+	parsedVersion := x*10000 + y*100 + z
+
+	return parsedVersion < version, nil
+}
+
 func resetRegistriesConfigEnv() {
 	os.Setenv("REGISTRIES_CONFIG_PATH", "")
 }
-- cgit v1.2.3-54-g00ecf From dc987af0b0146ec5fd2026ca8db403806c3425df Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 20 Sep 2018 18:02:01 +0000 Subject: completions: add checkpoint/restore completions Signed-off-by: Adrian Reber --- completions/bash/podman | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/completions/bash/podman b/completions/bash/podman index f63bf4469..604a25f5d 100644 --- a/completions/bash/podman +++ b/completions/bash/podman @@ -87,6 +87,10 @@ __podman_complete_containers_all() { __podman_complete_containers "$@" --all } +__podman_complete_containers_created() { + __podman_complete_containers "$@" --all --filter status=created +} + __podman_complete_containers_running() { __podman_complete_containers "$@" --filter status=running } @@ -710,6 +714,24 @@ _podman_container_attach() { _podman_attach } +_podman_container_checkpoint() { + local options_with_args=" + --help -h + " + local boolean_options=" + --keep + -k + " + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __podman_complete_containers_running + ;; + esac +} + _podman_container_commit() { _podman_commit } @@ -770,6 +792,24 @@ _podman_container_restart() { _podman_restart } +_podman_container_restore() { + local options_with_args=" + --help -h + " + local boolean_options=" + --keep + -k + " + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __podman_complete_containers_created + ;; + esac +} + _podman_container_rm() { _podman_rm } @@ -817,6 +857,7 @@ _podman_container() { " subcommands=" attach + checkpoint commit create diff @@ -831,6 +872,7 @@ 
_podman_container() { port refresh restart + restore rm run start -- cgit v1.2.3-54-g00ecf