From 4004f646cd7a99f86b2098587c256071a89a4dc7 Mon Sep 17 00:00:00 2001 From: Matthew Heon Date: Sun, 23 Feb 2020 13:25:12 -0500 Subject: Add basic deadlock detection for container start/remove We can easily tell if we're going to deadlock by comparing lock IDs before actually taking the lock. Add a few checks for this in common places where deadlocks might occur. This does not yet cover pod operations, where detection is more difficult (and costly) due to the number of locks being involved being higher than 2. Also, add some error wrapping on the Podman side, so we can tell people to use `system renumber` when it occurs. Signed-off-by: Matthew Heon --- cmd/podman/rm.go | 5 +++++ libpod/container_internal.go | 3 +++ libpod/define/errors.go | 5 +++++ libpod/runtime_ctr.go | 3 +++ libpod/runtime_volume.go | 3 --- libpod/runtime_volume_linux.go | 3 +++ pkg/adapter/containers.go | 13 +++++++++++++ 7 files changed, 32 insertions(+), 3 deletions(-) diff --git a/cmd/podman/rm.go b/cmd/podman/rm.go index e69565e95..644b0ef76 100644 --- a/cmd/podman/rm.go +++ b/cmd/podman/rm.go @@ -4,8 +4,10 @@ import ( "fmt" "github.com/containers/libpod/cmd/podman/cliconfig" + "github.com/containers/libpod/libpod/define" "github.com/containers/libpod/pkg/adapter" "github.com/pkg/errors" + "github.com/sirupsen/logrus" "github.com/spf13/cobra" ) @@ -77,6 +79,9 @@ func rmCmd(c *cliconfig.RmValues) error { if len(failures) > 0 { for _, err := range failures { + if errors.Cause(err) == define.ErrWillDeadlock { + logrus.Errorf("Potential deadlock detected - please run 'podman system renumber' to resolve") + } exitCode = setExitCode(err) } } diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 11f9721dc..ff43bfc8f 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -1401,6 +1401,9 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) return nil, errors.Wrapf(err, "error retrieving named volume %s for container %s", v.Name, c.ID()) } + if vol.config.LockID == c.config.LockID { + return nil, errors.Wrapf(define.ErrWillDeadlock, "container %s and volume %s share lock ID %d", c.ID(), vol.Name(), c.config.LockID) + } vol.lock.Lock() defer vol.lock.Unlock() if vol.needsMount() { diff --git a/libpod/define/errors.go b/libpod/define/errors.go index 523062866..b79cf08dc 100644 --- a/libpod/define/errors.go +++ b/libpod/define/errors.go @@ -61,6 +61,11 @@ var ( // the user. ErrDetach = utils.ErrDetach + // ErrWillDeadlock indicates that the requested operation will cause a + // deadlock. This is usually caused by upgrade issues, and is resolved + // by renumbering the locks. + ErrWillDeadlock = errors.New("deadlock due to lock mismatch") + // ErrNoCgroups indicates that the container does not have its own // CGroup. ErrNoCgroups = errors.New("this container does not have a cgroup") diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index 3ad09f27c..39284026c 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -412,6 +412,9 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force bool, } // Lock the pod while we're removing container + if pod.config.LockID == c.config.LockID { + return errors.Wrapf(define.ErrWillDeadlock, "container %s and pod %s share lock ID %d", c.ID(), pod.ID(), c.config.LockID) + } pod.lock.Lock() defer pod.lock.Unlock() if err := pod.updatePod(); err != nil { diff --git a/libpod/runtime_volume.go b/libpod/runtime_volume.go index 835dccf9c..efc3c5bd9 100644 --- a/libpod/runtime_volume.go +++ b/libpod/runtime_volume.go @@ -36,9 +36,6 @@ func (r *Runtime) RemoveVolume(ctx context.Context, v *Volume, force bool) error } } - v.lock.Lock() - defer v.lock.Unlock() - return r.removeVolume(ctx, v, force) } diff --git a/libpod/runtime_volume_linux.go b/libpod/runtime_volume_linux.go index 037cf4cc2..e9cfda9d4 100644 --- a/libpod/runtime_volume_linux.go +++ b/libpod/runtime_volume_linux.go @@ -124,6 +124,9 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool) error return define.ErrVolumeRemoved } + v.lock.Lock() + defer v.lock.Unlock() + // Update volume status to pick up a potential removal from state if err := v.update(); err != nil { return err diff --git a/pkg/adapter/containers.go b/pkg/adapter/containers.go index 78057e3f9..08e19edb8 100644 --- a/pkg/adapter/containers.go +++ b/pkg/adapter/containers.go @@ -469,6 +469,10 @@ func (r *LocalRuntime) Run(ctx context.Context, c *cliconfig.RunValues, exitCode logrus.Debugf("unable to remove container %s after failing to start and attach to it", ctr.ID()) } } + if errors.Cause(err) == define.ErrWillDeadlock { + logrus.Debugf("Deadlock error: %v", err) + return define.ExitCode(err), errors.Errorf("attempting to start container %s would cause a deadlock; please run 'podman system renumber' to resolve", ctr.ID()) + } return define.ExitCode(err), err } @@ -702,6 +706,11 @@ func (r *LocalRuntime) Start(ctx context.Context, c *cliconfig.StartValues, sigP return exitCode, nil } + if errors.Cause(err) == define.ErrWillDeadlock { + logrus.Debugf("Deadlock error: %v", err) + return define.ExitCode(err), errors.Errorf("attempting to start container %s would cause a deadlock; please run 'podman system renumber' to resolve", ctr.ID()) + } + if ctrRunning { return 0, err } @@ -735,6 +744,10 @@ func (r *LocalRuntime) Start(ctx context.Context, c *cliconfig.StartValues, sigP if lastError != nil { fmt.Fprintln(os.Stderr, lastError) } + if errors.Cause(err) == define.ErrWillDeadlock { + lastError = errors.Wrapf(err, "please run 'podman system renumber' to resolve deadlocks") + continue + } lastError = errors.Wrapf(err, "unable to start container %q", container) continue } -- cgit v1.2.3-54-g00ecf