diff options
author | OpenShift Merge Robot <openshift-merge-robot@users.noreply.github.com> | 2019-05-08 20:19:39 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-05-08 20:19:39 +0200 |
commit | 7b54ebb48fecb4205304586a57a3e6b9b96befa0 (patch) | |
tree | e5262050dec0b31c228f3ac577c49bb7d549a26b | |
parent | 64d1a357e8299c5fcecd4141091424419111cdee (diff) | |
parent | e9c78b411379fc0cfa245bc6b4b25b3bd84077a1 (diff) | |
download | podman-7b54ebb48fecb4205304586a57a3e6b9b96befa0.tar.gz podman-7b54ebb48fecb4205304586a57a3e6b9b96befa0.tar.bz2 podman-7b54ebb48fecb4205304586a57a3e6b9b96befa0.zip |
Merge pull request #3082 from mheon/pod_removal_reliability
Improve robustness of pod removal
-rw-r--r-- | libpod/runtime_pod_linux.go | 126 |
1 files changed, 81 insertions, 45 deletions
diff --git a/libpod/runtime_pod_linux.go b/libpod/runtime_pod_linux.go index 0011c771a..5867b1f87 100644 --- a/libpod/runtime_pod_linux.go +++ b/libpod/runtime_pod_linux.go @@ -149,10 +149,10 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool) } // Go through and lock all containers so we can operate on them all at once - dependencies := make(map[string][]string) for _, ctr := range ctrs { - ctr.lock.Lock() - defer ctr.lock.Unlock() + ctrLock := ctr.lock + ctrLock.Lock() + defer ctrLock.Unlock() // Sync all containers if err := ctr.syncContainer(); err != nil { @@ -177,23 +177,12 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool) if len(ctr.state.ExecSessions) != 0 && !force { return errors.Wrapf(ErrCtrStateInvalid, "pod %s contains container %s which has active exec sessions", p.ID(), ctr.ID()) } - - deps, err := r.state.ContainerInUse(ctr) - if err != nil { - return err - } - dependencies[ctr.ID()] = deps } - // Check if containers have dependencies - // If they do, and the dependencies are not in the pod, error - for ctr, deps := range dependencies { - for _, dep := range deps { - if _, ok := dependencies[dep]; !ok { - return errors.Wrapf(ErrCtrExists, "container %s depends on container %s not in pod %s", ctr, dep, p.ID()) - } - } - } + // We maintain the invariant that container dependencies must all exist + // within the container's pod. + // No need to check dependencies as such - we're removing all containers + // in the pod at once, no dependency issues. // First loop through all containers and stop them // Do not remove in this loop to ensure that we don't remove unless all @@ -220,18 +209,40 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool) } } - // Start removing containers - // We can remove containers even if they have dependencies now - // As we have guaranteed their dependencies are in the pod + // Remove all containers in the pod from the state. + if err := r.state.RemovePodContainers(p); err != nil { + return err + } + + var removalErr error + + // Clean up after our removed containers. + // Errors here are nonfatal - the containers have already been evicted. + // We'll do our best to clean up after them, but we have to keep going + // and remove the pod as well. + // From here until we remove the pod from the state, no error returns. for _, ctr := range ctrs { + // The container no longer exists in the state, mark invalid. + ctr.valid = false + + ctr.newContainerEvent(events.Remove) + // Clean up network namespace, cgroups, mounts if err := ctr.cleanup(ctx); err != nil { - return err + if removalErr == nil { + removalErr = err + } else { + logrus.Errorf("Unable to clean up container %s: %v", ctr.ID(), err) + } } // Stop container's storage if err := ctr.teardownStorage(); err != nil { - return err + if removalErr == nil { + removalErr = err + } else { + logrus.Errorf("Unable to tear down container %s storage: %v", ctr.ID(), err) + } } // Delete the container from runtime (only if we are not @@ -239,26 +250,24 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool) if ctr.state.State != ContainerStateConfigured && ctr.state.State != ContainerStateExited { if err := ctr.delete(ctx); err != nil { - return err + if removalErr == nil { + removalErr = err + } else { + logrus.Errorf("Unable to remove container %s from OCI runtime: %v", ctr.ID(), err) + } } } // Free the container's lock if err := ctr.lock.Free(); err != nil { - return err + if removalErr == nil { + removalErr = errors.Wrapf(err, "error freeing container %s lock", ctr.ID()) + } else { + logrus.Errorf("Unable to free container %s lock: %v", ctr.ID(), err) + } } } - // Remove containers from the state - if err := r.state.RemovePodContainers(p); err != nil { - return err - } - - // Mark containers invalid - for _, ctr := range ctrs { - ctr.valid = false - } - // Remove pod cgroup, if present if p.state.CgroupPath != "" { logrus.Debugf("Removing pod cgroup %s", p.state.CgroupPath) @@ -266,10 +275,11 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool) switch p.runtime.config.CgroupManager { case SystemdCgroupsManager: if err := deleteSystemdCgroup(p.state.CgroupPath); err != nil { - // The pod is already almost gone. - // No point in hard-failing if we fail - // this bit of cleanup. - logrus.Errorf("Error deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) + if removalErr == nil { + removalErr = errors.Wrapf(err, "error removing pod %s cgroup", p.ID()) + } else { + logrus.Errorf("Error deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) + } } case CgroupfsCgroupsManager: // Delete the cgroupfs cgroup @@ -280,34 +290,60 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool) conmonCgroupPath := filepath.Join(p.state.CgroupPath, "conmon") conmonCgroup, err := cgroups.Load(v1CGroups, cgroups.StaticPath(conmonCgroupPath)) if err != nil && err != cgroups.ErrCgroupDeleted { - return err + if removalErr == nil { + removalErr = errors.Wrapf(err, "error retrieving pod %s conmon cgroup", p.ID()) + } else { + logrus.Debugf("Error retrieving pod %s conmon cgroup %s: %v", p.ID(), conmonCgroupPath, err) + } } if err == nil { if err := conmonCgroup.Delete(); err != nil { - logrus.Errorf("Error deleting pod %s conmon cgroup %s: %v", p.ID(), conmonCgroupPath, err) + if removalErr == nil { + removalErr = errors.Wrapf(err, "error removing pod %s conmon cgroup", p.ID()) + } else { + logrus.Errorf("Error deleting pod %s conmon cgroup %s: %v", p.ID(), conmonCgroupPath, err) + } } } cgroup, err := cgroups.Load(v1CGroups, cgroups.StaticPath(p.state.CgroupPath)) if err != nil && err != cgroups.ErrCgroupDeleted { - return err + if removalErr == nil { + removalErr = errors.Wrapf(err, "error retrieving pod %s cgroup", p.ID()) + } else { + logrus.Errorf("Error retrieving pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) + } } if err == nil { if err := cgroup.Delete(); err != nil { - logrus.Errorf("Error deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) + if removalErr == nil { + removalErr = errors.Wrapf(err, "error removing pod %s cgroup", p.ID()) + } else { + logrus.Errorf("Error deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) + } } } default: - return errors.Wrapf(ErrInvalidArg, "unknown cgroups manager %s specified", p.runtime.config.CgroupManager) + // This should be caught much earlier, but let's still + // keep going so we make sure to evict the pod before + // ending up with an inconsistent state. + if removalErr == nil { + removalErr = errors.Wrapf(ErrInternal, "unrecognized cgroup manager %s when removing pod %s cgroups", p.runtime.config.CgroupManager, p.ID()) + } else { + logrus.Errorf("Unknown cgroups manager %s specified - cannot remove pod %s cgroup", p.runtime.config.CgroupManager, p.ID()) + } } } // Remove pod from state if err := r.state.RemovePod(p); err != nil { + if removalErr != nil { + logrus.Errorf("%v", removalErr) + } return err } // Mark pod invalid p.valid = false p.newPodEvent(events.Remove) - return nil + return removalErr } |