From 4eff0c8cf284a6007122aec731e4d97059750166 Mon Sep 17 00:00:00 2001
From: Valentin Rothberg
Date: Wed, 13 Apr 2022 16:21:21 +0200
Subject: pod: add exit policies

Add the notion of an "exit policy" to a pod. This policy controls the
behaviour when the last container of a pod exits. Initially, there are
two policies:

 - "continue" : the pod continues running. This is the default policy
   when creating a pod.

 - "stop" : stop the pod when the last container exits. This is the
   default behaviour for `play kube`.

In order to implement the deferred stop of a pod, add a worker queue to
the libpod runtime. The queue picks up work items and, in this case,
helps resolve deadlocks that would otherwise occur if we attempted to
stop a pod during container cleanup (a standalone sketch of this
pattern follows the patch).

Note that the default restart policy of `play kube` is "Always". Hence,
in order to really solve #13464, the YAML files must set a custom
restart policy; the tests use "OnFailure".

Fixes: #13464

Signed-off-by: Valentin Rothberg
---
 libpod/container_internal.go | 42 ++++++++++++++++++++++++++++++++++++++++++
 libpod/define/pod_inspect.go |  2 ++
 libpod/options.go            | 18 ++++++++++++++++++
 libpod/pod.go                |  4 ++++
 libpod/pod_api.go            | 51 +++++++++++++++++++++++++++++++++++++++++++++++
 libpod/runtime.go            | 14 ++++++++++++++
 libpod/runtime_worker.go     | 41 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 172 insertions(+)
 create mode 100644 libpod/runtime_worker.go

(limited to 'libpod')

diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index 5c6719bdf..7494eb3ec 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -1939,9 +1939,51 @@ func (c *Container) cleanup(ctx context.Context) error {
 		}
 	}
 
+	if err := c.stopPodIfNeeded(context.Background()); err != nil {
+		if lastError == nil {
+			lastError = err
+		} else {
+			logrus.Errorf("Stopping pod of container %s: %v", c.ID(), err)
+		}
+	}
+
 	return lastError
 }
 
+// If the container is part of a pod where only the infra container remains
+// running, attempt to stop the pod.
+func (c *Container) stopPodIfNeeded(ctx context.Context) error {
+	if c.config.Pod == "" {
+		return nil
+	}
+
+	pod, err := c.runtime.state.Pod(c.config.Pod)
+	if err != nil {
+		return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), c.config.Pod, err)
+	}
+
+	switch pod.config.ExitPolicy {
+	case config.PodExitPolicyContinue:
+		return nil
+
+	case config.PodExitPolicyStop:
+		// Use the runtime's work queue to stop the pod. This resolves
+		// a number of scenarios where we'd otherwise run into
+		// deadlocks. For instance, during `pod stop`, the pod has
+		// already been locked.
+		// The work queue is a simple means of avoiding this without having
+		// to worry about future changes that may introduce more deadlock scenarios.
+		c.runtime.queueWork(func() {
+			if err := pod.stopIfOnlyInfraRemains(ctx, c.ID()); err != nil {
+				if !errors.Is(err, define.ErrNoSuchPod) {
+					logrus.Errorf("Checking if infra needs to be stopped: %v", err)
+				}
+			}
+		})
+	}
+	return nil
+}
+
 // delete deletes the container and runs any configured poststop
 // hooks.
 func (c *Container) delete(ctx context.Context) error {
diff --git a/libpod/define/pod_inspect.go b/libpod/define/pod_inspect.go
index e85a660a1..219ffade2 100644
--- a/libpod/define/pod_inspect.go
+++ b/libpod/define/pod_inspect.go
@@ -19,6 +19,8 @@ type InspectPodData struct {
 	// CreateCommand is the full command plus arguments of the process the
 	// container has been created with.
 	CreateCommand []string `json:"CreateCommand,omitempty"`
+	// ExitPolicy of the pod.
+	ExitPolicy string `json:"ExitPolicy,omitempty"`
 	// State represents the current state of the pod.
 	State string `json:"State"`
 	// Hostname is the hostname that the pod will set.
diff --git a/libpod/options.go b/libpod/options.go
index 98eb45e76..9b83cb76a 100644
--- a/libpod/options.go
+++ b/libpod/options.go
@@ -1843,6 +1843,24 @@ func WithPodName(name string) PodCreateOption {
 	}
 }
 
+// WithPodExitPolicy sets the exit policy of the pod.
+func WithPodExitPolicy(policy string) PodCreateOption {
+	return func(pod *Pod) error {
+		if pod.valid {
+			return define.ErrPodFinalized
+		}
+
+		parsed, err := config.ParsePodExitPolicy(policy)
+		if err != nil {
+			return err
+		}
+
+		pod.config.ExitPolicy = parsed
+
+		return nil
+	}
+}
+
 // WithPodHostname sets the hostname of the pod.
 func WithPodHostname(hostname string) PodCreateOption {
 	return func(pod *Pod) error {
diff --git a/libpod/pod.go b/libpod/pod.go
index 237c42901..2211d5be7 100644
--- a/libpod/pod.go
+++ b/libpod/pod.go
@@ -6,6 +6,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/containers/common/pkg/config"
 	"github.com/containers/podman/v4/libpod/define"
 	"github.com/containers/podman/v4/libpod/lock"
 	"github.com/opencontainers/runtime-spec/specs-go"
@@ -70,6 +71,9 @@ type PodConfig struct {
 	// container has been created with.
 	CreateCommand []string `json:"CreateCommand,omitempty"`
 
+	// The pod's exit policy.
+	ExitPolicy config.PodExitPolicy `json:"ExitPolicy,omitempty"`
+
 	// ID of the pod's lock
 	LockID uint32 `json:"lockID"`
 }
diff --git a/libpod/pod_api.go b/libpod/pod_api.go
index ba30d878e..73b28822b 100644
--- a/libpod/pod_api.go
+++ b/libpod/pod_api.go
@@ -2,6 +2,7 @@ package libpod
 
 import (
 	"context"
+	"fmt"
 
 	"github.com/containers/common/pkg/cgroups"
 	"github.com/containers/podman/v4/libpod/define"
@@ -134,6 +135,10 @@ func (p *Pod) StopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m
 	p.lock.Lock()
 	defer p.lock.Unlock()
 
+	return p.stopWithTimeout(ctx, cleanup, timeout)
+}
+
+func (p *Pod) stopWithTimeout(ctx context.Context, cleanup bool, timeout int) (map[string]error, error) {
 	if !p.valid {
 		return nil, define.ErrPodRemoved
 	}
@@ -195,6 +200,51 @@ func (p *Pod) StopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m
 	return nil, nil
 }
 
+// Stops the pod if only the infra container remains running.
+func (p *Pod) stopIfOnlyInfraRemains(ctx context.Context, ignoreID string) error {
+	p.lock.Lock()
+	defer p.lock.Unlock()
+
+	infraID := ""
+
+	if p.HasInfraContainer() {
+		infra, err := p.infraContainer()
+		if err != nil {
+			return err
+		}
+		infraID = infra.ID()
+	}
+
+	allCtrs, err := p.runtime.state.PodContainers(p)
+	if err != nil {
+		return err
+	}
+
+	for _, ctr := range allCtrs {
+		if ctr.ID() == infraID || ctr.ID() == ignoreID {
+			continue
+		}
+
+		state, err := ctr.State()
+		if err != nil {
+			return fmt.Errorf("getting state of container %s: %w", ctr.ID(), err)
+		}
+
+		switch state {
+		case define.ContainerStateExited,
+			define.ContainerStateRemoving,
+			define.ContainerStateStopping,
+			define.ContainerStateUnknown:
+			continue
+		default:
+			return nil
+		}
+	}
+
+	_, err = p.stopWithTimeout(ctx, true, -1)
+	return err
+}
+
 // Cleanup cleans up all containers within a pod that have stopped.
 // All containers are cleaned up independently. An error with one container will
 // not prevent other containers being cleaned up.
@@ -661,6 +711,7 @@ func (p *Pod) Inspect() (*define.InspectPodData, error) {
 		Namespace:        p.Namespace(),
 		Created:          p.CreatedTime(),
 		CreateCommand:    p.config.CreateCommand,
+		ExitPolicy:       string(p.config.ExitPolicy),
 		State:            podState,
 		Hostname:         p.config.Hostname,
 		Labels:           p.Labels(),
diff --git a/libpod/runtime.go b/libpod/runtime.go
index 6c2323d88..d5daa2f8a 100644
--- a/libpod/runtime.go
+++ b/libpod/runtime.go
@@ -86,6 +86,10 @@ type Runtime struct {
 	libimageEventsShutdown chan bool
 	lockManager            lock.Manager
 
+	// Worker
+	workerShutdown chan bool
+	workerChannel  chan func()
+
 	// syslog describes whenever logrus should log to the syslog as well.
 	// Note that the syslog hook will be enabled early in cmd/podman/syslog_linux.go
 	// This bool is just needed so that we can set it for netavark interface.
@@ -597,6 +601,8 @@ func makeRuntime(runtime *Runtime) (retErr error) {
 		}
 	}
 
+	runtime.startWorker()
+
 	// Mark the runtime as valid - ready to be used, cannot be modified
 	// further
 	runtime.valid = true
@@ -817,6 +823,14 @@ func (r *Runtime) Shutdown(force bool) error {
 		return define.ErrRuntimeStopped
 	}
 
+	if r.workerShutdown != nil {
+		// Signal the worker routine to shut down. The routine will
+		// process all pending work items and then read from the
+		// channel; we're blocked until all work items have been
+		// processed.
+		r.workerShutdown <- true
+	}
+
 	r.valid = false
 
 	// Shutdown all containers if --force is given
diff --git a/libpod/runtime_worker.go b/libpod/runtime_worker.go
new file mode 100644
index 000000000..ca44a27f7
--- /dev/null
+++ b/libpod/runtime_worker.go
@@ -0,0 +1,41 @@
+package libpod
+
+import (
+	"time"
+)
+
+func (r *Runtime) startWorker() {
+	if r.workerChannel == nil {
+		r.workerChannel = make(chan func(), 1)
+		r.workerShutdown = make(chan bool)
+	}
+	go func() {
+		for {
+			// Make sure to process all queued work items before
+			// checking if we're about to shut down.
+			for len(r.workerChannel) > 0 {
+				w := <-r.workerChannel
+				w()
+			}
+
+			select {
+			// We'll read from the shutdown channel only when all
+			// items above have been processed.
+			//
+			// (*Runtime).Shutdown() will block until the
+			// item is read.
+			case <-r.workerShutdown:
+				return
+
+			default:
+				time.Sleep(100 * time.Millisecond)
+			}
+		}
+	}()
+}
+
+func (r *Runtime) queueWork(f func()) {
+	go func() {
+		r.workerChannel <- f
+	}()
+}
--
cgit v1.2.3-54-g00ecf
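
For readers who want to see the worker-queue mechanism in isolation, the
following is a minimal, self-contained Go sketch of the same pattern used
by runtime_worker.go above: a buffered work channel, a drain loop, and a
shutdown channel that is only read once all pending items have been
processed. The `worker` type and the `newWorker`/`Queue`/`Shutdown` names
are invented for this example and are not part of the libpod API.

package main

import (
	"fmt"
	"time"
)

// worker owns a buffered work queue and a shutdown channel, mirroring
// the workerChannel/workerShutdown fields added to the libpod Runtime.
type worker struct {
	work     chan func()
	shutdown chan bool
}

// newWorker starts the background routine that drains the queue.
func newWorker() *worker {
	w := &worker{
		work:     make(chan func(), 1),
		shutdown: make(chan bool),
	}
	go func() {
		for {
			// Process all queued work items before checking
			// whether we have been asked to shut down.
			for len(w.work) > 0 {
				f := <-w.work
				f()
			}
			select {
			case <-w.shutdown:
				// Shutdown() blocks until this read happens, so
				// all previously drained items are done by now.
				return
			default:
				time.Sleep(10 * time.Millisecond)
			}
		}
	}()
	return w
}

// Queue hands a work item to the background routine without blocking the
// caller, analogous to (*Runtime).queueWork.
func (w *worker) Queue(f func()) {
	go func() { w.work <- f }()
}

// Shutdown signals the routine and blocks until the signal is consumed.
func (w *worker) Shutdown() {
	w.shutdown <- true
}

func main() {
	w := newWorker()
	w.Queue(func() { fmt.Println("deferred pod stop would run here") })
	// Queueing is asynchronous, so give the item a moment to reach the
	// channel before shutting down.
	time.Sleep(50 * time.Millisecond)
	w.Shutdown()
}

The polling loop with a short sleep keeps the implementation simple; a
plain select over both channels would risk consuming the shutdown signal
while a work item is still sitting in the queue, which is exactly what the
inner drain loop avoids.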