From 2bb2425704cc7181c5eb924400b351b3a2d9a592 Mon Sep 17 00:00:00 2001
From: Matthew Heon <matthew.heon@pm.me>
Date: Wed, 19 Aug 2020 16:15:35 -0400
Subject: Move pod jobs to parallel execution

Make Podman pod operations that do not involve starting
containers (which needs to be done in a specific order) use the
same parallel operation code we use to make `podman stop` on
large numbers of containers fast. We were previously stopping
containers in a pod serially, which could take up to the timeout
(default 15 seconds) for each container - stopping 100 containers
that do not respond to SIGTERM would take 25 minutes.

To do this, refactor the parallel operation code a bit to remove
its dependency on libpod (damn circular import restrictions...)
and use parallel functions that just re-use the standard
container API operations - maximizes code reuse (previously each
pod handler had a separate implementation of the container
function it performed).

This is a bit of a palate cleanser after fighting CI for two
days - nice to be able to return to a land of sanity.

Signed-off-by: Matthew Heon <matthew.heon@pm.me>
---
 pkg/parallel/ctr/ctr.go        | 40 +++++++++++++++++++++++++++++
 pkg/parallel/parallel.go       | 30 ++++++++++++++++++++++
 pkg/parallel/parallel_linux.go | 57 ------------------------------------------
 3 files changed, 70 insertions(+), 57 deletions(-)
 create mode 100644 pkg/parallel/ctr/ctr.go
 delete mode 100644 pkg/parallel/parallel_linux.go

(limited to 'pkg/parallel')

diff --git a/pkg/parallel/ctr/ctr.go b/pkg/parallel/ctr/ctr.go
new file mode 100644
index 000000000..e8c1292b8
--- /dev/null
+++ b/pkg/parallel/ctr/ctr.go
@@ -0,0 +1,40 @@
+package ctr
+
+import (
+	"context"
+
+	"github.com/containers/podman/v2/libpod"
+	"github.com/containers/podman/v2/pkg/parallel"
+	"github.com/sirupsen/logrus"
+)
+
+// ContainerOp runs applyFunc against every container in ctrs, handing each
+// call to parallel.Enqueue so that the calls can execute concurrently.
+// The returned map has one entry per distinct container pointer in ctrs; the
+// value is the error received from that container's job (nil on success).
+// The error return value is always nil in this implementation.
+func ContainerOp(ctx context.Context, ctrs []*libpod.Container, applyFunc func(*libpod.Container) error) (map[*libpod.Container]error, error) {
+	// Phase one: queue a job per container and remember its result channel.
+	// Nothing is received here, so every job is queued before we block.
+	pending := make(map[*libpod.Container]<-chan error)
+
+	for i := range ctrs {
+		// target is a per-iteration variable, so each closure below captures
+		// its own container rather than sharing loop state across jobs.
+		target := ctrs[i]
+		logrus.Debugf("Starting parallel job on container %s", target.ID())
+		pending[target] = parallel.Enqueue(ctx, func() error {
+			return applyFunc(target)
+		})
+	}
+
+	// Phase two: wait for each job in turn and record its outcome. Both maps
+	// are only touched by this goroutine, so no locking is required.
+	results := make(map[*libpod.Container]error)
+	for target, done := range pending {
+		results[target] = <-done
+	}
+
+	return results, nil
+}
diff --git a/pkg/parallel/parallel.go b/pkg/parallel/parallel.go
index c9e4da50d..4da7e0f89 100644
--- a/pkg/parallel/parallel.go
+++ b/pkg/parallel/parallel.go
@@ -1,6 +1,7 @@
 package parallel
 
 import (
+	"context"
 	"sync"
 
 	"github.com/pkg/errors"
@@ -42,3 +43,32 @@ func SetMaxThreads(threads uint) error {
 func GetMaxThreads() uint {
 	return numThreads
 }
+
+// Enqueue adds a single function to the parallel jobs queue. fn is run in its
+// own goroutine once a slot on the job control semaphore has been acquired.
+// Returns a receive-only channel that yields exactly one value: fn's result
+// (nil on success), or a wrapped error if the semaphore could not be acquired
+// (e.g. ctx was cancelled), in which case fn is never run. It is then closed.
+func Enqueue(ctx context.Context, fn func() error) <-chan error {
+	// Buffered so the single send below can never block: a caller that drops
+	// the channel must not strand this goroutine while it holds the read lock.
+	retChan := make(chan error, 1)
+
+	go func() {
+		jobControlLock.RLock()
+		defer jobControlLock.RUnlock()
+		defer close(retChan)
+
+		if err := jobControl.Acquire(ctx, 1); err != nil {
+			retChan <- errors.Wrapf(err, "error acquiring job control semaphore")
+			return
+		}
+
+		err := fn()
+		// Give the slot back before reporting, so it is free once fn is done.
+		jobControl.Release(1)
+		retChan <- err
+	}()
+
+	return retChan
+}
diff --git a/pkg/parallel/parallel_linux.go b/pkg/parallel/parallel_linux.go
deleted file mode 100644
index 442db1502..000000000
--- a/pkg/parallel/parallel_linux.go
+++ /dev/null
@@ -1,57 +0,0 @@
-package parallel
-
-import (
-	"context"
-	"sync"
-
-	"github.com/containers/podman/v2/libpod"
-	"github.com/pkg/errors"
-	"github.com/sirupsen/logrus"
-)
-
-// ContainerOp performs the given function on the given set of
-// containers, using a number of parallel threads.
-// If no error is returned, each container specified in ctrs will have an entry
-// in the resulting map; containers with no error will be set to nil.
-func ContainerOp(ctx context.Context, ctrs []*libpod.Container, applyFunc func(*libpod.Container) error) (map[*libpod.Container]error, error) {
-	jobControlLock.RLock()
-	defer jobControlLock.RUnlock()
-
-	// We could use a sync.Map but given Go's lack of generic I'd rather
-	// just use a lock on a normal map...
-	// The expectation is that most of the time is spent in applyFunc
-	// anyways.
-	var (
-		errMap  = make(map[*libpod.Container]error)
-		errLock sync.Mutex
-		allDone sync.WaitGroup
-	)
-
-	for _, ctr := range ctrs {
-		// Block until a thread is available
-		if err := jobControl.Acquire(ctx, 1); err != nil {
-			return nil, errors.Wrapf(err, "error acquiring job control semaphore")
-		}
-
-		allDone.Add(1)
-
-		c := ctr
-		go func() {
-			logrus.Debugf("Launching job on container %s", c.ID())
-
-			err := applyFunc(c)
-			errLock.Lock()
-			errMap[c] = err
-			errLock.Unlock()
-
-			allDone.Done()
-			jobControl.Release(1)
-		}()
-	}
-
-	allDone.Wait()
-
-	return errMap, nil
-}
-
-// TODO: Add an Enqueue() function that returns a promise
-- 
cgit v1.2.3-54-g00ecf