Move pod jobs to parallel execution

Make Podman pod operations that do not involve starting containers (which must be done in a specific order) use the same parallel operation code we use to make `podman stop` on large numbers of containers fast. We were previously stopping containers in a pod serially, which could take up to the timeout (default 15 seconds) for each container - stopping 100 containers that do not respond to SIGTERM would take 25 minutes.

To do this, refactor the parallel operation code a bit to remove its dependency on libpod (damn circular import restrictions...) and use parallel functions that just re-use the standard container API operations. This maximizes code reuse (previously each pod handler had a separate implementation of the container function it performed).

This is a bit of a palate cleanser after fighting CI for two days - nice to be able to return to a land of sanity.

Signed-off-by: Matthew Heon <matthew.heon@pm.me>
author: Matthew Heon <matthew.heon@pm.me> 2020-08-19 16:15:35 -0400
committer: Matthew Heon <mheon@redhat.com> 2020-10-07 10:00:11 -0400
commit: 2bb2425704cc7181c5eb924400b351b3a2d9a592 (patch)
tree: 0a850a365ef085fce496354a241649b7e779877a /pkg/parallel
parent: a7500e54a4646c7db477349e2530ac13df77b8fa (diff)
download: podman-2bb2425704cc7181c5eb924400b351b3a2d9a592.tar.gz
podman-2bb2425704cc7181c5eb924400b351b3a2d9a592.tar.bz2
podman-2bb2425704cc7181c5eb924400b351b3a2d9a592.zip
2 files changed, 44 insertions, 31 deletions
diff --git a/pkg/parallel/parallel_linux.go b/pkg/parallel/ctr/ctr.go
index 442db1502..e8c1292b8 100644
--- a/pkg/parallel/parallel_linux.go
+++ b/pkg/parallel/ctr/ctr.go
@@ -1,11 +1,10 @@
-package parallel
+package ctr
 
 import (
 	"context"
-	"sync"
 
 	"github.com/containers/podman/v2/libpod"
-	"github.com/pkg/errors"
+	"github.com/containers/podman/v2/pkg/parallel"
 	"github.com/sirupsen/logrus"
 )
 
@@ -14,44 +13,28 @@ import (
 // If no error is returned, each container specified in ctrs will have an entry
 // in the resulting map; containers with no error will be set to nil.
 func ContainerOp(ctx context.Context, ctrs []*libpod.Container, applyFunc func(*libpod.Container) error) (map[*libpod.Container]error, error) {
-	jobControlLock.RLock()
-	defer jobControlLock.RUnlock()
-
 	// We could use a sync.Map but given Go's lack of generic I'd rather
 	// just use a lock on a normal map...
 	// The expectation is that most of the time is spent in applyFunc
 	// anyways.
 	var (
-		errMap  = make(map[*libpod.Container]error)
-		errLock sync.Mutex
-		allDone sync.WaitGroup
+		errMap = make(map[*libpod.Container]<-chan error)
 	)
 
 	for _, ctr := range ctrs {
-		// Block until a thread is available
-		if err := jobControl.Acquire(ctx, 1); err != nil {
-			return nil, errors.Wrapf(err, "error acquiring job control semaphore")
-		}
-
-		allDone.Add(1)
-
 		c := ctr
-		go func() {
-			logrus.Debugf("Launching job on container %s", c.ID())
-
-			err := applyFunc(c)
-			errLock.Lock()
-			errMap[c] = err
-			errLock.Unlock()
-
-			allDone.Done()
-			jobControl.Release(1)
-		}()
+		logrus.Debugf("Starting parallel job on container %s", c.ID())
+		errChan := parallel.Enqueue(ctx, func() error {
+			return applyFunc(c)
+		})
+		errMap[c] = errChan
 	}
 
-	allDone.Wait()
+	finalErr := make(map[*libpod.Container]error)
+	for ctr, errChan := range errMap {
+		err := <-errChan
+		finalErr[ctr] = err
+	}
 
-	return errMap, nil
+	return finalErr, nil
 }
-
-// TODO: Add an Enqueue() function that returns a promise
diff --git a/pkg/parallel/parallel.go b/pkg/parallel/parallel.go
index c9e4da50d..4da7e0f89 100644
--- a/pkg/parallel/parallel.go
+++ b/pkg/parallel/parallel.go
@@ -1,6 +1,7 @@
 package parallel
 
 import (
+	"context"
 	"sync"
 
 	"github.com/pkg/errors"
@@ -42,3 +43,32 @@ func SetMaxThreads(threads uint) error {
 func GetMaxThreads() uint {
 	return numThreads
 }
+
+// Enqueue adds a single function to the parallel jobs queue. This function will
+// be run when an unused thread is available.
+// Returns a receive-only error channel that will return the error (if any) from
+// the provided function fn when fn has finished executing. The channel will be
+// closed after this.
+func Enqueue(ctx context.Context, fn func() error) <-chan error {
+	retChan := make(chan error)
+
+	go func() {
+		jobControlLock.RLock()
+		defer jobControlLock.RUnlock()
+
+		defer close(retChan)
+
+		if err := jobControl.Acquire(ctx, 1); err != nil {
+			retChan <- errors.Wrapf(err, "error acquiring job control semaphore")
+			return
+		}
+
+		err := fn()
+
+		jobControl.Release(1)
+
+		retChan <- err
+	}()
+
+	return retChan
+}
author	Matthew Heon <matthew.heon@pm.me>	2020-08-19 16:15:35 -0400
committer	Matthew Heon <mheon@redhat.com>	2020-10-07 10:00:11 -0400
commit	2bb2425704cc7181c5eb924400b351b3a2d9a592 (patch)
tree	0a850a365ef085fce496354a241649b7e779877a /pkg/parallel
parent	a7500e54a4646c7db477349e2530ac13df77b8fa (diff)
download	podman-2bb2425704cc7181c5eb924400b351b3a2d9a592.tar.gz podman-2bb2425704cc7181c5eb924400b351b3a2d9a592.tar.bz2 podman-2bb2425704cc7181c5eb924400b351b3a2d9a592.zip