From 2bb2425704cc7181c5eb924400b351b3a2d9a592 Mon Sep 17 00:00:00 2001 From: Matthew Heon Date: Wed, 19 Aug 2020 16:15:35 -0400 Subject: Move pod jobs to parallel execution Make Podman pod operations that do not involve starting containers (which needs to be done in a specific order) use the same parallel operation code we use to make `podman stop` on large numbers of containers fast. We were previously stopping containers in a pod serially, which could take up to the timeout (default 15 seconds) for each container - stopping 100 containers that do not respond to SIGTERM would take 25 minutes. To do this, refactor the parallel operation code a bit to remove its dependency on libpod (damn circular import restrictions...) and use parallel functions that just re-use the standard container API operations - maximizes code reuse (previously each pod handler had a separate implementation of the container function it performed). This is a bit of a palate cleanser after fighting CI for two days - nice to be able to return to a land of sanity. Signed-off-by: Matthew Heon --- pkg/parallel/ctr/ctr.go | 40 +++++++++++++++++++++++++++++ pkg/parallel/parallel.go | 30 ++++++++++++++++++++++ pkg/parallel/parallel_linux.go | 57 ------------------------------------------ 3 files changed, 70 insertions(+), 57 deletions(-) create mode 100644 pkg/parallel/ctr/ctr.go delete mode 100644 pkg/parallel/parallel_linux.go (limited to 'pkg/parallel') diff --git a/pkg/parallel/ctr/ctr.go b/pkg/parallel/ctr/ctr.go new file mode 100644 index 000000000..e8c1292b8 --- /dev/null +++ b/pkg/parallel/ctr/ctr.go @@ -0,0 +1,40 @@ +package ctr + +import ( + "context" + + "github.com/containers/podman/v2/libpod" + "github.com/containers/podman/v2/pkg/parallel" + "github.com/sirupsen/logrus" +) + +// ContainerOp performs the given function on the given set of +// containers, using a number of parallel threads. 
+// If no error is returned, each container specified in ctrs will have an entry
+// in the resulting map; containers with no error will be set to nil.
+func ContainerOp(ctx context.Context, ctrs []*libpod.Container, applyFunc func(*libpod.Container) error) (map[*libpod.Container]error, error) {
+	// Each job reports on its own channel, so the maps here are only
+	// touched by this goroutine and need no lock. ctrs should not repeat
+	// a container: a repeat replaces the earlier channel in errMap and
+	// that job's result is never received.
+	var (
+		errMap = make(map[*libpod.Container]<-chan error, len(ctrs))
+	)
+
+	for _, ctr := range ctrs {
+		c := ctr // per-iteration copy captured by the closure below
+		logrus.Debugf("Starting parallel job on container %s", c.ID())
+		errChan := parallel.Enqueue(ctx, func() error {
+			return applyFunc(c)
+		})
+		errMap[c] = errChan
+	}
+
+	finalErr := make(map[*libpod.Container]error, len(errMap))
+	for ctr, errChan := range errMap {
+		err := <-errChan // blocks until this container's job has finished
+		finalErr[ctr] = err
+	}
+
+	return finalErr, nil
+}
diff --git a/pkg/parallel/parallel.go b/pkg/parallel/parallel.go
index c9e4da50d..4da7e0f89 100644
--- a/pkg/parallel/parallel.go
+++ b/pkg/parallel/parallel.go
@@ -1,6 +1,7 @@
 package parallel
 
 import (
+	"context"
 	"sync"
 
 	"github.com/pkg/errors"
@@ -42,3 +43,32 @@ func SetMaxThreads(threads uint) error {
 func GetMaxThreads() uint {
 	return numThreads
 }
+
+// Enqueue adds a single function to the parallel jobs queue. This function will
+// be run when an unused thread is available.
+// Returns a receive-only error channel that will return the error (if any) from
+// the provided function fn when fn has finished executing. The channel will be
+// closed after this.
+func Enqueue(ctx context.Context, fn func() error) <-chan error {
+	retChan := make(chan error, 1) // buffered so the sends below never block if the caller stops receiving
+
+	go func() {
+		jobControlLock.RLock()
+		defer jobControlLock.RUnlock()
+
+		defer close(retChan)
+
+		if err := jobControl.Acquire(ctx, 1); err != nil {
+			retChan <- errors.Wrapf(err, "error acquiring job control semaphore")
+			return
+		}
+
+		err := fn()
+
+		jobControl.Release(1)
+
+		retChan <- err
+	}()
+
+	return retChan
+}
diff --git a/pkg/parallel/parallel_linux.go b/pkg/parallel/parallel_linux.go
deleted file mode 100644
index 442db1502..000000000
--- a/pkg/parallel/parallel_linux.go
+++ /dev/null
@@ -1,57 +0,0 @@
-package parallel
-
-import (
-	"context"
-	"sync"
-
-	"github.com/containers/podman/v2/libpod"
-	"github.com/pkg/errors"
-	"github.com/sirupsen/logrus"
-)
-
-// ContainerOp performs the given function on the given set of
-// containers, using a number of parallel threads.
-// If no error is returned, each container specified in ctrs will have an entry
-// in the resulting map; containers with no error will be set to nil.
-func ContainerOp(ctx context.Context, ctrs []*libpod.Container, applyFunc func(*libpod.Container) error) (map[*libpod.Container]error, error) {
-	jobControlLock.RLock()
-	defer jobControlLock.RUnlock()
-
-	// We could use a sync.Map but given Go's lack of generic I'd rather
-	// just use a lock on a normal map...
-	// The expectation is that most of the time is spent in applyFunc
-	// anyways.
- var ( - errMap = make(map[*libpod.Container]error) - errLock sync.Mutex - allDone sync.WaitGroup - ) - - for _, ctr := range ctrs { - // Block until a thread is available - if err := jobControl.Acquire(ctx, 1); err != nil { - return nil, errors.Wrapf(err, "error acquiring job control semaphore") - } - - allDone.Add(1) - - c := ctr - go func() { - logrus.Debugf("Launching job on container %s", c.ID()) - - err := applyFunc(c) - errLock.Lock() - errMap[c] = err - errLock.Unlock() - - allDone.Done() - jobControl.Release(1) - }() - } - - allDone.Wait() - - return errMap, nil -} - -// TODO: Add an Enqueue() function that returns a promise -- cgit v1.2.3-54-g00ecf