summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Heon <matthew.heon@pm.me>2021-09-21 14:24:23 -0400
committerMatthew Heon <matthew.heon@pm.me>2021-09-21 15:32:07 -0400
commit4ecbc7caae1726f0eb4646aa0bd94947eed86f90 (patch)
treeb186d39e81c75d9af078507a45d823702122b896
parentcd7b48198c38c5028540e85dc72dd3406f4318f0 (diff)
downloadpodman-4ecbc7caae1726f0eb4646aa0bd94947eed86f90.tar.gz
podman-4ecbc7caae1726f0eb4646aa0bd94947eed86f90.tar.bz2
podman-4ecbc7caae1726f0eb4646aa0bd94947eed86f90.zip
Add a backoff and retries to retrieving exited event
There's a potential race around extremely short-running containers and events with journald. Events may not be written for some time (small, but appreciable) after they are received, and as such we can fail to retrieve it if there is a sufficiently short time between us writing the event and trying to read it. Work around this by just retrying, with a 0.25 second delay between retries, up to 4 times. [NO TESTS NEEDED] because I have no idea how to reproduce this race in CI. Fixes #11633 Signed-off-by: Matthew Heon <matthew.heon@pm.me>
-rw-r--r--pkg/domain/infra/abi/containers.go55
1 files changed, 25 insertions, 30 deletions
diff --git a/pkg/domain/infra/abi/containers.go b/pkg/domain/infra/abi/containers.go
index dc5f7a0df..affed64d1 100644
--- a/pkg/domain/infra/abi/containers.go
+++ b/pkg/domain/infra/abi/containers.go
@@ -830,21 +830,7 @@ func (ic *ContainerEngine) ContainerStart(ctx context.Context, namesOrIds []stri
}
return reports, errors.Wrapf(err, "unable to start container %s", ctr.ID())
}
-
- if ecode, err := ctr.Wait(ctx); err != nil {
- if errors.Cause(err) == define.ErrNoSuchCtr {
- // Check events
- event, err := ic.Libpod.GetLastContainerEvent(ctx, ctr.ID(), events.Exited)
- if err != nil {
- logrus.Errorf("Cannot get exit code: %v", err)
- exitCode = define.ExecErrorCodeNotFound
- } else {
- exitCode = event.ContainerExitCode
- }
- }
- } else {
- exitCode = int(ecode)
- }
+ exitCode = ic.GetContainerExitCode(ctx, ctr)
reports = append(reports, &entities.ContainerStartReport{
Id: ctr.ID(),
RawInput: rawInput,
@@ -985,21 +971,7 @@ func (ic *ContainerEngine) ContainerRun(ctx context.Context, opts entities.Conta
report.ExitCode = define.ExitCode(err)
return &report, err
}
-
- if ecode, err := ctr.Wait(ctx); err != nil {
- if errors.Cause(err) == define.ErrNoSuchCtr {
- // Check events
- event, err := ic.Libpod.GetLastContainerEvent(ctx, ctr.ID(), events.Exited)
- if err != nil {
- logrus.Errorf("Cannot get exit code: %v", err)
- report.ExitCode = define.ExecErrorCodeNotFound
- } else {
- report.ExitCode = event.ContainerExitCode
- }
- }
- } else {
- report.ExitCode = int(ecode)
- }
+ report.ExitCode = ic.GetContainerExitCode(ctx, ctr)
if opts.Rm && !ctr.ShouldRestart(ctx) {
if err := ic.Libpod.RemoveContainer(ctx, ctr, false, true); err != nil {
if errors.Cause(err) == define.ErrNoSuchCtr ||
@@ -1013,6 +985,29 @@ func (ic *ContainerEngine) ContainerRun(ctx context.Context, opts entities.Conta
return &report, nil
}
+func (ic *ContainerEngine) GetContainerExitCode(ctx context.Context, ctr *libpod.Container) int {
+ exitCode, err := ctr.Wait(ctx)
+ if err == nil {
+ return int(exitCode)
+ }
+ if errors.Cause(err) != define.ErrNoSuchCtr {
+ logrus.Errorf("Could not retrieve exit code: %v", err)
+ return define.ExecErrorCodeNotFound
+ }
+ // Make 4 attempt with 0.25s backoff between each for 1 second total
+ var event *events.Event
+ for i := 0; i < 4; i++ {
+ event, err = ic.Libpod.GetLastContainerEvent(ctx, ctr.ID(), events.Exited)
+ if err != nil {
+ time.Sleep(250 * time.Millisecond)
+ continue
+ }
+ return int(event.ContainerExitCode)
+ }
+ logrus.Errorf("Could not retrieve exit code from event: %v", err)
+ return define.ExecErrorCodeNotFound
+}
+
func (ic *ContainerEngine) ContainerLogs(ctx context.Context, containers []string, options entities.ContainerLogsOptions) error {
if options.StdoutWriter == nil && options.StderrWriter == nil {
return errors.New("no io.Writer set for container logs")