aboutsummaryrefslogtreecommitdiff
path: root/libpod
diff options
context:
space:
mode:
Diffstat (limited to 'libpod')
-rw-r--r--libpod/boltdb_state.go175
-rw-r--r--libpod/boltdb_state_internal.go22
-rw-r--r--libpod/container_api.go97
-rw-r--r--libpod/container_internal.go52
-rw-r--r--libpod/define/errors.go4
-rw-r--r--libpod/events.go3
-rw-r--r--libpod/oci_conmon_linux.go24
-rw-r--r--libpod/state.go9
8 files changed, 307 insertions, 79 deletions
diff --git a/libpod/boltdb_state.go b/libpod/boltdb_state.go
index c3db6152a..e5219ab8c 100644
--- a/libpod/boltdb_state.go
+++ b/libpod/boltdb_state.go
@@ -5,8 +5,10 @@ import (
"fmt"
"net"
"os"
+ "strconv"
"strings"
"sync"
+ "time"
"github.com/containers/common/libnetwork/types"
"github.com/containers/podman/v4/libpod/define"
@@ -63,6 +65,13 @@ type BoltState struct {
// initially created the database. This must match for any further instances
// that access the database, to ensure that state mismatches with
// containers/storage do not occur.
+// - exitCodeBucket/exitCodeTimeStampBucket: (#14559) exit codes must be part
+// of the database to resolve a previous race condition when one process waits
+// for the exit file to be written and another process removes it along with
+// the container during auto-removal. The same race would happen trying to
+// read the exit code from the containers bucket. Hence, exit codes go into
+// their own bucket. To avoid the rather expensive JSON (un)marshaling, we
+// have two buckets: one for the exit codes, the other for the timestamps.
// NewBoltState creates a new bolt-backed state database
func NewBoltState(path string, runtime *Runtime) (State, error) {
@@ -98,6 +107,8 @@ func NewBoltState(path string, runtime *Runtime) (State, error) {
allVolsBkt,
execBkt,
runtimeConfigBkt,
+ exitCodeBkt,
+ exitCodeTimeStampBkt,
}
// Does the DB need an update?
@@ -150,6 +161,10 @@ func (s *BoltState) Refresh() error {
return define.ErrDBClosed
}
+ if err := s.PruneContainerExitCodes(); err != nil {
+ return err
+ }
+
db, err := s.getDBCon()
if err != nil {
return err
@@ -1341,6 +1356,166 @@ func (s *BoltState) GetContainerConfig(id string) (*ContainerConfig, error) {
return config, nil
}
+// AddContainerExitCode adds the exit code for the specified container to the database.
+func (s *BoltState) AddContainerExitCode(id string, exitCode int32) error {
+ db, err := s.getDBCon()
+ if err != nil {
+ return err
+ }
+ defer s.deferredCloseDBCon(db)
+
+ rawID := []byte(id)
+ rawExitCode := []byte(strconv.Itoa(int(exitCode)))
+ rawTimeStamp, err := time.Now().MarshalText()
+ if err != nil {
+ return fmt.Errorf("marshaling exit-code time stamp: %w", err)
+ }
+
+ return db.Update(func(tx *bolt.Tx) error {
+ exitCodeBucket, err := getExitCodeBucket(tx)
+ if err != nil {
+ return err
+ }
+ timeStampBucket, err := getExitCodeTimeStampBucket(tx)
+ if err != nil {
+ return err
+ }
+
+ if err := exitCodeBucket.Put(rawID, rawExitCode); err != nil {
+ return fmt.Errorf("adding exit code of container %s to DB: %w", id, err)
+ }
+ if err := timeStampBucket.Put(rawID, rawTimeStamp); err != nil {
+ if rmErr := exitCodeBucket.Delete(rawID); rmErr != nil {
+ logrus.Errorf("Removing exit code of container %s from DB: %v", id, rmErr)
+ }
+ return fmt.Errorf("adding exit-code time stamp of container %s to DB: %w", id, err)
+ }
+
+ return nil
+ })
+}
+
+// GetContainerExitCode returns the exit code for the specified container.
+func (s *BoltState) GetContainerExitCode(id string) (int32, error) {
+ db, err := s.getDBCon()
+ if err != nil {
+ return -1, err
+ }
+ defer s.deferredCloseDBCon(db)
+
+ rawID := []byte(id)
+ result := int32(-1)
+ return result, db.View(func(tx *bolt.Tx) error {
+ exitCodeBucket, err := getExitCodeBucket(tx)
+ if err != nil {
+ return err
+ }
+
+ rawExitCode := exitCodeBucket.Get(rawID)
+ if rawExitCode == nil {
+ return fmt.Errorf("getting exit code of container %s from DB: %w", id, define.ErrNoSuchExitCode)
+ }
+
+ exitCode, err := strconv.Atoi(string(rawExitCode))
+ if err != nil {
+ return fmt.Errorf("converting raw exit code %v of container %s: %w", rawExitCode, id, err)
+ }
+
+ result = int32(exitCode)
+ return nil
+ })
+}
+
+// GetContainerExitCodeTimeStamp returns the time stamp when the exit code of
+// the specified container was added to the database.
+func (s *BoltState) GetContainerExitCodeTimeStamp(id string) (*time.Time, error) {
+ db, err := s.getDBCon()
+ if err != nil {
+ return nil, err
+ }
+ defer s.deferredCloseDBCon(db)
+
+ rawID := []byte(id)
+ var result time.Time
+ return &result, db.View(func(tx *bolt.Tx) error {
+ timeStampBucket, err := getExitCodeTimeStampBucket(tx)
+ if err != nil {
+ return err
+ }
+
+ rawTimeStamp := timeStampBucket.Get(rawID)
+ if rawTimeStamp == nil {
+ return fmt.Errorf("getting exit-code time stamp of container %s from DB: %w", id, define.ErrNoSuchExitCode)
+ }
+
+ if err := result.UnmarshalText(rawTimeStamp); err != nil {
+ return fmt.Errorf("converting raw time stamp %v of container %s from DB: %w", rawTimeStamp, id, err)
+ }
+
+ return nil
+ })
+}
+
+// PrunExitCodes removes exit codes older than 5 minutes.
+func (s *BoltState) PruneContainerExitCodes() error {
+ db, err := s.getDBCon()
+ if err != nil {
+ return err
+ }
+ defer s.deferredCloseDBCon(db)
+
+ threshold := time.Minute * 5
+ return db.View(func(tx *bolt.Tx) error {
+ timeStampBucket, err := getExitCodeTimeStampBucket(tx)
+ if err != nil {
+ return err
+ }
+
+ return timeStampBucket.ForEach(func(rawID, rawTimeStamp []byte) error {
+ var timeStamp time.Time
+ if err := timeStamp.UnmarshalText(rawTimeStamp); err != nil {
+ return fmt.Errorf("converting raw time stamp %v of container %s from DB: %w", rawTimeStamp, string(rawID), err)
+ }
+ if time.Since(timeStamp) > threshold {
+ // Since the DB connection is locked, pass it
+ // to remove the exit codes to avoid race
+ // conditions.
+ return s.removeContainerExitCode(rawID, db)
+ }
+ return nil
+ })
+ })
+}
+
+// removeContainerExitCode removes the exit code and time stamp of the specified container.
+func (s *BoltState) removeContainerExitCode(rawID []byte, db *bolt.DB) error {
+ return db.Update(func(tx *bolt.Tx) error {
+ exitCodeBucket, err := getExitCodeBucket(tx)
+ if err != nil {
+ return err
+ }
+ timeStampBucket, err := getExitCodeTimeStampBucket(tx)
+ if err != nil {
+ return err
+ }
+
+ var finalErr error
+ if err := exitCodeBucket.Delete(rawID); err != nil {
+ finalErr = fmt.Errorf("removing exit code of container %s from DB: %w", string(rawID), err)
+ }
+ if err := timeStampBucket.Delete(rawID); err != nil {
+ err = fmt.Errorf("removing exit-code time stamp of container %s from DB: %w", string(rawID), err)
+ if finalErr != nil {
+ logrus.Error(err)
+ } else {
+ finalErr = err
+ }
+ }
+
+ return finalErr
+ })
+}
+
// AddExecSession adds an exec session to the state.
func (s *BoltState) AddExecSession(ctr *Container, session *ExecSession) error {
if !s.valid {
diff --git a/libpod/boltdb_state_internal.go b/libpod/boltdb_state_internal.go
index d6f035af9..edba78d6d 100644
--- a/libpod/boltdb_state_internal.go
+++ b/libpod/boltdb_state_internal.go
@@ -29,6 +29,9 @@ const (
aliasesName = "aliases"
runtimeConfigName = "runtime-config"
+ exitCodeName = "exit-code"
+ exitCodeTimeStampName = "exit-code-time-stamp"
+
configName = "config"
stateName = "state"
dependenciesName = "dependencies"
@@ -65,6 +68,9 @@ var (
volDependenciesBkt = []byte(volCtrDependencies)
networksBkt = []byte(networksName)
+ exitCodeBkt = []byte(exitCodeName)
+ exitCodeTimeStampBkt = []byte(exitCodeTimeStampName)
+
configKey = []byte(configName)
stateKey = []byte(stateName)
netNSKey = []byte(netNSName)
@@ -362,6 +368,22 @@ func getRuntimeConfigBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
return bkt, nil
}
+func getExitCodeBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
+ bkt := tx.Bucket(exitCodeBkt)
+ if bkt == nil {
+ return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code container bucket not found in DB")
+ }
+ return bkt, nil
+}
+
+func getExitCodeTimeStampBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
+ bkt := tx.Bucket(exitCodeTimeStampBkt)
+ if bkt == nil {
+ return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code time stamp bucket not found in DB")
+ }
+ return bkt, nil
+}
+
func (s *BoltState) getContainerConfigFromDB(id []byte, config *ContainerConfig, ctrsBkt *bolt.Bucket) error {
ctrBkt := ctrsBkt.Bucket(id)
if ctrBkt == nil {
diff --git a/libpod/container_api.go b/libpod/container_api.go
index b064d3528..c14fe95b0 100644
--- a/libpod/container_api.go
+++ b/libpod/container_api.go
@@ -2,6 +2,7 @@ package libpod
import (
"context"
+ "fmt"
"io"
"io/ioutil"
"net/http"
@@ -490,41 +491,84 @@ func (c *Container) RemoveArtifact(name string) error {
// Wait blocks until the container exits and returns its exit code.
func (c *Container) Wait(ctx context.Context) (int32, error) {
- return c.WaitWithInterval(ctx, DefaultWaitInterval)
+ return c.WaitForExit(ctx, DefaultWaitInterval)
}
-// WaitWithInterval blocks until the container to exit and returns its exit
-// code. The argument is the interval at which checks the container's status.
-func (c *Container) WaitWithInterval(ctx context.Context, waitTimeout time.Duration) (int32, error) {
+// WaitForExit blocks until the container exits and returns its exit code. The
+// argument is the interval at which checks the container's status.
+func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) (int32, error) {
if !c.valid {
return -1, define.ErrCtrRemoved
}
- exitFile, err := c.exitFilePath()
- if err != nil {
- return -1, err
- }
- chWait := make(chan error, 1)
+ id := c.ID()
+ var conmonTimer time.Timer
+ conmonTimerSet := false
- go func() {
- <-ctx.Done()
- chWait <- define.ErrCanceled
- }()
+ getExitCode := func() (bool, int32, error) {
+ containerRemoved := false
+ if !c.batched {
+ c.lock.Lock()
+ defer c.lock.Unlock()
+ }
- for {
- // ignore errors here (with exception of cancellation), it is only used to avoid waiting
- // too long.
- _, e := WaitForFile(exitFile, chWait, waitTimeout)
- if e == define.ErrCanceled {
- return -1, define.ErrCanceled
+ if err := c.syncContainer(); err != nil {
+ if !errors.Is(err, define.ErrNoSuchCtr) {
+ return false, -1, err
+ }
+ containerRemoved = true
+ }
+
+ // If conmon is not alive anymore set a timer to make sure
+ // we're returning even if conmon has forcefully been killed.
+ if !conmonTimerSet && !containerRemoved {
+ conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
+ switch {
+ case errors.Is(err, define.ErrNoSuchCtr):
+ containerRemoved = true
+ case err != nil:
+ return false, -1, err
+ case !conmonAlive:
+ timerDuration := time.Second * 20
+ conmonTimer = *time.NewTimer(timerDuration)
+ conmonTimerSet = true
+ }
+ }
+
+ if !containerRemoved {
+ // If conmon is dead for more than $timerDuration or if the
+ // container has exited properly, try to look up the exit code.
+ select {
+ case <-conmonTimer.C:
+ logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id)
+ default:
+ if !c.ensureState(define.ContainerStateExited, define.ContainerStateConfigured) {
+ return false, -1, nil
+ }
+ }
}
- stopped, code, err := c.isStopped()
+ exitCode, err := c.runtime.state.GetContainerExitCode(id)
+ if err != nil {
+ return true, -1, err
+ }
+
+ return true, exitCode, nil
+ }
+
+ for {
+ hasExited, exitCode, err := getExitCode()
+ if hasExited {
+ return exitCode, err
+ }
if err != nil {
return -1, err
}
- if stopped {
- return code, nil
+ select {
+ case <-ctx.Done():
+ return -1, fmt.Errorf("waiting for exit code of container %s canceled", id)
+ default:
+ time.Sleep(pollInterval)
}
}
}
@@ -551,11 +595,12 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou
wantedStates := make(map[define.ContainerStatus]bool, len(conditions))
for _, condition := range conditions {
- if condition == define.ContainerStateStopped || condition == define.ContainerStateExited {
+ switch condition {
+ case define.ContainerStateExited, define.ContainerStateStopped:
waitForExit = true
- continue
+ default:
+ wantedStates[condition] = true
}
- wantedStates[condition] = true
}
trySend := func(code int32, err error) {
@@ -572,7 +617,7 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou
go func() {
defer wg.Done()
- code, err := c.WaitWithInterval(ctx, waitTimeout)
+ code, err := c.WaitForExit(ctx, waitTimeout)
trySend(code, err)
}()
}
diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index ce48987f6..ae61298f3 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -219,7 +219,7 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error {
// Write an event for the container's death
c.newContainerExitedEvent(c.state.ExitCode)
- return nil
+ return c.runtime.state.AddContainerExitCode(c.ID(), c.state.ExitCode)
}
func (c *Container) shouldRestart() bool {
@@ -784,20 +784,6 @@ func (c *Container) getArtifactPath(name string) string {
return filepath.Join(c.config.StaticDir, artifactsDir, name)
}
-// Used with Wait() to determine if a container has exited
-func (c *Container) isStopped() (bool, int32, error) {
- if !c.batched {
- c.lock.Lock()
- defer c.lock.Unlock()
- }
- err := c.syncContainer()
- if err != nil {
- return true, -1, err
- }
-
- return !c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused, define.ContainerStateStopping), c.state.ExitCode, nil
-}
-
// save container state to the database
func (c *Container) save() error {
if err := c.runtime.state.SaveContainer(c); err != nil {
@@ -1282,13 +1268,6 @@ func (c *Container) stop(timeout uint) error {
}
}
- // Check if conmon is still alive.
- // If it is not, we won't be getting an exit file.
- conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
- if err != nil {
- return err
- }
-
// Set the container state to "stopping" and unlock the container
// before handing it over to conmon to unblock other commands. #8501
// demonstrates nicely that a high stop timeout will block even simple
@@ -1341,21 +1320,18 @@ func (c *Container) stop(timeout uint) error {
}
c.newContainerEvent(events.Stop)
-
- c.state.PID = 0
- c.state.ConmonPID = 0
c.state.StoppedByUser = true
+ conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
+ if err != nil {
+ return err
+ }
if !conmonAlive {
- // Conmon is dead, so we can't expect an exit code.
- c.state.ExitCode = -1
- c.state.FinishedTime = time.Now()
- c.state.State = define.ContainerStateStopped
- if err := c.save(); err != nil {
- logrus.Errorf("Saving container %s status: %v", c.ID(), err)
+ if err := c.checkExitFile(); err != nil {
+ return err
}
- return errors.Wrapf(define.ErrConmonDead, "container %s conmon process missing, cannot retrieve exit code", c.ID())
+ return c.save()
}
if err := c.save(); err != nil {
@@ -1939,6 +1915,18 @@ func (c *Container) cleanup(ctx context.Context) error {
}
}
+ // Prune the exit codes of other container during clean up.
+ // Since Podman is no daemon, we have to clean them up somewhere.
+ // Cleanup seems like a good place as it's not performance
+ // critical.
+ if err := c.runtime.state.PruneContainerExitCodes(); err != nil {
+ if lastError == nil {
+ lastError = err
+ } else {
+ logrus.Errorf("Pruning container exit codes: %v", err)
+ }
+ }
+
return lastError
}
diff --git a/libpod/define/errors.go b/libpod/define/errors.go
index f5a7c73e5..9757a85b1 100644
--- a/libpod/define/errors.go
+++ b/libpod/define/errors.go
@@ -24,6 +24,10 @@ var (
// not exist.
ErrNoSuchExecSession = errors.New("no such exec session")
+ // ErrNoSuchExitCode indicates that the requested container exit code
+ // does not exist.
+ ErrNoSuchExitCode = errors.New("no such exit code")
+
// ErrDepExists indicates that the current object has dependencies and
// cannot be removed before them.
ErrDepExists = errors.New("dependency exists")
diff --git a/libpod/events.go b/libpod/events.go
index f09d8402a..021b3b53c 100644
--- a/libpod/events.go
+++ b/libpod/events.go
@@ -151,6 +151,9 @@ func (r *Runtime) GetEvents(ctx context.Context, filters []string) ([]*events.Ev
// GetLastContainerEvent takes a container name or ID and an event status and returns
// the last occurrence of the container event
func (r *Runtime) GetLastContainerEvent(ctx context.Context, nameOrID string, containerEvent events.Status) (*events.Event, error) {
+ // FIXME: events should be read in reverse order!
+ // https://github.com/containers/podman/issues/14579
+
// check to make sure the event.Status is valid
if _, err := events.StringToStatus(containerEvent.String()); err != nil {
return nil, err
diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go
index b07660937..d417626dc 100644
--- a/libpod/oci_conmon_linux.go
+++ b/libpod/oci_conmon_linux.go
@@ -264,11 +264,6 @@ func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *Conta
// status, but will instead only check for the existence of the conmon exit file
// and update state to stopped if it exists.
func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error {
- exitFile, err := r.ExitFilePath(ctr)
- if err != nil {
- return err
- }
-
runtimeDir, err := util.GetRuntimeDir()
if err != nil {
return err
@@ -340,22 +335,10 @@ func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error {
// Only grab exit status if we were not already stopped
// If we were, it should already be in the database
if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped {
- var fi os.FileInfo
- chWait := make(chan error)
- defer close(chWait)
-
- _, err := WaitForFile(exitFile, chWait, time.Second*5)
- if err == nil {
- fi, err = os.Stat(exitFile)
+ if _, err := ctr.Wait(context.Background()); err != nil {
+ logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err)
}
- if err != nil {
- ctr.state.ExitCode = -1
- ctr.state.FinishedTime = time.Now()
- logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err)
- return nil
- }
-
- return ctr.handleExitFile(exitFile, fi)
+ return nil
}
// Handle ContainerStateStopping - keep it unless the container
@@ -1166,7 +1149,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
}).Debugf("running conmon: %s", r.conmonPath)
cmd := exec.Command(r.conmonPath, args...)
- cmd.Dir = ctr.bundlePath()
cmd.SysProcAttr = &syscall.SysProcAttr{
Setpgid: true,
}
diff --git a/libpod/state.go b/libpod/state.go
index 471023769..4fbd3c302 100644
--- a/libpod/state.go
+++ b/libpod/state.go
@@ -111,6 +111,15 @@ type State interface {
// Return a container config from the database by full ID
GetContainerConfig(id string) (*ContainerConfig, error)
+ // Add the exit code for the specified container to the database.
+ AddContainerExitCode(id string, exitCode int32) error
+
+ // Return the exit code for the specified container.
+ GetContainerExitCode(id string) (int32, error)
+
+ // Remove exit codes older than 5 minutes.
+ PruneContainerExitCodes() error
+
// Add creates a reference to an exec session in the database.
// The container the exec session is attached to will be recorded.
// The container state will not be modified.