author    Valentin Rothberg <vrothberg@redhat.com>  2022-06-10 12:38:28 +0200
committer Matthew Heon <mheon@redhat.com>           2022-06-23 09:11:57 -0400
commit    30e7cbccc1942d4f8e3b4f489b9f33f30dec7233 (patch)
tree      532d323a187caf4309b0ec0f5a5ace46574f27d0 /libpod/boltdb_state_internal.go
parent    15188dce0566ffcba9e7da6fbde69625f49e0e16 (diff)
libpod: fix wait and exit-code logic
This commit addresses three intertwined bugs to fix an issue when using Gitlab runner on Podman. The three bug fixes are not split into separate commits as tests won't pass otherwise; avoidable noise when bisecting future issues.

1) Podman conflated states: even when asked to wait for the `exited` state, Podman returned as soon as a container transitioned to `stopped`. The issue surfaced as failing Gitlab tests [1] because `conmon`'s buffers had not (yet) been emptied when attaching to a container right after a wait. The race window was extremely narrow, and I only managed to reproduce it with the Gitlab runner [1] unit tests.

2) The clearer separation between `exited` and `stopped` revealed a race condition predating the changes. If a container is configured for autoremoval (e.g., via `run --rm`), the "run" process competes with the "cleanup" process running in the background. The window of the race condition was sufficiently large that the "cleanup" process could remove the container and storage before the "run" process read the exit code, so the "run" process waited indefinitely.

Address the exit-code race condition by recording exit codes in the main libpod database. Exit codes can now be read from the database. When waiting for a container to exit, Podman first waits for the container to transition to `exited` and then queries the database for its exit code. Outdated exit codes are pruned during cleanup (i.e., not performance critical) and when refreshing the database after a reboot. An exit code is considered outdated when it is older than 5 minutes.

While the race condition predates this change, the waiting process has apparently always been fast enough to catch the exit code due to issue 1): `exited` and `stopped` were conflated. The waiting process hence caught the exit code after the container transitioned to `stopped` but before it `exited` and got removed.

3) With 1) and 2), Podman now waits for a container to properly transition to the `exited` state. Some tests did not pass after 1) and 2), which revealed the third bug: `conmon` was executed with its working directory pointing to the OCI runtime bundle of the container. The changed working directory broke the resolution of relative paths in the "cleanup" process. The "cleanup" process errored out before actually cleaning up the container, and the waiting "main" process ran indefinitely, or until hitting a timeout. Fix the issue by executing `conmon` with the same working directory as Podman.

Note that fixing 3) *may* address a number of issues we have seen in the past where for *some* reason cleanup processes did not fire.

[1] https://gitlab.com/gitlab-org/gitlab-runner/-/issues/27119#note_970712864

Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>

[MH: Minor reword of commit message]

Signed-off-by: Matthew Heon <mheon@redhat.com>
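To make the exit-code handling described above concrete, here is a minimal, self-contained sketch of the idea: store a container's exit code together with a timestamp in two bbolt buckets and prune entries older than five minutes. The bucket names mirror the ones added in this diff; everything else (function names, the encoding of codes and timestamps, the demo container ID and database path) is illustrative and not Podman's actual implementation.

package main

import (
	"encoding/binary"
	"fmt"
	"time"

	bolt "go.etcd.io/bbolt"
)

var (
	exitCodeBkt          = []byte("exit-code")
	exitCodeTimeStampBkt = []byte("exit-code-time-stamp")
)

// recordExitCode stores a container's exit code and the current time under
// its ID, creating the two buckets on first use.
func recordExitCode(db *bolt.DB, ctrID string, code int32) error {
	return db.Update(func(tx *bolt.Tx) error {
		codes, err := tx.CreateBucketIfNotExists(exitCodeBkt)
		if err != nil {
			return err
		}
		stamps, err := tx.CreateBucketIfNotExists(exitCodeTimeStampBkt)
		if err != nil {
			return err
		}
		codeBuf := make([]byte, 4)
		binary.BigEndian.PutUint32(codeBuf, uint32(code))
		if err := codes.Put([]byte(ctrID), codeBuf); err != nil {
			return err
		}
		stampBuf, err := time.Now().MarshalText()
		if err != nil {
			return err
		}
		return stamps.Put([]byte(ctrID), stampBuf)
	})
}

// pruneExitCodes drops exit codes whose timestamp is older than five minutes,
// mirroring the "outdated after 5 minutes" rule from the commit message.
func pruneExitCodes(db *bolt.DB) error {
	cutoff := time.Now().Add(-5 * time.Minute)
	return db.Update(func(tx *bolt.Tx) error {
		codes := tx.Bucket(exitCodeBkt)
		stamps := tx.Bucket(exitCodeTimeStampBkt)
		if codes == nil || stamps == nil {
			return nil // nothing recorded yet
		}
		// Collect stale IDs first; the bucket must not be modified
		// while ForEach iterates over it.
		var stale [][]byte
		err := stamps.ForEach(func(id, raw []byte) error {
			var t time.Time
			if err := t.UnmarshalText(raw); err != nil || t.Before(cutoff) {
				stale = append(stale, append([]byte(nil), id...))
			}
			return nil
		})
		if err != nil {
			return err
		}
		for _, id := range stale {
			if err := codes.Delete(id); err != nil {
				return err
			}
			if err := stamps.Delete(id); err != nil {
				return err
			}
		}
		return nil
	})
}

func main() {
	db, err := bolt.Open("demo-state.db", 0o600, nil)
	if err != nil {
		panic(err)
	}
	defer db.Close()

	if err := recordExitCode(db, "abc123", 137); err != nil {
		panic(err)
	}
	if err := pruneExitCodes(db); err != nil {
		panic(err)
	}
	fmt.Println("exit code recorded; stale entries pruned")
}

Keeping the timestamps in a bucket of their own lets the pruning pass scan timestamps without touching the exit codes themselves, which matches the two-bucket layout introduced in the diff below.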
Diffstat (limited to 'libpod/boltdb_state_internal.go')
-rw-r--r--  libpod/boltdb_state_internal.go  22 ++++++++++++++++++++++
1 file changed, 22 insertions(+), 0 deletions(-)
diff --git a/libpod/boltdb_state_internal.go b/libpod/boltdb_state_internal.go
index d6f035af9..edba78d6d 100644
--- a/libpod/boltdb_state_internal.go
+++ b/libpod/boltdb_state_internal.go
@@ -29,6 +29,9 @@ const (
aliasesName = "aliases"
runtimeConfigName = "runtime-config"
+ exitCodeName = "exit-code"
+ exitCodeTimeStampName = "exit-code-time-stamp"
+
configName = "config"
stateName = "state"
dependenciesName = "dependencies"
@@ -65,6 +68,9 @@ var (
volDependenciesBkt = []byte(volCtrDependencies)
networksBkt = []byte(networksName)
+ exitCodeBkt = []byte(exitCodeName)
+ exitCodeTimeStampBkt = []byte(exitCodeTimeStampName)
+
configKey = []byte(configName)
stateKey = []byte(stateName)
netNSKey = []byte(netNSName)
@@ -362,6 +368,22 @@ func getRuntimeConfigBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
return bkt, nil
}
+func getExitCodeBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
+ bkt := tx.Bucket(exitCodeBkt)
+ if bkt == nil {
+ return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code container bucket not found in DB")
+ }
+ return bkt, nil
+}
+
+func getExitCodeTimeStampBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
+ bkt := tx.Bucket(exitCodeTimeStampBkt)
+ if bkt == nil {
+ return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code time stamp bucket not found in DB")
+ }
+ return bkt, nil
+}
+
func (s *BoltState) getContainerConfigFromDB(id []byte, config *ContainerConfig, ctrsBkt *bolt.Bucket) error {
ctrBkt := ctrsBkt.Bucket(id)
if ctrBkt == nil {
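The two getters added in this diff follow the same pattern as the existing bucket helpers in the file. As a hedged sketch (not part of this commit) of how a caller might use them, the function below reads a stored exit code inside a read-only Bolt transaction. It assumes the package context of boltdb_state_internal.go plus "encoding/binary" and "fmt" imports, and the 4-byte big-endian encoding of the code is an illustrative assumption rather than Podman's documented format.

// readStoredExitCode is a hypothetical helper, not part of this commit: it
// looks up a container's exit code by ID using the getExitCodeBucket helper
// added above. The 4-byte big-endian encoding is assumed for illustration.
func readStoredExitCode(db *bolt.DB, ctrID []byte) (int32, error) {
	var code int32
	err := db.View(func(tx *bolt.Tx) error {
		bkt, err := getExitCodeBucket(tx)
		if err != nil {
			return err
		}
		raw := bkt.Get(ctrID)
		if len(raw) != 4 {
			return fmt.Errorf("no valid exit code stored for container %s", string(ctrID))
		}
		code = int32(binary.BigEndian.Uint32(raw))
		return nil
	})
	return code, err
}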