podman: fix memleak caused by renaming and not deleting

the exit file. If the container exit code needs to be retained, it cannot be retained in tmpfs, because libpod runs in a memcg itself so it can't leave traces with a daemon-less design. This wasn't a memleak detectable by kmemleak, for example. The kernel never lost track of the memory and there was no erroneous refcounting either. The reference count dependencies, however, are not easy to track because when a refcount is increased, there's no way to tell who's still holding the reference. In this case it was a single page of tmpfs pagecache holding a refcount that kept pinned a whole hierarchy of dying memcg, slab kmem, cgroups, unreachable kernfs nodes and the respective dentries and inodes. Such a problem wouldn't happen if the exit file was stored in a regular filesystem because the pagecache could be reclaimed in such a case under memory pressure. The tmpfs page can be swapped out, but that's not enough to release the memcg with CONFIG_MEMCG_SWAP_ENABLED=y. No amount of more aggressive kernel slab shrinking could have solved this. Not even assigning the slab kmem of dying cgroups to an alive cgroup would fully solve this. The only way to free the memory of a dying cgroup when a struct page still references it would be to loop over all "struct page" in the kernel to find which one is associated with the dying cgroup, which is an O(N) operation (where N is the number of pages and can reach billions). Linking all the tmpfs pages to the memcg would cost less during memcg offlining, but it would waste lots of memory and CPU globally. So this can't be optimized in the kernel. A cronjob running the following command can act as a workaround and will allow all slab cache to be released, not just the single tmpfs pages: "rm -f /run/libpod/exits/*". This patch solved the memleak with a reproducer, booting with cgroup.memory=nokmem and with SELinux disabled.
The reason memcg kmem and SELinux were disabled for testing of this fix is that kmem greatly decreases the kernel's effectiveness in reusing partial slab objects. cgroup.memory=nokmem is strongly recommended, at least for workstation usage. SELinux needs to be further analyzed because it causes further slab allocations. The upstream podman commit used for testing is 1fe2965e4f672674f7b66648e9973a0ed5434bb4 (v1.4.4). The upstream kernel commit used for testing is f16fea666898dbdd7812ce94068c76da3e3fcf1e (v5.2-rc6). Reported-by: Michele Baldessari <michele@redhat.com> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> <Applied with small tweaks to comments> Signed-off-by: Matthew Heon <matthew.heon@pm.me>
author: Matthew Heon <matthew.heon@pm.me> 2019-07-10 15:09:33 -0400
committer: Matthew Heon <matthew.heon@pm.me> 2019-07-31 17:28:42 -0400
commit: ebacfbd091f709d7ca0b811a9fe1fee57c6f0ad3 (patch)
tree: d25bdc09d9972971ab89bfde8848eb24a271c0e9
parent: a622f8d345b1853401de2e533e9fbf14ef169fa2 (diff)
download: podman-ebacfbd091f709d7ca0b811a9fe1fee57c6f0ad3.tar.gz
podman-ebacfbd091f709d7ca0b811a9fe1fee57c6f0ad3.tar.bz2
podman-ebacfbd091f709d7ca0b811a9fe1fee57c6f0ad3.zip
2 files changed, 7 insertions, 55 deletions
diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index 83ee5640e..3114e00c0 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -634,19 +634,15 @@ func (c *Container) removeConmonFiles() error {
 		return errors.Wrapf(err, "error removing container %s OOM file", c.ID())
 	}
 
-	// Instead of outright deleting the exit file, rename it (if it exists).
-	// We want to retain it so we can get the exit code of containers which
-	// are removed (at least until we have a workable events system)
+	// Remove the exit file so we don't leak memory in tmpfs
 	exitFile := filepath.Join(c.ociRuntime.exitsDir, c.ID())
-	oldExitFile := filepath.Join(c.ociRuntime.exitsDir, fmt.Sprintf("%s-old", c.ID()))
 	if _, err := os.Stat(exitFile); err != nil {
 		if !os.IsNotExist(err) {
 			return errors.Wrapf(err, "error running stat on container %s exit file", c.ID())
 		}
 	} else {
-		// Rename should replace the old exit file (if it exists)
-		if err := os.Rename(exitFile, oldExitFile); err != nil {
-			return errors.Wrapf(err, "error renaming container %s exit file", c.ID())
+		if err := os.Remove(exitFile); err != nil {
+			return errors.Wrapf(err, "error removing container %s exit file", c.ID())
 		}
 	}
 
diff --git a/pkg/adapter/containers.go b/pkg/adapter/containers.go
index faaef3e60..525b1f29b 100644
--- a/pkg/adapter/containers.go
+++ b/pkg/adapter/containers.go
@@ -9,7 +9,6 @@ import (
 	"io"
 	"io/ioutil"
 	"os"
-	"path/filepath"
 	"strconv"
 	"strings"
 	"sync"
@@ -418,15 +417,8 @@ func (r *LocalRuntime) Run(ctx context.Context, c *cliconfig.RunValues, exitCode
 
 	if ecode, err := ctr.Wait(); err != nil {
 		if errors.Cause(err) == define.ErrNoSuchCtr {
-			// The container may have been removed
-			// Go looking for an exit file
-			ctrExitCode, err := ReadExitFile(config.TmpDir, ctr.ID())
-			if err != nil {
-				logrus.Errorf("Cannot get exit code: %v", err)
-				exitCode = 127
-			} else {
-				exitCode = ctrExitCode
-			}
+			logrus.Errorf("Cannot get exit code: %v", err)
+			exitCode = 127
 		}
 	} else {
 		exitCode = int(ecode)
@@ -441,31 +433,6 @@ func (r *LocalRuntime) Run(ctx context.Context, c *cliconfig.RunValues, exitCode
 	return exitCode, nil
 }
 
-// ReadExitFile reads a container's exit file
-func ReadExitFile(runtimeTmp, ctrID string) (int, error) {
-	exitFile := filepath.Join(runtimeTmp, "exits", fmt.Sprintf("%s-old", ctrID))
-
-	logrus.Debugf("Attempting to read container %s exit code from file %s", ctrID, exitFile)
-
-	// Check if it exists
-	if _, err := os.Stat(exitFile); err != nil {
-		return 0, errors.Wrapf(err, "error getting exit file for container %s", ctrID)
-	}
-
-	// File exists, read it in and convert to int
-	statusStr, err := ioutil.ReadFile(exitFile)
-	if err != nil {
-		return 0, errors.Wrapf(err, "error reading exit file for container %s", ctrID)
-	}
-
-	exitCode, err := strconv.Atoi(string(statusStr))
-	if err != nil {
-		return 0, errors.Wrapf(err, "error parsing exit code for container %s", ctrID)
-	}
-
-	return exitCode, nil
-}
-
 // Ps ...
 func (r *LocalRuntime) Ps(c *cliconfig.PsValues, opts shared.PsOptions) ([]shared.PsContainerOutput, error) {
 	maxWorkers := shared.Parallelize("ps")
@@ -655,19 +622,8 @@ func (r *LocalRuntime) Start(ctx context.Context, c *cliconfig.StartValues, sigP
 
 			if ecode, err := ctr.Wait(); err != nil {
 				if errors.Cause(err) == define.ErrNoSuchCtr {
-					// The container may have been removed
-					// Go looking for an exit file
-					rtc, err := r.GetConfig()
-					if err != nil {
-						return 0, err
-					}
-					ctrExitCode, err := ReadExitFile(rtc.TmpDir, ctr.ID())
-					if err != nil {
-						logrus.Errorf("Cannot get exit code: %v", err)
-						exitCode = 127
-					} else {
-						exitCode = ctrExitCode
-					}
+					logrus.Errorf("Cannot get exit code: %v", err)
+					exitCode = 127
 				}
 			} else {
 				exitCode = int(ecode)
author	Matthew Heon <matthew.heon@pm.me>	2019-07-10 15:09:33 -0400
committer	Matthew Heon <matthew.heon@pm.me>	2019-07-31 17:28:42 -0400
commit	ebacfbd091f709d7ca0b811a9fe1fee57c6f0ad3 (patch)
tree	d25bdc09d9972971ab89bfde8848eb24a271c0e9
parent	a622f8d345b1853401de2e533e9fbf14ef169fa2 (diff)
download	podman-ebacfbd091f709d7ca0b811a9fe1fee57c6f0ad3.tar.gz podman-ebacfbd091f709d7ca0b811a9fe1fee57c6f0ad3.tar.bz2 podman-ebacfbd091f709d7ca0b811a9fe1fee57c6f0ad3.zip