From ebacfbd091f709d7ca0b811a9fe1fee57c6f0ad3 Mon Sep 17 00:00:00 2001
From: Matthew Heon
Date: Wed, 10 Jul 2019 15:09:33 -0400
Subject: podman: fix memleak caused by renaming and not deleting the exit file

If the container exit code needs to be retained, it cannot be retained
in tmpfs: libpod runs in a memcg itself, so with a daemon-less design
it can't afford to leave traces behind in tmpfs.

This wasn't a memleak detectable by kmemleak, for example. The kernel
never lost track of the memory and there was no erroneous refcounting
either. The reference count dependencies, however, are not easy to
track, because when a refcount is increased there's no way to tell
who's still holding the reference. In this case it was a single page
of tmpfs pagecache holding a refcount that kept pinned a whole
hierarchy of dying memcgs, slab kmem, cgroups, unreachable kernfs
nodes and the respective dentries and inodes.

Such a problem wouldn't happen if the exit file were stored on a
regular filesystem, because in that case the pagecache could be
reclaimed under memory pressure. The tmpfs page can be swapped out,
but that's not enough to release the memcg with
CONFIG_MEMCG_SWAP_ENABLED=y.

No amount of more aggressive kernel slab shrinking could have solved
this. Not even reassigning the slab kmem of dying cgroups to a live
cgroup would fully solve this. The only way to free the memory of a
dying cgroup while a struct page still references it would be to loop
over all "struct page" in the kernel to find the one associated with
the dying cgroup, an O(N) operation (where N is the number of pages
and can reach billions). Linking all the tmpfs pages to the memcg
would cost less during memcg offlining, but it would waste lots of
memory and CPU globally. So this can't be optimized in the kernel.

A cronjob running the following command can act as a workaround and
will allow all the slab caches to be released, not just the single
tmpfs pages:

    rm -f /run/libpod/exits/*

This patch solved the memleak with a reproducer, booting with
cgroup.memory=nokmem and with selinux disabled. The reason memcg kmem
and selinux were disabled for testing this fix is that kmem greatly
decreases the kernel's effectiveness in reusing partial slab objects.
cgroup.memory=nokmem is strongly recommended, at least for workstation
usage. selinux needs to be further analyzed because it causes further
slab allocations.

The upstream podman commit used for testing is
1fe2965e4f672674f7b66648e9973a0ed5434bb4 (v1.4.4).

The upstream kernel commit used for testing is
f16fea666898dbdd7812ce94068c76da3e3fcf1e (v5.2-rc6).

Reported-by: Michele Baldessari
Signed-off-by: Andrea Arcangeli
Signed-off-by: Matthew Heon
---
 libpod/container_internal.go | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'libpod/container_internal.go')

diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index 83ee5640e..3114e00c0 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -634,19 +634,15 @@ func (c *Container) removeConmonFiles() error {
 		return errors.Wrapf(err, "error removing container %s OOM file", c.ID())
 	}
 
-	// Instead of outright deleting the exit file, rename it (if it exists).
-	// We want to retain it so we can get the exit code of containers which
-	// are removed (at least until we have a workable events system)
+	// Remove the exit file so we don't leak memory in tmpfs
 	exitFile := filepath.Join(c.ociRuntime.exitsDir, c.ID())
-	oldExitFile := filepath.Join(c.ociRuntime.exitsDir, fmt.Sprintf("%s-old", c.ID()))
 	if _, err := os.Stat(exitFile); err != nil {
 		if !os.IsNotExist(err) {
 			return errors.Wrapf(err, "error running stat on container %s exit file", c.ID())
 		}
 	} else {
-		// Rename should replace the old exit file (if it exists)
-		if err := os.Rename(exitFile, oldExitFile); err != nil {
-			return errors.Wrapf(err, "error renaming container %s exit file", c.ID())
+		if err := os.Remove(exitFile); err != nil {
+			return errors.Wrapf(err, "error removing container %s exit file", c.ID())
 		}
 	}
--
cgit v1.2.3-54-g00ecf
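To see the new behavior outside of diff context, here is a minimal,
self-contained sketch of the cleanup this patch performs. The
cleanupExitFile helper is a hypothetical illustration, plain fmt error
formatting stands in for the github.com/pkg/errors wrapping libpod
uses, and the patch's stat-then-remove is collapsed into a single
os.Remove that tolerates a missing file:

    package main

    import (
    	"fmt"
    	"os"
    	"path/filepath"
    )

    // cleanupExitFile deletes a container's exit file outright. Removing
    // the file (instead of renaming it to "<id>-old") releases the tmpfs
    // page, so it can no longer pin a dying memcg hierarchy. A file that
    // is already gone is not treated as an error.
    func cleanupExitFile(exitsDir, ctrID string) error {
    	exitFile := filepath.Join(exitsDir, ctrID)
    	if err := os.Remove(exitFile); err != nil && !os.IsNotExist(err) {
    		return fmt.Errorf("error removing exit file for container %s: %v", ctrID, err)
    	}
    	return nil
    }

    func main() {
    	// Same directory the cron workaround above targets; the container
    	// ID here is a made-up example.
    	if err := cleanupExitFile("/run/libpod/exits", "deadbeef"); err != nil {
    		fmt.Fprintln(os.Stderr, err)
    		os.Exit(1)
    	}
    }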
From 9dcd76e369fb163774f8f58a7da24a7899e95b60 Mon Sep 17 00:00:00 2001
From: Matthew Heon
Date: Wed, 31 Jul 2019 17:22:08 -0400
Subject: Ensure we generate a 'stopped' event on force-remove

When forcibly removing a container, we are initiating an explicit stop
of the container, which is not reflected in 'podman events'. Swap to
using our standard 'stop()' function instead of a custom one for
force-remove, and move the event into the internal stop function (so
internal calls also register it).

This does add one more database save() to `podman remove`. This should
not be a terribly serious performance hit, and does have the desirable
side effect of making things generally safer.

Signed-off-by: Matthew Heon
---
 libpod/container_api.go      | 2 +-
 libpod/container_internal.go | 8 +++++++-
 libpod/runtime_ctr.go        | 7 +------
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'libpod/container_internal.go')

diff --git a/libpod/container_api.go b/libpod/container_api.go
index cd020e429..ef9c3f006 100644
--- a/libpod/container_api.go
+++ b/libpod/container_api.go
@@ -187,7 +187,7 @@ func (c *Container) StopWithTimeout(timeout uint) error {
 	if c.state.State == define.ContainerStateStopped ||
 		c.state.State == define.ContainerStateExited {
 		return define.ErrCtrStopped
 	}
-	defer c.newContainerEvent(events.Stop)
+
 	return c.stop(timeout)
 }

diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index 3114e00c0..aba9c5b93 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -1108,7 +1108,13 @@ func (c *Container) stop(timeout uint) error {
 	}
 
 	// Wait until we have an exit file, and sync once we do
-	return c.waitForExitFileAndSync()
+	if err := c.waitForExitFileAndSync(); err != nil {
+		return err
+	}
+
+	c.newContainerEvent(events.Stop)
+
+	return nil
 }
 
 // Internal, non-locking function to pause a container

diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go
index e57ab4634..47d49f6aa 100644
--- a/libpod/runtime_ctr.go
+++ b/libpod/runtime_ctr.go
@@ -394,14 +394,9 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force bool,
 
 	// Check that the container's in a good state to be removed
 	if c.state.State == config2.ContainerStateRunning {
-		if err := c.ociRuntime.stopContainer(c, c.StopTimeout()); err != nil {
+		if err := c.stop(c.StopTimeout()); err != nil {
 			return errors.Wrapf(err, "cannot remove container %s as it could not be stopped", c.ID())
 		}
-
-		// Need to update container state to make sure we know it's stopped
-		if err := c.waitForExitFileAndSync(); err != nil {
-			return err
-		}
 	}
 
 	// Check that all of our exec sessions have finished
--
cgit v1.2.3-54-g00ecf
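As a rough illustration of the second commit's design, where the Stop
event is emitted inside the shared internal stop path so that direct
stops and force-removes both register it, the sketch below uses
hypothetical stand-ins (Event, stopRuntime, and removeContainer's
signature) for libpod's real types and functions:

    package main

    import "fmt"

    // Event is a stand-in for libpod's events.Event.
    type Event string

    const Stop Event = "stop"

    // Container is a minimal stand-in for libpod's Container type.
    type Container struct {
    	id     string
    	events []Event
    }

    func (c *Container) newContainerEvent(e Event) {
    	c.events = append(c.events, e)
    }

    // stopRuntime stands in for the OCI runtime call that actually
    // stops the container.
    func (c *Container) stopRuntime(timeout uint) error {
    	fmt.Printf("stopping %s (timeout %ds)\n", c.id, timeout)
    	return nil
    }

    // stop mirrors the patched libpod stop(): the event is emitted here,
    // after the container has fully stopped, so every caller records it.
    func (c *Container) stop(timeout uint) error {
    	if err := c.stopRuntime(timeout); err != nil {
    		return err
    	}
    	c.newContainerEvent(Stop)
    	return nil
    }

    // removeContainer mirrors the patched force-remove path: it now
    // calls the shared stop(), so the Stop event fires here as well.
    func (c *Container) removeContainer(force bool) error {
    	if force {
    		if err := c.stop(10); err != nil {
    			return fmt.Errorf("cannot remove container %s as it could not be stopped: %v", c.id, err)
    		}
    	}
    	fmt.Printf("removed %s; events recorded: %v\n", c.id, c.events)
    	return nil
    }

    func main() {
    	c := &Container{id: "deadbeef"}
    	if err := c.removeContainer(true); err != nil {
    		fmt.Println(err)
    	}
    }

Centralizing the event in stop() trades one extra database save per
stop, as the commit message notes, for the guarantee that every stop
path, internal or external, shows up in 'podman events'.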