-rw-r--r--   .cirrus.yml                           33
-rwxr-xr-x   contrib/cirrus/runner.sh               4
-rwxr-xr-x   contrib/cirrus/setup_environment.sh    1
-rw-r--r--   test/system/helpers.bash               2
-rw-r--r--   test/upgrade/README.md                87
-rw-r--r--   test/upgrade/helpers.bash             11
-rw-r--r--   test/upgrade/test-upgrade.bats       313
7 files changed, 450 insertions(+), 1 deletion(-)
diff --git a/.cirrus.yml b/.cirrus.yml
index a23595712..ae8a4dc3a 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -598,6 +598,38 @@ rootless_system_test_task:
main_script: *main
always: *logs_artifacts
+# FIXME: we may want to consider running this from nightly cron instead of CI.
+# The tests are actually pretty quick (less than a minute) but they do rely
+# on pulling images from quay.io, which means we're subject to network flakes.
+#
+# FIXME: how does this env matrix work, anyway? Does it spin up multiple VMs?
+# We might just want to encode the version matrix in runner.sh instead
+upgrade_test_task:
+ name: "Upgrade test: from $PODMAN_UPGRADE_FROM"
+ alias: upgrade_test
+ skip: *tags
+ only_if: *not_docs
+ depends_on:
+ - local_system_test
+ matrix:
+ - env:
+ PODMAN_UPGRADE_FROM: v1.9.0
+ - env:
+ PODMAN_UPGRADE_FROM: v2.0.6
+ - env:
+ PODMAN_UPGRADE_FROM: v2.1.1
+ gce_instance: *standardvm
+ env:
+ TEST_FLAVOR: upgrade_test
+ DISTRO_NV: ${FEDORA_NAME}
+ VM_IMAGE_NAME: ${FEDORA_CACHE_IMAGE_NAME}
+ # ID for re-use of build output
+ _BUILD_CACHE_HANDLE: ${FEDORA_NAME}-build-${CIRRUS_BUILD_ID}
+ clone_script: *noop
+ gopath_cache: *ro_gopath_cache
+ setup_script: *setup
+ main_script: *main
+ always: *logs_artifacts
# This task is critical. It updates the "last-used by" timestamp stored
# in metadata for all VM images. This mechanism functions in tandem with
@@ -654,6 +686,7 @@ success_task:
- local_system_test
- remote_system_test
- rootless_system_test
+ - upgrade_test
- meta
container: *smallcontainer
env:
diff --git a/contrib/cirrus/runner.sh b/contrib/cirrus/runner.sh
index ccbdb63b6..fca9aff93 100755
--- a/contrib/cirrus/runner.sh
+++ b/contrib/cirrus/runner.sh
@@ -70,6 +70,10 @@ function _run_sys() {
dotest system
}
+function _run_upgrade_test() {
+ bats test/upgrade |& logformatter
+}
+
function _run_bindings() {
# shellcheck disable=SC2155
export PATH=$PATH:$GOSRC/hack
diff --git a/contrib/cirrus/setup_environment.sh b/contrib/cirrus/setup_environment.sh
index 4c95d0254..64ea3b7b4 100755
--- a/contrib/cirrus/setup_environment.sh
+++ b/contrib/cirrus/setup_environment.sh
@@ -200,6 +200,7 @@ case "$TEST_FLAVOR" in
compose) ;&
int) ;&
sys) ;&
+ upgrade_test) ;&
bindings) ;&
endpoint)
# Use existing host bits when testing is to happen inside a container
diff --git a/test/system/helpers.bash b/test/system/helpers.bash
index 0572c6866..4a7823852 100644
--- a/test/system/helpers.bash
+++ b/test/system/helpers.bash
@@ -149,7 +149,7 @@ function run_podman() {
echo "$_LOG_PROMPT $PODMAN $*"
# BATS hangs if a subprocess remains and keeps FD 3 open; this happens
# if podman crashes unexpectedly without cleaning up subprocesses.
- run timeout --foreground -v --kill=10 $PODMAN_TIMEOUT $PODMAN "$@" 3>/dev/null
+ run timeout --foreground -v --kill=10 $PODMAN_TIMEOUT $PODMAN $_PODMAN_TEST_OPTS "$@" 3>/dev/null
# without "quotes", multiple lines are glommed together into one
if [ -n "$output" ]; then
echo "$output"
diff --git a/test/upgrade/README.md b/test/upgrade/README.md
new file mode 100644
index 000000000..2979a66d7
--- /dev/null
+++ b/test/upgrade/README.md
@@ -0,0 +1,87 @@
+Background
+==========
+
+For years we've needed a way to test podman upgrades; this
+became much more critical on December 7, 2020, when Matt disclosed
+a bug he had found over the weekend
+([#8613](https://github.com/containers/podman/issues/8613))
+in which reuse of a previously-defined field name would
+result in fatal JSON decode failures when current-podman
+tried to read containers created with podman <= 1.8. (FIXME: confirm)
+
+Upgrade testing is a daunting problem, but in the December 12
+Cabal meeting Dan suggested using podman-in-podman. This PR
+is the result of fleshing out that idea.
+
+Overview
+========
+
+The BATS script in this directory fetches and runs an old-podman
+container image from quay.io/podman, uses it to create and run
+a number of containers, then uses new-podman to interact with
+those containers.
+
+As of 2021-02-23 the available old-podman versions are:
+
+```console
+$ ./bin/podman search --list-tags quay.io/podman/stable | awk '$2 ~ /^v/ { print $2}' | sort | column -c 75
+v1.4.2 v1.5.0 v1.6 v1.9.0 v2.0.2 v2.1.1
+v1.4.4 v1.5.1 v1.6.2 v1.9.1 v2.0.6 v2.2.1
+```
+
+Test invocation is:
+```console
+$ sudo env PODMAN=bin/podman PODMAN_UPGRADE_FROM=v1.9.0 PODMAN_UPGRADE_TEST_DEBUG= bats test/upgrade
+```
+(Paths assume you're cd'ed to the top-level podman repo.) `PODMAN_UPGRADE_FROM`
+can be any of the versions above. `PODMAN_UPGRADE_TEST_DEBUG` is empty
+here, but is listed so you know you can set it `=1` to leave the
+podman_parent container running after the tests finish. Interacting
+with that container is left mostly as an exercise for the reader;
+one starting point is shown below.
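+
+For instance, to get a shell inside the leftover parent container
+(podman_parent is the name the test script assigns):
+```console
+$ sudo bin/podman exec -it podman_parent bash
+```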
+
+The script pulls the given podman image, invokes it with a scratch
+root directory, and has it do a small set of podman operations (pull an
+image, create/run some containers). This podman process stays running,
+because if it exits it takes down the containers running inside it.
+
+We then invoke the current (host-installed) podman, using the same
+scratch root directory, and perform operations on those images and
+containers. Most of those operations are done in individual @tests.
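+
+Storage sharing is just a matter of pointing both podmans at the same
+directories; a sketch, where `$WORKDIR` stands for the scratch root
+created by the tests:
+```console
+$ bin/podman --root=$WORKDIR/root --runroot=$WORKDIR/runroot \
+             --tmpdir=$WORKDIR/tmp ps -a
+```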
+
+The goal is to have this upgrade test run in CI, iterating over a
+loop of known old versions. This list would need to be hand-maintained
+and updated on new releases. There might also need to be extra
+configuration defined, such as per-version commands (see below).
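+
+A hypothetical sketch of what per-version configuration might look
+like (none of this exists yet):
+```
+# hypothetical: per-version adjustments, keyed on the image tag
+case "$PODMAN_UPGRADE_FROM" in
+    v1.6*) skip "v1.6 images lack crun; this would need a cgroupsv1 VM" ;;
+esac
+```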
+
+Findings
+========
+
+First, `v1.6.2` won't work on default f32/f33: the image
+does not include `crun`, so podman inside it fails immediately:
+
+ ERRO[0000] oci runtime "runc" does not support CGroups V2: use system migrate to mitigate
+
+I realize that it's kind of stupid not to test 1.6, since that's
+precisely the test that would've caught #8613 early, but I just
+don't think it's worth the hassle of setting up cgroupsv1 VMs.
+
+For posterity: in an earlier incarnation of this script I tried
+booting f32 into cgroupsv1, and ran into the following warnings
+when running new-podman on old containers:
+```
+ERRO[0000] error joining network namespace for container 322b66d94640e31b2e6921565445cf0dade4ec13cabc16ee5f29292bdc038341: error retrieving network namespace at /var/run/netns/cni-577e2289-2c05-2e28-3c3d-002a5596e7da: failed to Statfs "/var/run/netns/cni-577e2289
+```
+
+Where To Go From Here
+=====================
+
+* Tests are still (2021-02-23) incomplete, with several failing outright.
+ See FIXMEs in the code.
+
+* Figuring out how/if to run rootless. I think this is possible, perhaps
+ even necessary, but will be tricky to get right because of home-directory
+ mounting.
+
+* Figuring out how/if to run variations with different config files
+  (e.g. running OLD-PODMAN to create a user libpod.conf, tweaking
+  that in the test, then running NEW-PODMAN upgrade tests).
diff --git a/test/upgrade/helpers.bash b/test/upgrade/helpers.bash
new file mode 100644
index 000000000..41d9279e6
--- /dev/null
+++ b/test/upgrade/helpers.bash
@@ -0,0 +1,11 @@
+# -*- bash -*-
+
+load "../system/helpers"
+
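+# Deliberate no-op setup/teardown: unlike the system tests, the upgrade
+# tests share images and containers across test cases, so nothing may be
+# cleaned up between them. (test-upgrade.bats overrides setup itself.)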
+setup() {
+ :
+}
+
+teardown() {
+ :
+}
diff --git a/test/upgrade/test-upgrade.bats b/test/upgrade/test-upgrade.bats
new file mode 100644
index 000000000..dd827b398
--- /dev/null
+++ b/test/upgrade/test-upgrade.bats
@@ -0,0 +1,313 @@
+# -*- bats -*-
+
+load helpers
+
+# Create a var-lib-containers dir for this podman. We need to bind-mount
+# this into the container, and use --root and --runroot and --tmpdir
+# options both in the container podman and out here: that's the only
+# way to share image and container storage.
+if [ -z "${PODMAN_UPGRADE_WORKDIR}" ]; then
+ # Much as I'd love a descriptive name like "podman-upgrade-tests.XXXXX",
+ # keep it short ("pu") because of the 100-character path length limit
+ # for UNIX sockets (needed by conmon)
+ export PODMAN_UPGRADE_WORKDIR=$(mktemp -d --tmpdir=${BATS_TMPDIR:-${TMPDIR:-/tmp}} pu.XXXXXX)
+
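+ # The status file lets setup() bail out of all remaining tests
+ # once the initial-setup test has failed.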
+ touch $PODMAN_UPGRADE_WORKDIR/status
+fi
+
+# Generate a set of random strings used for content verification
+if [ -z "${RANDOM_STRING_1}" ]; then
+ export RANDOM_STRING_1=$(random_string 15)
+ export LABEL_CREATED=$(random_string 16)
+ export LABEL_FAILED=$(random_string 17)
+ export LABEL_RUNNING=$(random_string 18)
+
+ # FIXME: randomize this
+ HOST_PORT=34567
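+ # (a hypothetical approach: HOST_PORT=$((30000 + RANDOM % 30000)))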
+fi
+
+# Version string of the podman we're actually testing, e.g. '3.0.0-dev-d1a26013'
+PODMAN_VERSION=$($PODMAN version | awk '/^Version:/ { V=$2 } /^Git Commit:/ { G=$3 } END { print V "-" substr(G,1,8) }')
+
+setup() {
+ skip_if_rootless
+
+ # The podman-in-podman image (old podman)
+ if [[ -z "$PODMAN_UPGRADE_FROM" ]]; then
+ echo "# \$PODMAN_UPGRADE_FROM is undefined (should be e.g. v1.9.0)" >&3
+ false
+ fi
+
+ if [ "$(< $PODMAN_UPGRADE_WORKDIR/status)" = "failed" ]; then
+ # FIXME: exit instead?
+ echo "*** setup failed - no point in running tests"
+ false
+ fi
+
+ export _PODMAN_TEST_OPTS="--root=$PODMAN_UPGRADE_WORKDIR/root --runroot=$PODMAN_UPGRADE_WORKDIR/runroot --tmpdir=$PODMAN_UPGRADE_WORKDIR/tmp"
+}
+
+###############################################################################
+# BEGIN setup
+
+@test "initial setup: start $PODMAN_UPGRADE_FROM containers" {
+ echo failed >| $PODMAN_UPGRADE_WORKDIR/status
+
+ OLD_PODMAN=quay.io/podman/stable:$PODMAN_UPGRADE_FROM
+ $PODMAN pull $OLD_PODMAN
+
+ # Shortcut name, because we're referencing it a lot
+ pmroot=$PODMAN_UPGRADE_WORKDIR
+
+ # WWW content to share
+ mkdir -p $pmroot/var/www
+ echo $RANDOM_STRING_1 >$pmroot/var/www/index.txt
+
+ # podman tmpdir
+ mkdir -p $pmroot/tmp
+
+ #
+ # Script to run >>OLD<< podman commands.
+ #
+ # These commands will be run inside a podman container. The "podman"
+ # command in this script will be the desired old-podman version.
+ #
+ pmscript=$pmroot/setup
+ cat >| $pmscript <<EOF
+#!/bin/bash
+
+# cgroup-manager=systemd does not work inside a container
+opts="--cgroup-manager=cgroupfs --events-backend=file $_PODMAN_TEST_OPTS"
+
+set -ex
+
+# Try, try again: network flakiness makes this a common point of failure
+podman \$opts pull $IMAGE \
+ || (sleep 10; podman \$opts pull $IMAGE) \
+ || (sleep 30; podman \$opts pull $IMAGE)
+
+
+podman \$opts create --name mycreatedcontainer --label mylabel=$LABEL_CREATED \
+ $IMAGE false
+
+podman \$opts run --name mydonecontainer $IMAGE echo ++$RANDOM_STRING_1++
+
+podman \$opts run --name myfailedcontainer --label mylabel=$LABEL_FAILED \
+ $IMAGE sh -c 'exit 17' || true
+
+# FIXME: add "-p $HOST_PORT:80"
+# ...I tried and tried, and could not get this to work. I could never
+# connect to the port from the host, nor even from the podman_parent
+# container; I could never see the port listed in 'ps' nor 'inspect'.
+# And, finally, I ended up in a state where the container wouldn't
+# even start, and via complicated 'podman logs' found out:
+# httpd: bind: Address in use
+# So I just give up for now.
+#
+podman \$opts run -d --name myrunningcontainer --label mylabel=$LABEL_RUNNING \
+ -v $pmroot/var/www:/var/www \
+ -w /var/www \
+ $IMAGE /bin/busybox-extras httpd -f -p 80
+
+echo READY
+while :;do
+ if [ -e /stop ]; then
+ echo STOPPING
+ podman \$opts stop -t 0 myrunningcontainer || true
+ podman \$opts rm -f myrunningcontainer || true
+ exit 0
+ fi
+ sleep 0.5
+done
+EOF
+ chmod 555 $pmscript
+
+ # Clean up vestiges of previous run
+ $PODMAN rm -f podman_parent || true
+
+ # Not entirely a NOP! This is just so we get /run/crun created on a CI VM
+ $PODMAN run --rm $OLD_PODMAN true
+
+ #
+ # Use new-podman to run the above script under old-podman.
+ #
+ # DO NOT USE run_podman HERE! That would use $_PODMAN_TEST_OPTS
+ # and would write into our shared test dir, which would then
+ # pollute it for use by old-podman. We must keep that pristine
+ # so old-podman is the first to write to it.
+ #
+ $PODMAN run -d --name podman_parent --pid=host \
+ --privileged \
+ --net=host \
+ --cgroupns=host \
+ -v /dev/fuse:/dev/fuse \
+ -v /run/crun:/run/crun \
+ -v $pmroot:$pmroot \
+ $OLD_PODMAN $pmroot/setup
+
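+ # wait_for_ready (from the system-test helpers) waits for the READY
+ # line that the setup script echoes; _PODMAN_TEST_OPTS is cleared for
+ # this one call because podman_parent lives in default storage, not
+ # in our shared workdir.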
+ _PODMAN_TEST_OPTS= wait_for_ready podman_parent
+
+ echo OK >| $PODMAN_UPGRADE_WORKDIR/status
+}
+
+# END setup
+###############################################################################
+# BEGIN actual tests
+
+# This is a NOP; used only so the version string will show up in logs
+@test "upgrade: $PODMAN_UPGRADE_FROM -> $PODMAN_VERSION" {
+ :
+}
+
+@test "images" {
+ run_podman images -a --format '{{.Names}}'
+ is "$output" "\[$IMAGE\]" "podman images"
+}
+
+@test "ps : one container running" {
+ run_podman ps --format '{{.Image}}--{{.Names}}'
+ is "$output" "$IMAGE--myrunningcontainer" "ps: one container running"
+}
+
+@test "ps -a : shows all containers" {
+ # IMPORTANT: we can't use --sort=created, because that requires #8427
+ # on the *creating* podman end.
+ run_podman ps -a \
+ --format '{{.Names}}--{{.Status}}--{{.Ports}}--{{.Labels.mylabel}}' \
+ --sort=names
+ is "${lines[0]}" "mycreatedcontainer--Created----$LABEL_CREATED" "created"
+ is "${lines[1]}" "mydonecontainer--Exited (0).*----<no value>" "done"
+ is "${lines[2]}" "myfailedcontainer--Exited (17) .*----$LABEL_FAILED" "fail"
+ is "${lines[3]}" "myrunningcontainer--Up .*----$LABEL_RUNNING" "running"
+
+ # For debugging: dump containers and IDs
+ if [[ -n "$PODMAN_UPGRADE_TEST_DEBUG" ]]; then
+ run_podman ps -a
+ for l in "${lines[@]}"; do
+ echo "# $l" >&3
+ done
+ fi
+}
+
+
+@test "inspect - all container status" {
+ tests="
+running | running | 0
+created | configured | 0
+done | exited | 0
+failed | exited | 17
+"
+ while read cname state exitstatus; do
+ run_podman inspect --format '{{.State.Status}}--{{.State.ExitCode}}' my${cname}container
+ is "$output" "$state--$exitstatus" "status of my${cname}container"
+ done < <(parse_table "$tests")
+}
+
+@test "logs" {
+ run_podman logs mydonecontainer
+ is "$output" "++$RANDOM_STRING_1++" "podman logs on stopped container"
+
+# run_podman logs myrunningcontainer
+# is "$output" "READY" "podman logs on running container"
+}
+
+@test "exec" {
+ run_podman exec myrunningcontainer cat /var/www/index.txt
+ is "$output" "$RANDOM_STRING_1" "exec into myrunningcontainer"
+}
+
+@test "load" {
+ # FIXME, is this really necessary?
+ skip "TBI. Not sure if there's any point to this."
+}
+
+@test "mount" {
+ skip "TBI"
+}
+
+@test "pods" {
+ skip "TBI"
+}
+
+# FIXME: commit? kill? network? pause? restart? top? volumes? What else?
+
+
+@test "start" {
+ skip "FIXME: this leaves a mount behind: root/overlay/sha/merged"
+ run_podman --cgroup-manager=cgroupfs start -a mydonecontainer
+ is "$output" "++$RANDOM_STRING_1++" "start on already-run container"
+}
+
+@test "rm a stopped container" {
+ # FIXME FIXME FIXME!
+ #
+ # I have no idea what's going on here. For most of my testing in this
+ # section, the code here was simply 'podman rm myfailedcontainer', and
+ # it would succeed, but then way down, in 'cleanup' below, the 'rm -f'
+ # step would fail:
+ #
+ # # podman rm -f podman_parent
+ # error freeing lock for container <sha>: no such file or directory
+ # ...where <sha> is the ID of the podman_parent container.
+ #
+ # I started playing with this section, by adding 'rm mydonecontainer',
+ # and now it always fails, the same way, but with the container we're
+ # removing right here:
+ #
+ # error freeing lock for container <sha>: no such file or directory
+ # ...where <sha> is the ID of mydonecontainer.
+ #
+ # I don't know. I give up for now, and am skip'ing the whole thing.
+ # If you want to play with it, try commenting out the 'myfailed' lines,
+ # or just the 'mydone' ones, or, I don't know.
+ skip "FIXME: error freeing lock for container <sha>: no such file or dir"
+
+ # For debugging, so we can see what 'error freeing lock' refers to
+ run_podman ps -a
+
+ run_podman rm myfailedcontainer
+ is "$output" "[0-9a-f]\\{64\\}" "podman rm myfailedcontainer"
+
+ run_podman rm mydonecontainer
+ is "$output" "[0-9a-f]\\{64\\}" "podman rm mydonecontainer"
+}
+
+
+@test "stop and rm" {
+ # About a ten-second pause, then:
+ # Error: timed out waiting for file /tmp/pu.nf747w/tmp/exits/<sha>: internal libpod error
+ # It doesn't seem to be a socket-length issue: the paths are ~80-88 chars.
+ # Leaving podman_parent running, and exec'ing into it, it doesn't look
+ # like the file is being written to the wrong place.
+ skip "FIXME: this doesn't work: timed out waiting for file tmpdir/exits/sha"
+ run_podman stop myrunningcontainer
+ run_podman rm myrunningcontainer
+}
+
+@test "clean up parent" {
+ if [[ -n "$PODMAN_UPGRADE_TEST_DEBUG" ]]; then
+ skip "workdir is $PODMAN_UPGRADE_WORKDIR"
+ fi
+
+ # We're done with the shared environment. By clearing this, we can
+ # now use run_podman for actions on the podman_parent container.
+ unset _PODMAN_TEST_OPTS
+
+ # (Useful for debugging the 'rm -f' step below, which, when it fails, only
+ # gives a container ID. This 'ps' confirms that the CID is podman_parent)
+ run_podman ps -a
+
+ # Stop the container gracefully
+ run_podman exec podman_parent touch /stop
+ run_podman wait podman_parent
+
+ run_podman logs podman_parent
+ run_podman rm -f podman_parent
+
+ # FIXME: why does this remain mounted?
+ umount $PODMAN_UPGRADE_WORKDIR/root/overlay || true
+
+ rm -rf $PODMAN_UPGRADE_WORKDIR
+}
+
+# FIXME: now clean up