9 files changed, 518 insertions, 343 deletions
diff --git a/libpod/container_graph.go b/libpod/container_graph.go
index 96d61b756..d43579e4a 100644
--- a/libpod/container_graph.go
+++ b/libpod/container_graph.go
@@ -281,3 +281,94 @@ func startNode(ctx context.Context, node *containerNode, setError bool, ctrError
 		startNode(ctx, successor, ctrErrored, ctrErrors, ctrsVisited, restart)
 	}
 }
+
+// removeNode visits a node on the container graph and removes its container,
+// or records an error in ctrErrors if removal failed. Only intended for use in
+// pod removal; do *not* use when removing individual containers.
+// All containers are assumed to be *UNLOCKED* on running this function;
+// container locks will be acquired as necessary.
+// The pod is optional. If a pod is given it must be *LOCKED*.
+func removeNode(ctx context.Context, node *containerNode, pod *Pod, force bool, timeout *uint, setError bool, ctrErrors map[string]error, ctrsVisited map[string]bool, ctrNamedVolumes map[string]*ContainerNamedVolume) {
+	// If we already visited this node, we're done.
+	if ctrsVisited[node.id] {
+		return
+	}
+
+	// Someone who depends on us failed.
+	// Mark us as failed, push the failure down to our dependencies, and stop.
+	if setError {
+		ctrsVisited[node.id] = true
+		ctrErrors[node.id] = fmt.Errorf("a container that depends on container %s could not be removed: %w", node.id, define.ErrCtrStateInvalid)
+
+		// Hit everyone we depend on, set errors there as well.
+		for _, successor := range node.dependsOn {
+			removeNode(ctx, successor, pod, force, timeout, true, ctrErrors, ctrsVisited, ctrNamedVolumes)
+		}
+		return
+	}
+
+	// Does anyone still depend on us?
+	// Cannot remove if true. Once everything that depends on us has been
+	// removed, we will be removed.
+	for _, dep := range node.dependedOn {
+		// A container that depends on us hasn't been visited yet; wait for it.
+		if ok := ctrsVisited[dep.id]; !ok {
+			return
+		}
+	}
+
+	// Going to try to remove the node, mark us as visited
+	ctrsVisited[node.id] = true
+
+	ctrErrored := false
+
+	// Verify that all that depend on us are gone. Graph traversal should
+	// guarantee this, but the check is cheap and it's better to be safe.
+	for _, dep := range node.dependedOn {
+		if _, err := node.container.runtime.GetContainer(dep.id); err == nil {
+			ctrErrored = true
+			ctrErrors[node.id] = fmt.Errorf("a container that depends on container %s still exists: %w", node.id, define.ErrDepExists)
+		}
+	}
+
+	// Lock the container
+	node.container.lock.Lock()
+
+	// Gate every later step on ctrErrored: don't proceed once a step failed.
+	if !ctrErrored {
+		if err := node.container.syncContainer(); err != nil {
+			ctrErrored = true
+			ctrErrors[node.id] = err
+		}
+	}
+
+	if !ctrErrored {
+		// Record the container's named volumes in the caller-provided map.
+		for _, vol := range node.container.config.NamedVolumes {
+			ctrNamedVolumes[vol.Name] = vol
+		}
+
+		if pod != nil && pod.state.InfraContainerID == node.id {
+			pod.state.InfraContainerID = ""
+			if err := pod.save(); err != nil {
+				ctrErrored = true
+				ctrErrors[node.id] = fmt.Errorf("error removing infra container %s from pod %s: %w", node.id, pod.ID(), err)
+			}
+		}
+	}
+
+	if !ctrErrored {
+		// Flags: removeVolume=false, removePod=true, ignoreDeps=false.
+		if err := node.container.runtime.removeContainer(ctx, node.container, force, false, true, false, timeout); err != nil {
+			ctrErrored = true
+			ctrErrors[node.id] = err
+		}
+	}
+
+	node.container.lock.Unlock()
+
+	// Recurse to anyone who we depend on and remove them
+	for _, successor := range node.dependsOn {
+		removeNode(ctx, successor, pod, force, timeout, ctrErrored, ctrErrors, ctrsVisited, ctrNamedVolumes)
+	}
+}
diff --git a/libpod/container_inspect.go b/libpod/container_inspect.go
index b72d843b6..e4089efa6 100644
--- a/libpod/container_inspect.go
+++ b/libpod/container_inspect.go
@@ -3,20 +3,15 @@ package libpod
 import (
 	"errors"
 	"fmt"
-	"sort"
 	"strings"
 
-	"github.com/containers/common/pkg/config"
 	"github.com/containers/podman/v4/libpod/define"
 	"github.com/containers/podman/v4/libpod/driver"
 	"github.com/containers/podman/v4/pkg/util"
 	"github.com/containers/storage/types"
 	units "github.com/docker/go-units"
 	spec "github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/opencontainers/runtime-tools/generate"
-	"github.com/opencontainers/runtime-tools/validate"
 	"github.com/sirupsen/logrus"
-	"github.com/syndtr/gocapability/capability"
 )
 
 // inspectLocked inspects a container for low-level information.
@@ -163,8 +158,6 @@ func (c *Container) getContainerInspectData(size bool, driverData *define.Driver
 		Driver:          driverData.Name,
 		MountLabel:      config.MountLabel,
 		ProcessLabel:    config.ProcessLabel,
-		EffectiveCaps:   ctrSpec.Process.Capabilities.Effective,
-		BoundingCaps:    ctrSpec.Process.Capabilities.Bounding,
 		AppArmorProfile: ctrSpec.Process.ApparmorProfile,
 		ExecIDs:         execIDs,
 		GraphDriver:     driverData,
@@ -173,6 +166,10 @@ func (c *Container) getContainerInspectData(size bool, driverData *define.Driver
 		IsInfra:         c.IsInfra(),
 		IsService:       c.IsService(),
 	}
+	if ctrSpec.Process.Capabilities != nil {
+		data.EffectiveCaps = ctrSpec.Process.Capabilities.Effective
+		data.BoundingCaps = ctrSpec.Process.Capabilities.Bounding
+	}
 
 	if c.state.ConfigPath != "" {
 		data.OCIConfigPath = c.state.ConfigPath
@@ -484,11 +481,6 @@ func (c *Container) generateInspectContainerHostConfig(ctrSpec *spec.Spec, named
 	hostConfig.ShmSize = c.config.ShmSize
 	hostConfig.Runtime = "oci"
 
-	// This is very expensive to initialize.
-	// So we don't want to initialize it unless we absolutely have to - IE,
-	// there are things that require a major:minor to path translation.
-	var deviceNodes map[string]string
-
 	// Annotations
 	if ctrSpec.Annotations != nil {
 		hostConfig.ContainerIDFile = ctrSpec.Annotations[define.InspectAnnotationCIDFile]
@@ -506,109 +498,8 @@ func (c *Container) generateInspectContainerHostConfig(ctrSpec *spec.Spec, named
 		}
 	}
 
-	// Resource limits
-	if ctrSpec.Linux != nil {
-		if ctrSpec.Linux.Resources != nil {
-			if ctrSpec.Linux.Resources.CPU != nil {
-				if ctrSpec.Linux.Resources.CPU.Shares != nil {
-					hostConfig.CpuShares = *ctrSpec.Linux.Resources.CPU.Shares
-				}
-				if ctrSpec.Linux.Resources.CPU.Period != nil {
-					hostConfig.CpuPeriod = *ctrSpec.Linux.Resources.CPU.Period
-				}
-				if ctrSpec.Linux.Resources.CPU.Quota != nil {
-					hostConfig.CpuQuota = *ctrSpec.Linux.Resources.CPU.Quota
-				}
-				if ctrSpec.Linux.Resources.CPU.RealtimePeriod != nil {
-					hostConfig.CpuRealtimePeriod = *ctrSpec.Linux.Resources.CPU.RealtimePeriod
-				}
-				if ctrSpec.Linux.Resources.CPU.RealtimeRuntime != nil {
-					hostConfig.CpuRealtimeRuntime = *ctrSpec.Linux.Resources.CPU.RealtimeRuntime
-				}
-				hostConfig.CpusetCpus = ctrSpec.Linux.Resources.CPU.Cpus
-				hostConfig.CpusetMems = ctrSpec.Linux.Resources.CPU.Mems
-			}
-			if ctrSpec.Linux.Resources.Memory != nil {
-				if ctrSpec.Linux.Resources.Memory.Limit != nil {
-					hostConfig.Memory = *ctrSpec.Linux.Resources.Memory.Limit
-				}
-				if ctrSpec.Linux.Resources.Memory.Reservation != nil {
-					hostConfig.MemoryReservation = *ctrSpec.Linux.Resources.Memory.Reservation
-				}
-				if ctrSpec.Linux.Resources.Memory.Swap != nil {
-					hostConfig.MemorySwap = *ctrSpec.Linux.Resources.Memory.Swap
-				}
-				if ctrSpec.Linux.Resources.Memory.Swappiness != nil {
-					hostConfig.MemorySwappiness = int64(*ctrSpec.Linux.Resources.Memory.Swappiness)
-				} else {
-					// Swappiness has a default of -1
-					hostConfig.MemorySwappiness = -1
-				}
-				if ctrSpec.Linux.Resources.Memory.DisableOOMKiller != nil {
-					hostConfig.OomKillDisable = *ctrSpec.Linux.Resources.Memory.DisableOOMKiller
-				}
-			}
-			if ctrSpec.Linux.Resources.Pids != nil {
-				hostConfig.PidsLimit = ctrSpec.Linux.Resources.Pids.Limit
-			}
-			hostConfig.CgroupConf = ctrSpec.Linux.Resources.Unified
-			if ctrSpec.Linux.Resources.BlockIO != nil {
-				if ctrSpec.Linux.Resources.BlockIO.Weight != nil {
-					hostConfig.BlkioWeight = *ctrSpec.Linux.Resources.BlockIO.Weight
-				}
-				hostConfig.BlkioWeightDevice = []define.InspectBlkioWeightDevice{}
-				for _, dev := range ctrSpec.Linux.Resources.BlockIO.WeightDevice {
-					key := fmt.Sprintf("%d:%d", dev.Major, dev.Minor)
-					// TODO: how do we handle LeafWeight vs
-					// Weight? For now, ignore anything
-					// without Weight set.
-					if dev.Weight == nil {
-						logrus.Infof("Ignoring weight device %s as it lacks a weight", key)
-						continue
-					}
-					if deviceNodes == nil {
-						nodes, err := util.FindDeviceNodes()
-						if err != nil {
-							return nil, err
-						}
-						deviceNodes = nodes
-					}
-					path, ok := deviceNodes[key]
-					if !ok {
-						logrus.Infof("Could not locate weight device %s in system devices", key)
-						continue
-					}
-					weightDev := define.InspectBlkioWeightDevice{}
-					weightDev.Path = path
-					weightDev.Weight = *dev.Weight
-					hostConfig.BlkioWeightDevice = append(hostConfig.BlkioWeightDevice, weightDev)
-				}
-
-				readBps, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleReadBpsDevice)
-				if err != nil {
-					return nil, err
-				}
-				hostConfig.BlkioDeviceReadBps = readBps
-
-				writeBps, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleWriteBpsDevice)
-				if err != nil {
-					return nil, err
-				}
-				hostConfig.BlkioDeviceWriteBps = writeBps
-
-				readIops, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleReadIOPSDevice)
-				if err != nil {
-					return nil, err
-				}
-				hostConfig.BlkioDeviceReadIOps = readIops
-
-				writeIops, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleWriteIOPSDevice)
-				if err != nil {
-					return nil, err
-				}
-				hostConfig.BlkioDeviceWriteIOps = writeIops
-			}
-		}
+	if err := c.platformInspectContainerHostConfig(ctrSpec, hostConfig); err != nil {
+		return nil, err
 	}
 
 	// NanoCPUs.
@@ -659,182 +550,6 @@ func (c *Container) generateInspectContainerHostConfig(ctrSpec *spec.Spec, named
 		hostConfig.PortBindings = make(map[string][]define.InspectHostPort)
 	}
 
-	// Cap add and cap drop.
-	// We need a default set of capabilities to compare against.
-	// The OCI generate package has one, and is commonly used, so we'll
-	// use it.
-	// Problem: there are 5 sets of capabilities.
-	// Use the bounding set for this computation, it's the most encompassing
-	// (but still not perfect).
-	capAdd := []string{}
-	capDrop := []string{}
-	// No point in continuing if we got a spec without a Process block...
-	if ctrSpec.Process != nil {
-		// Max an O(1) lookup table for default bounding caps.
-		boundingCaps := make(map[string]bool)
-		g, err := generate.New("linux")
-		if err != nil {
-			return nil, err
-		}
-		if !hostConfig.Privileged {
-			for _, cap := range g.Config.Process.Capabilities.Bounding {
-				boundingCaps[cap] = true
-			}
-		} else {
-			// If we are privileged, use all caps.
-			for _, cap := range capability.List() {
-				if g.HostSpecific && cap > validate.LastCap() {
-					continue
-				}
-				boundingCaps[fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))] = true
-			}
-		}
-		// Iterate through spec caps.
-		// If it's not in default bounding caps, it was added.
-		// If it is, delete from the default set. Whatever remains after
-		// we finish are the dropped caps.
-		for _, cap := range ctrSpec.Process.Capabilities.Bounding {
-			if _, ok := boundingCaps[cap]; ok {
-				delete(boundingCaps, cap)
-			} else {
-				capAdd = append(capAdd, cap)
-			}
-		}
-		for cap := range boundingCaps {
-			capDrop = append(capDrop, cap)
-		}
-		// Sort CapDrop so it displays in consistent order (GH #9490)
-		sort.Strings(capDrop)
-	}
-	hostConfig.CapAdd = capAdd
-	hostConfig.CapDrop = capDrop
-	switch {
-	case c.config.IPCNsCtr != "":
-		hostConfig.IpcMode = fmt.Sprintf("container:%s", c.config.IPCNsCtr)
-	case ctrSpec.Linux != nil:
-		// Locate the spec's IPC namespace.
-		// If there is none, it's ipc=host.
-		// If there is one and it has a path, it's "ns:".
-		// If no path, it's default - the empty string.
-		for _, ns := range ctrSpec.Linux.Namespaces {
-			if ns.Type == spec.IPCNamespace {
-				if ns.Path != "" {
-					hostConfig.IpcMode = fmt.Sprintf("ns:%s", ns.Path)
-				} else {
-					break
-				}
-			}
-		}
-	case c.config.NoShm:
-		hostConfig.IpcMode = "none"
-	case c.config.NoShmShare:
-		hostConfig.IpcMode = "private"
-	}
-	if hostConfig.IpcMode == "" {
-		hostConfig.IpcMode = "shareable"
-	}
-
-	// Cgroup namespace mode
-	cgroupMode := ""
-	if c.config.CgroupNsCtr != "" {
-		cgroupMode = fmt.Sprintf("container:%s", c.config.CgroupNsCtr)
-	} else if ctrSpec.Linux != nil {
-		// Locate the spec's cgroup namespace
-		// If there is none, it's cgroup=host.
-		// If there is one and it has a path, it's "ns:".
-		// If there is no path, it's private.
-		for _, ns := range ctrSpec.Linux.Namespaces {
-			if ns.Type == spec.CgroupNamespace {
-				if ns.Path != "" {
-					cgroupMode = fmt.Sprintf("ns:%s", ns.Path)
-				} else {
-					cgroupMode = "private"
-				}
-			}
-		}
-		if cgroupMode == "" {
-			cgroupMode = "host"
-		}
-	}
-	hostConfig.CgroupMode = cgroupMode
-
-	// Cgroup parent
-	// Need to check if it's the default, and not print if so.
-	defaultCgroupParent := ""
-	switch c.CgroupManager() {
-	case config.CgroupfsCgroupsManager:
-		defaultCgroupParent = CgroupfsDefaultCgroupParent
-	case config.SystemdCgroupsManager:
-		defaultCgroupParent = SystemdDefaultCgroupParent
-	}
-	if c.config.CgroupParent != defaultCgroupParent {
-		hostConfig.CgroupParent = c.config.CgroupParent
-	}
-	hostConfig.CgroupManager = c.CgroupManager()
-
-	// PID namespace mode
-	pidMode := ""
-	if c.config.PIDNsCtr != "" {
-		pidMode = fmt.Sprintf("container:%s", c.config.PIDNsCtr)
-	} else if ctrSpec.Linux != nil {
-		// Locate the spec's PID namespace.
-		// If there is none, it's pid=host.
-		// If there is one and it has a path, it's "ns:".
-		// If there is no path, it's default - the empty string.
-		for _, ns := range ctrSpec.Linux.Namespaces {
-			if ns.Type == spec.PIDNamespace {
-				if ns.Path != "" {
-					pidMode = fmt.Sprintf("ns:%s", ns.Path)
-				} else {
-					pidMode = "private"
-				}
-				break
-			}
-		}
-		if pidMode == "" {
-			pidMode = "host"
-		}
-	}
-	hostConfig.PidMode = pidMode
-
-	// UTS namespace mode
-	utsMode := c.NamespaceMode(spec.UTSNamespace, ctrSpec)
-
-	hostConfig.UTSMode = utsMode
-
-	// User namespace mode
-	usernsMode := ""
-	if c.config.UserNsCtr != "" {
-		usernsMode = fmt.Sprintf("container:%s", c.config.UserNsCtr)
-	} else if ctrSpec.Linux != nil {
-		// Locate the spec's user namespace.
-		// If there is none, it's default - the empty string.
-		// If there is one, it's "private" if no path, or "ns:" if
-		// there's a path.
-
-		for _, ns := range ctrSpec.Linux.Namespaces {
-			if ns.Type == spec.UserNamespace {
-				if ns.Path != "" {
-					usernsMode = fmt.Sprintf("ns:%s", ns.Path)
-				} else {
-					usernsMode = "private"
-				}
-			}
-		}
-	}
-	hostConfig.UsernsMode = usernsMode
-	if c.config.IDMappings.UIDMap != nil && c.config.IDMappings.GIDMap != nil {
-		hostConfig.IDMappings = generateIDMappings(c.config.IDMappings)
-	}
-	// Devices
-	// Do not include if privileged - assumed that all devices will be
-	// included.
-	var err error
-	hostConfig.Devices, err = c.GetDevices(hostConfig.Privileged, *ctrSpec, deviceNodes)
-	if err != nil {
-		return nil, err
-	}
-
 	// Ulimits
 	hostConfig.Ulimits = []define.InspectUlimit{}
 	if ctrSpec.Process != nil {
diff --git a/libpod/container_inspect_freebsd.go b/libpod/container_inspect_freebsd.go
new file mode 100644
index 000000000..8b4e8df87
--- /dev/null
+++ b/libpod/container_inspect_freebsd.go
@@ -0,0 +1,17 @@
+package libpod
+
+import (
+	"github.com/containers/podman/v4/libpod/define"
+	spec "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// platformInspectContainerHostConfig fills in the FreeBSD-specific fields of
+// hostConfig (PidMode and UTSMode). It always returns nil.
+func (c *Container) platformInspectContainerHostConfig(ctrSpec *spec.Spec, hostConfig *define.InspectContainerHostConfig) error {
+	// Not sure what to put here. FreeBSD jails use pids from the
+	// global pool but can only see their own pids.
+	hostConfig.PidMode = "host"
+	// UTS namespace mode
+	hostConfig.UTSMode = c.NamespaceMode(spec.UTSNamespace, ctrSpec)
+	return nil
+}
diff --git a/libpod/container_inspect_linux.go b/libpod/container_inspect_linux.go
new file mode 100644
index 000000000..355690d70
--- /dev/null
+++ b/libpod/container_inspect_linux.go
@@ -0,0 +1,306 @@
+package libpod
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+
+	"github.com/containers/common/pkg/config"
+	"github.com/containers/podman/v4/libpod/define"
+	"github.com/containers/podman/v4/pkg/util"
+	spec "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/opencontainers/runtime-tools/generate"
+	"github.com/opencontainers/runtime-tools/validate"
+	"github.com/sirupsen/logrus"
+	"github.com/syndtr/gocapability/capability"
+)
+
+// platformInspectContainerHostConfig fills in the Linux-specific parts of
+// hostConfig from ctrSpec and the container config: resource limits, added and
+// dropped capabilities, namespace and cgroup modes, ID mappings and devices.
+// It returns any error hit while resolving device nodes or default caps.
+func (c *Container) platformInspectContainerHostConfig(ctrSpec *spec.Spec, hostConfig *define.InspectContainerHostConfig) error {
+	// This is very expensive to initialize.
+	// So we don't want to initialize it unless we absolutely have to - IE,
+	// there are things that require a major:minor to path translation.
+	var deviceNodes map[string]string
+
+	// Resource limits
+	if ctrSpec.Linux != nil {
+		if ctrSpec.Linux.Resources != nil {
+			if ctrSpec.Linux.Resources.CPU != nil {
+				if ctrSpec.Linux.Resources.CPU.Shares != nil {
+					hostConfig.CpuShares = *ctrSpec.Linux.Resources.CPU.Shares
+				}
+				if ctrSpec.Linux.Resources.CPU.Period != nil {
+					hostConfig.CpuPeriod = *ctrSpec.Linux.Resources.CPU.Period
+				}
+				if ctrSpec.Linux.Resources.CPU.Quota != nil {
+					hostConfig.CpuQuota = *ctrSpec.Linux.Resources.CPU.Quota
+				}
+				if ctrSpec.Linux.Resources.CPU.RealtimePeriod != nil {
+					hostConfig.CpuRealtimePeriod = *ctrSpec.Linux.Resources.CPU.RealtimePeriod
+				}
+				if ctrSpec.Linux.Resources.CPU.RealtimeRuntime != nil {
+					hostConfig.CpuRealtimeRuntime = *ctrSpec.Linux.Resources.CPU.RealtimeRuntime
+				}
+				hostConfig.CpusetCpus = ctrSpec.Linux.Resources.CPU.Cpus
+				hostConfig.CpusetMems = ctrSpec.Linux.Resources.CPU.Mems
+			}
+			if ctrSpec.Linux.Resources.Memory != nil {
+				if ctrSpec.Linux.Resources.Memory.Limit != nil {
+					hostConfig.Memory = *ctrSpec.Linux.Resources.Memory.Limit
+				}
+				if ctrSpec.Linux.Resources.Memory.Reservation != nil {
+					hostConfig.MemoryReservation = *ctrSpec.Linux.Resources.Memory.Reservation
+				}
+				if ctrSpec.Linux.Resources.Memory.Swap != nil {
+					hostConfig.MemorySwap = *ctrSpec.Linux.Resources.Memory.Swap
+				}
+				if ctrSpec.Linux.Resources.Memory.Swappiness != nil {
+					hostConfig.MemorySwappiness = int64(*ctrSpec.Linux.Resources.Memory.Swappiness)
+				} else {
+					// Swappiness has a default of -1
+					hostConfig.MemorySwappiness = -1
+				}
+				if ctrSpec.Linux.Resources.Memory.DisableOOMKiller != nil {
+					hostConfig.OomKillDisable = *ctrSpec.Linux.Resources.Memory.DisableOOMKiller
+				}
+			}
+			if ctrSpec.Linux.Resources.Pids != nil {
+				hostConfig.PidsLimit = ctrSpec.Linux.Resources.Pids.Limit
+			}
+			hostConfig.CgroupConf = ctrSpec.Linux.Resources.Unified
+			if ctrSpec.Linux.Resources.BlockIO != nil {
+				if ctrSpec.Linux.Resources.BlockIO.Weight != nil {
+					hostConfig.BlkioWeight = *ctrSpec.Linux.Resources.BlockIO.Weight
+				}
+				hostConfig.BlkioWeightDevice = []define.InspectBlkioWeightDevice{}
+				for _, dev := range ctrSpec.Linux.Resources.BlockIO.WeightDevice {
+					key := fmt.Sprintf("%d:%d", dev.Major, dev.Minor)
+					// TODO: how do we handle LeafWeight vs Weight?
+					// For now, ignore anything without Weight set.
+					if dev.Weight == nil {
+						logrus.Infof("Ignoring weight device %s as it lacks a weight", key)
+						continue
+					}
+					if deviceNodes == nil {
+						nodes, err := util.FindDeviceNodes()
+						if err != nil {
+							return err
+						}
+						deviceNodes = nodes
+					}
+					path, ok := deviceNodes[key]
+					if !ok {
+						logrus.Infof("Could not locate weight device %s in system devices", key)
+						continue
+					}
+					weightDev := define.InspectBlkioWeightDevice{}
+					weightDev.Path = path
+					weightDev.Weight = *dev.Weight
+					hostConfig.BlkioWeightDevice = append(hostConfig.BlkioWeightDevice, weightDev)
+				}
+
+				readBps, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleReadBpsDevice)
+				if err != nil {
+					return err
+				}
+				hostConfig.BlkioDeviceReadBps = readBps
+
+				writeBps, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleWriteBpsDevice)
+				if err != nil {
+					return err
+				}
+				hostConfig.BlkioDeviceWriteBps = writeBps
+
+				readIops, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleReadIOPSDevice)
+				if err != nil {
+					return err
+				}
+				hostConfig.BlkioDeviceReadIOps = readIops
+
+				writeIops, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleWriteIOPSDevice)
+				if err != nil {
+					return err
+				}
+				hostConfig.BlkioDeviceWriteIOps = writeIops
+			}
+		}
+	}
+
+	// Cap add and cap drop.
+	// We need a default set of capabilities to compare against.
+	// The OCI generate package has one, and is commonly used, so we'll use it.
+	// Problem: there are 5 sets of capabilities.
+	// Use the bounding set for this computation, it's the most encompassing
+	// (but still not perfect).
+	capAdd := []string{}
+	capDrop := []string{}
+	// No point in continuing if we got a spec without a Process block...
+	if ctrSpec.Process != nil {
+		// Make an O(1) lookup table for default bounding caps.
+		boundingCaps := make(map[string]bool)
+		g, err := generate.New("linux")
+		if err != nil {
+			return err
+		}
+		if !hostConfig.Privileged {
+			for _, cap := range g.Config.Process.Capabilities.Bounding {
+				boundingCaps[cap] = true
+			}
+		} else {
+			// If we are privileged, use all caps.
+			for _, cap := range capability.List() {
+				if g.HostSpecific && cap > validate.LastCap() {
+					continue
+				}
+				boundingCaps[fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))] = true
+			}
+		}
+		// Iterate through spec caps. If a cap is not in the default bounding
+		// set, it was added. If it is, delete it from the default set. Whatever
+		// remains after we finish are the dropped caps. A spec without a
+		// Capabilities block contributes no caps, so every default is dropped.
+		if ctrSpec.Process.Capabilities != nil {
+			for _, cap := range ctrSpec.Process.Capabilities.Bounding {
+				if _, ok := boundingCaps[cap]; ok {
+					delete(boundingCaps, cap)
+				} else {
+					capAdd = append(capAdd, cap)
+				}
+			}
+		}
+		for cap := range boundingCaps {
+			capDrop = append(capDrop, cap)
+		}
+		// Sort CapDrop so it displays in consistent order (GH #9490)
+		sort.Strings(capDrop)
+	}
+	hostConfig.CapAdd = capAdd
+	hostConfig.CapDrop = capDrop
+	switch {
+	case c.config.IPCNsCtr != "":
+		hostConfig.IpcMode = fmt.Sprintf("container:%s", c.config.IPCNsCtr)
+	case ctrSpec.Linux != nil:
+		// NOTE(review): this case precedes NoShm/NoShmShare, so those are only
+		// reached when ctrSpec.Linux is nil - confirm that is intended.
+		// Locate the spec's IPC namespace. With a path it's "ns:"; otherwise
+		// IpcMode stays empty and falls through to "shareable" below.
+		for _, ns := range ctrSpec.Linux.Namespaces {
+			if ns.Type == spec.IPCNamespace {
+				if ns.Path != "" {
+					hostConfig.IpcMode = fmt.Sprintf("ns:%s", ns.Path)
+				} else {
+					break
+				}
+			}
+		}
+	case c.config.NoShm:
+		hostConfig.IpcMode = "none"
+	case c.config.NoShmShare:
+		hostConfig.IpcMode = "private"
+	}
+	if hostConfig.IpcMode == "" {
+		hostConfig.IpcMode = "shareable"
+	}
+
+	// Cgroup namespace mode
+	cgroupMode := ""
+	if c.config.CgroupNsCtr != "" {
+		cgroupMode = fmt.Sprintf("container:%s", c.config.CgroupNsCtr)
+	} else if ctrSpec.Linux != nil {
+		// Locate the spec's cgroup namespace
+		// If there is none, it's cgroup=host.
+		// If there is one and it has a path, it's "ns:".
+		// If there is no path, it's private.
+		for _, ns := range ctrSpec.Linux.Namespaces {
+			if ns.Type == spec.CgroupNamespace {
+				if ns.Path != "" {
+					cgroupMode = fmt.Sprintf("ns:%s", ns.Path)
+				} else {
+					cgroupMode = "private"
+				}
+			}
+		}
+		if cgroupMode == "" {
+			cgroupMode = "host"
+		}
+	}
+	hostConfig.CgroupMode = cgroupMode
+
+	// Cgroup parent
+	// Need to check if it's the default, and not print if so.
+	defaultCgroupParent := ""
+	switch c.CgroupManager() {
+	case config.CgroupfsCgroupsManager:
+		defaultCgroupParent = CgroupfsDefaultCgroupParent
+	case config.SystemdCgroupsManager:
+		defaultCgroupParent = SystemdDefaultCgroupParent
+	}
+	if c.config.CgroupParent != defaultCgroupParent {
+		hostConfig.CgroupParent = c.config.CgroupParent
+	}
+	hostConfig.CgroupManager = c.CgroupManager()
+
+	// PID namespace mode
+	pidMode := ""
+	if c.config.PIDNsCtr != "" {
+		pidMode = fmt.Sprintf("container:%s", c.config.PIDNsCtr)
+	} else if ctrSpec.Linux != nil {
+		// Locate the spec's PID namespace.
+		// If there is none, it's pid=host.
+		// If there is one and it has a path, it's "ns:".
+		// If there is no path, it's private.
+		for _, ns := range ctrSpec.Linux.Namespaces {
+			if ns.Type == spec.PIDNamespace {
+				if ns.Path != "" {
+					pidMode = fmt.Sprintf("ns:%s", ns.Path)
+				} else {
+					pidMode = "private"
+				}
+				break
+			}
+		}
+		if pidMode == "" {
+			pidMode = "host"
+		}
+	}
+	hostConfig.PidMode = pidMode
+
+	// UTS namespace mode
+	hostConfig.UTSMode = c.NamespaceMode(spec.UTSNamespace, ctrSpec)
+
+	// User namespace mode
+	usernsMode := ""
+	if c.config.UserNsCtr != "" {
+		usernsMode = fmt.Sprintf("container:%s", c.config.UserNsCtr)
+	} else if ctrSpec.Linux != nil {
+		// Locate the spec's user namespace.
+		// If there is none, it's default - the empty string. If there is one,
+		// it's "private" if no path, or "ns:" if there's a path.
+		for _, ns := range ctrSpec.Linux.Namespaces {
+			if ns.Type == spec.UserNamespace {
+				if ns.Path != "" {
+					usernsMode = fmt.Sprintf("ns:%s", ns.Path)
+				} else {
+					usernsMode = "private"
+				}
+			}
+		}
+	}
+	hostConfig.UsernsMode = usernsMode
+	if c.config.IDMappings.UIDMap != nil && c.config.IDMappings.GIDMap != nil {
+		hostConfig.IDMappings = generateIDMappings(c.config.IDMappings)
+	}
+	// Devices
+	// Do not include if privileged - assumed that all devices will be
+	// included.
+	var err error
+	hostConfig.Devices, err = c.GetDevices(hostConfig.Privileged, *ctrSpec, deviceNodes)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
diff --git a/libpod/pod_api.go b/libpod/pod_api.go
index 1bd686ddc..924d43436 100644
--- a/libpod/pod_api.go
+++ b/libpod/pod_api.go
@@ -40,7 +40,7 @@ func (p *Pod) startInitContainers(ctx context.Context) error {
 			icLock := initCon.lock
 			icLock.Lock()
 			var time *uint
-			if err := p.runtime.removeContainer(ctx, initCon, false, false, true, time); err != nil {
+			if err := p.runtime.removeContainer(ctx, initCon, false, false, true, false, time); err != nil {
 				icLock.Unlock()
 				return fmt.Errorf("failed to remove once init container %s: %w", initCon.ID(), err)
 			}
diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go
index 1f032dd6b..7b3cbadfa 100644
--- a/libpod/runtime_ctr.go
+++ b/libpod/runtime_ctr.go
@@ -581,7 +581,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai
 // be removed also if and only if the container is the sole user
 // Otherwise, RemoveContainer will return an error if the container is running
 func (r *Runtime) RemoveContainer(ctx context.Context, c *Container, force bool, removeVolume bool, timeout *uint) error {
-	return r.removeContainer(ctx, c, force, removeVolume, false, timeout)
+	return r.removeContainer(ctx, c, force, removeVolume, false, false, timeout)
 }
 
 // Internal function to remove a container.
@@ -589,7 +589,9 @@ func (r *Runtime) RemoveContainer(ctx context.Context, c *Container, force bool,
 // removePod is used only when removing pods. It instructs Podman to ignore
 // infra container protections, and *not* remove from the database (as pod
 // remove will handle that).
-func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, removeVolume, removePod bool, timeout *uint) error {
+// ignoreDeps is *DANGEROUS* and should not be used outside of a very specific
+// context (alternate pod removal code, where graph traversal is not possible).
+func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, removeVolume, removePod, ignoreDeps bool, timeout *uint) error {
 	if !c.valid {
 		if ok, _ := r.state.HasContainer(c.ID()); !ok {
 			// Container probably already removed
@@ -618,25 +620,27 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo
 	// pod.
 	var pod *Pod
 	runtime := c.runtime
-	if c.config.Pod != "" && !removePod {
+	if c.config.Pod != "" {
 		pod, err = r.state.Pod(c.config.Pod)
 		if err != nil {
 			return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), pod.ID(), err)
 		}
 
-		// Lock the pod while we're removing container
-		if pod.config.LockID == c.config.LockID {
-			return fmt.Errorf("container %s and pod %s share lock ID %d: %w", c.ID(), pod.ID(), c.config.LockID, define.ErrWillDeadlock)
-		}
-		pod.lock.Lock()
-		defer pod.lock.Unlock()
-		if err := pod.updatePod(); err != nil {
-			return err
-		}
+		if !removePod {
+			// Lock the pod while we're removing container
+			if pod.config.LockID == c.config.LockID {
+				return fmt.Errorf("container %s and pod %s share lock ID %d: %w", c.ID(), pod.ID(), c.config.LockID, define.ErrWillDeadlock)
+			}
+			pod.lock.Lock()
+			defer pod.lock.Unlock()
+			if err := pod.updatePod(); err != nil {
+				return err
+			}
 
-		infraID := pod.state.InfraContainerID
-		if c.ID() == infraID {
-			return fmt.Errorf("container %s is the infra container of pod %s and cannot be removed without removing the pod", c.ID(), pod.ID())
+			infraID := pod.state.InfraContainerID
+			if c.ID() == infraID {
+				return fmt.Errorf("container %s is the infra container of pod %s and cannot be removed without removing the pod", c.ID(), pod.ID())
+			}
 		}
 	}
 
@@ -696,7 +700,7 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo
 	// Check that no other containers depend on the container.
 	// Only used if not removing a pod - pods guarantee that all
 	// deps will be evicted at the same time.
-	if !removePod {
+	if !ignoreDeps {
 		deps, err := r.state.ContainerInUse(c)
 		if err != nil {
 			return err
@@ -777,13 +781,11 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo
 	if c.config.Pod != "" {
 		// If we're removing the pod, the container will be evicted
 		// from the state elsewhere
-		if !removePod {
-			if err := r.state.RemoveContainerFromPod(pod, c); err != nil {
-				if cleanupErr == nil {
-					cleanupErr = err
-				} else {
-					logrus.Errorf("Removing container %s from database: %v", c.ID(), err)
-				}
+		if err := r.state.RemoveContainerFromPod(pod, c); err != nil {
+			if cleanupErr == nil {
+				cleanupErr = err
+			} else {
+				logrus.Errorf("Removing container %s from database: %v", c.ID(), err)
 			}
 		}
 	} else {
@@ -872,7 +874,7 @@ func (r *Runtime) evictContainer(ctx context.Context, idOrName string, removeVol
 	if err == nil {
 		logrus.Infof("Container %s successfully retrieved from state, attempting normal removal", id)
 		// Assume force = true for the evict case
-		err = r.removeContainer(ctx, tmpCtr, true, removeVolume, false, timeout)
+		err = r.removeContainer(ctx, tmpCtr, true, removeVolume, false, false, timeout)
 		if !tmpCtr.valid {
 			// If the container is marked invalid, remove succeeded
 			// in kicking it out of the state - no need to continue.
@@ -1034,7 +1036,7 @@ func (r *Runtime) RemoveDepend(ctx context.Context, rmCtr *Container, force bool
 	}
 
 	report := reports.RmReport{Id: rmCtr.ID(), RawInput: rmCtr.ID()}
-	report.Err = r.removeContainer(ctx, rmCtr, force, removeVolume, false, timeout)
+	report.Err = r.removeContainer(ctx, rmCtr, force, removeVolume, false, false, timeout)
 	return append(rmReports, &report), nil
 }
 
diff --git a/libpod/runtime_img.go b/libpod/runtime_img.go
index 5510b2af6..dacbd752f 100644
--- a/libpod/runtime_img.go
+++ b/libpod/runtime_img.go
@@ -47,7 +47,7 @@ func (r *Runtime) RemoveContainersForImageCallback(ctx context.Context) libimage
 					return fmt.Errorf("removing image %s: container %s using image could not be removed: %w", imageID, ctr.ID(), err)
 				}
 			} else {
-				if err := r.removeContainer(ctx, ctr, true, false, false, timeout); err != nil {
+				if err := r.removeContainer(ctx, ctr, true, false, false, false, timeout); err != nil {
 					return fmt.Errorf("removing image %s: container %s using image could not be removed: %w", imageID, ctr.ID(), err)
 				}
 			}
diff --git a/libpod/runtime_pod_linux.go b/libpod/runtime_pod_linux.go
index 3eeef69d8..24e9f3da7 100644
--- a/libpod/runtime_pod_linux.go
+++ b/libpod/runtime_pod_linux.go
@@ -17,6 +17,7 @@ import (
 	"github.com/containers/podman/v4/libpod/events"
 	"github.com/containers/podman/v4/pkg/rootless"
 	"github.com/containers/podman/v4/pkg/specgen"
+	"github.com/hashicorp/go-multierror"
 	"github.com/sirupsen/logrus"
 )
 
@@ -191,29 +192,9 @@ func (r *Runtime) SavePod(pod *Pod) error {
 	return nil
 }
 
-func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, timeout *uint) error {
-	if err := p.updatePod(); err != nil {
-		return err
-	}
-
-	ctrs, err := r.state.PodContainers(p)
-	if err != nil {
-		return err
-	}
-	numCtrs := len(ctrs)
-
-	// If the only running container in the pod is the pause container, remove the pod and container unconditionally.
-	pauseCtrID := p.state.InfraContainerID
-	if numCtrs == 1 && ctrs[0].ID() == pauseCtrID {
-		removeCtrs = true
-		force = true
-	}
-	if !removeCtrs && numCtrs > 0 {
-		return fmt.Errorf("pod %s contains containers and cannot be removed: %w", p.ID(), define.ErrCtrExists)
-	}
-
-	ctrNamedVolumes := make(map[string]*ContainerNamedVolume)
-
+// DO NOT USE THIS FUNCTION DIRECTLY. Use removePod(), below. It will call
+// removeMalformedPod() if necessary.
+func (r *Runtime) removeMalformedPod(ctx context.Context, p *Pod, ctrs []*Container, force bool, timeout *uint, ctrNamedVolumes map[string]*ContainerNamedVolume) error {
 	var removalErr error
 	for _, ctr := range ctrs {
 		err := func() error {
@@ -231,7 +212,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool,
 				ctrNamedVolumes[vol.Name] = vol
 			}
 
-			return r.removeContainer(ctx, ctr, force, false, true, timeout)
+			return r.removeContainer(ctx, ctr, force, false, true, true, timeout)
 		}()
 
 		if removalErr == nil {
@@ -261,6 +242,80 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool,
 		return err
 	}
 
+	return nil
+}
+
+// removePod removes the given pod together with its containers. Containers
+// are removed by walking the pod's container graph; if the graph cannot be
+// built and force is set, removal falls back to removeMalformedPod().
+// Unless removeCtrs is set, a pod that still holds containers (other than a
+// lone infra container) is rejected with an error wrapping ErrCtrExists.
+func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, timeout *uint) error {
+	if err := p.updatePod(); err != nil {
+		return err
+	}
+
+	ctrs, err := r.state.PodContainers(p)
+	if err != nil {
+		return err
+	}
+	numCtrs := len(ctrs)
+
+	// If the only container in the pod is the infra (pause) container, remove the pod and container unconditionally.
+	pauseCtrID := p.state.InfraContainerID
+	if numCtrs == 1 && ctrs[0].ID() == pauseCtrID {
+		removeCtrs = true
+		force = true
+	}
+	if !removeCtrs && numCtrs > 0 {
+		return fmt.Errorf("pod %s contains containers and cannot be removed: %w", p.ID(), define.ErrCtrExists)
+	}
+
+	var removalErr error
+	ctrNamedVolumes := make(map[string]*ContainerNamedVolume)
+
+	// Build a graph of all containers in the pod.
+	graph, err := BuildContainerGraph(ctrs)
+	if err != nil {
+		// We have to allow the pod to be removed.
+		// But let's only do it if force is set.
+		if !force {
+			return fmt.Errorf("cannot create container graph for pod %s: %w", p.ID(), err)
+		}
+
+		removalErr = fmt.Errorf("creating container graph for pod %s failed, fell back to loop removal: %w", p.ID(), err)
+
+		// Log the graph error before falling back. If the fallback
+		// itself fails we return its error, and this one would
+		// otherwise be lost.
+		logrus.Errorf("Error creating container graph for pod %s: %v. Falling back to loop removal.", p.ID(), err)
+
+		if err := r.removeMalformedPod(ctx, p, ctrs, force, timeout, ctrNamedVolumes); err != nil {
+			return err
+		}
+	} else {
+		ctrErrors := make(map[string]error)
+		ctrsVisited := make(map[string]bool)
+
+		// Start at nodes nothing depends on: removeNode will not remove
+		// a container until everything depending on it was visited.
+		for _, node := range graph.notDependedOnNodes {
+			removeNode(ctx, node, p, force, timeout, false, ctrErrors, ctrsVisited, ctrNamedVolumes)
+		}
+
+		// This is gross, but I don't want to change the signature on
+		// removePod - especially since any change here eventually has
+		// to map down to one error unless we want to make a breaking
+		// API change.
+		if len(ctrErrors) > 0 {
+			var allErrs error
+			for id, err := range ctrErrors {
+				allErrs = multierror.Append(allErrs, fmt.Errorf("removing container %s from pod %s: %w", id, p.ID(), err))
+			}
+			return allErrs
+		}
+	}
+
 	for volName := range ctrNamedVolumes {
 		volume, err := r.state.Volume(volName)
 		if err != nil && !errors.Is(err, define.ErrNoSuchVolume) {
diff --git a/libpod/runtime_volume_linux.go b/libpod/runtime_volume_linux.go
index c9a4a7dc1..08fdbf977 100644
--- a/libpod/runtime_volume_linux.go
+++ b/libpod/runtime_volume_linux.go
@@ -324,7 +324,7 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeo
 
 			logrus.Debugf("Removing container %s (depends on volume %q)", ctr.ID(), v.Name())
 
-			if err := r.removeContainer(ctx, ctr, force, false, false, timeout); err != nil {
+			if err := r.removeContainer(ctx, ctr, force, false, false, false, timeout); err != nil {
 				return fmt.Errorf("removing container %s that depends on volume %s: %w", ctr.ID(), v.Name(), err)
 			}
 		}