diff options
-rw-r--r-- | cmd/podman/common/create.go | 2 | ||||
-rw-r--r-- | cmd/podman/containers/create.go | 4 | ||||
-rw-r--r-- | docs/source/markdown/podman-create.1.md | 5 | ||||
-rw-r--r-- | docs/source/markdown/podman-run.1.md | 10 | ||||
-rw-r--r-- | libpod/container.go | 22 | ||||
-rw-r--r-- | libpod/container_internal_linux.go | 12 | ||||
-rw-r--r-- | libpod/container_validate.go | 4 | ||||
-rw-r--r-- | libpod/oci_conmon.go | 7 | ||||
-rw-r--r-- | libpod/oci_conmon_linux.go | 10 | ||||
-rw-r--r-- | libpod/options.go | 2 | ||||
-rw-r--r-- | libpod/runtime_ctr.go | 4 | ||||
-rw-r--r-- | pkg/api/handlers/compat/networks.go | 2 | ||||
-rw-r--r-- | pkg/api/server/server.go | 4 | ||||
-rw-r--r-- | pkg/systemd/generate/common.go | 5 | ||||
-rw-r--r-- | pkg/systemd/generate/common_test.go | 9 | ||||
-rw-r--r-- | test/system/200-pod.bats | 41 | ||||
-rw-r--r-- | test/system/410-selinux.bats | 19 | ||||
-rw-r--r-- | utils/utils_supported.go | 122 | ||||
-rw-r--r-- | utils/utils_windows.go | 12 |
19 files changed, 278 insertions, 18 deletions
diff --git a/cmd/podman/common/create.go b/cmd/podman/common/create.go index fbb7f449e..ec3251ae1 100644 --- a/cmd/podman/common/create.go +++ b/cmd/podman/common/create.go @@ -57,7 +57,7 @@ func GetCreateFlags(cf *ContainerCLIOpts) *pflag.FlagSet { createFlags.StringVar( &cf.CGroupsMode, "cgroups", containerConfig.Cgroups(), - `control container cgroup configuration ("enabled"|"disabled"|"no-conmon")`, + `control container cgroup configuration ("enabled"|"disabled"|"no-conmon"|"split")`, ) createFlags.StringVar( &cf.CGroupParent, diff --git a/cmd/podman/containers/create.go b/cmd/podman/containers/create.go index 91a8d43f6..60e9aa815 100644 --- a/cmd/podman/containers/create.go +++ b/cmd/podman/containers/create.go @@ -210,6 +210,10 @@ func createInit(c *cobra.Command) error { cliVals.Env = env } + if c.Flag("cgroups").Changed && cliVals.CGroupsMode == "split" && registry.IsRemote() { + return errors.Errorf("the option --cgroups=%q is not supported in remote mode", cliVals.CGroupsMode) + } + // Docker-compatibility: the "-h" flag for run/create is reserved for // the hostname (see https://github.com/containers/libpod/issues/1367). diff --git a/docs/source/markdown/podman-create.1.md b/docs/source/markdown/podman-create.1.md index 3ec91a3ad..ded668f34 100644 --- a/docs/source/markdown/podman-create.1.md +++ b/docs/source/markdown/podman-create.1.md @@ -78,9 +78,12 @@ If the host uses cgroups v1, the default is set to **host**. On cgroups v2 the **--cgroups**=*mode* Determines whether the container will create CGroups. -Valid values are *enabled*, *disabled*, *no-conmon*, which the default being *enabled*. +Valid values are *enabled*, *disabled*, *no-conmon*, *split*, with the default being *enabled*. + +The *enabled* option will create a new cgroup under the cgroup-parent. The *disabled* option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**). 
The *no-conmon* option disables a new CGroup only for the conmon process. +The *split* option splits the current cgroup in two sub-cgroups: one for conmon and one for the container payload. It is not possible to set *--cgroup-parent* with *split*. **--cgroup-parent**=*path* diff --git a/docs/source/markdown/podman-run.1.md b/docs/source/markdown/podman-run.1.md index 88666d595..83971107f 100644 --- a/docs/source/markdown/podman-run.1.md +++ b/docs/source/markdown/podman-run.1.md @@ -89,14 +89,16 @@ Set the cgroup namespace mode for the container. If the host uses cgroups v1, the default is set to **host**. On cgroups v2, the default is **private**. -**--cgroups**=**enabled**|**disabled**|**no-conmon** +**--cgroups**=**enabled**|**disabled**|**no-conmon**|**split** Determines whether the container will create CGroups. -Default is **enabled**. The **disabled** option will force the container -to not create CGroups, and thus conflicts with CGroup options -(**--cgroupns** and **--cgroup-parent**). +Default is **enabled**. + +The **enabled** option will create a new cgroup under the cgroup-parent. +The **disabled** option will force the container to not create CGroups, and thus conflicts with CGroup options (**--cgroupns** and **--cgroup-parent**). The **no-conmon** option disables a new CGroup only for the **conmon** process. +The **split** option splits the current cgroup in two sub-cgroups: one for conmon and one for the container payload. It is not possible to set **--cgroup-parent** with **split**. 
**--cgroup-parent**=*path* diff --git a/libpod/container.go b/libpod/container.go index c85249676..20688e3ee 100644 --- a/libpod/container.go +++ b/libpod/container.go @@ -17,6 +17,7 @@ import ( "github.com/containers/libpod/libpod/lock" "github.com/containers/libpod/pkg/namespaces" "github.com/containers/libpod/pkg/rootless" + "github.com/containers/libpod/utils" "github.com/containers/storage" "github.com/cri-o/ocicni/pkg/ocicni" spec "github.com/opencontainers/runtime-spec/specs-go" @@ -1089,10 +1090,25 @@ func (c *Container) NamespacePath(linuxNS LinuxNS) (string, error) { //nolint:in // CGroupPath returns a cgroups "path" for a given container. func (c *Container) CGroupPath() (string, error) { - switch c.runtime.config.Engine.CgroupManager { - case config.CgroupfsCgroupsManager: + switch { + case c.config.CgroupsMode == cgroupSplit: + if c.config.CgroupParent != "" { + return "", errors.Errorf("cannot specify cgroup-parent with cgroup-mode %q", cgroupSplit) + } + cg, err := utils.GetCgroupProcess(c.state.ConmonPID) + if err != nil { + return "", err + } + // Use the conmon cgroup for two reasons: we validate the container + // delegation was correct, and the conmon cgroup doesn't change at runtime + // while we are not sure about the container that can create sub cgroups. 
+ if !strings.HasSuffix(cg, "supervisor") { + return "", errors.Errorf("invalid cgroup for conmon %q", cg) + } + return strings.TrimSuffix(cg, "/supervisor") + "/container", nil + case c.runtime.config.Engine.CgroupManager == config.CgroupfsCgroupsManager: return filepath.Join(c.config.CgroupParent, fmt.Sprintf("libpod-%s", c.ID())), nil - case config.SystemdCgroupsManager: + case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager: if rootless.IsRootless() { uid := rootless.GetRootlessUID() parts := strings.SplitN(c.config.CgroupParent, "/", 2) diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index 5ee6726e0..2c78f6bd2 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -31,6 +31,7 @@ import ( "github.com/containers/libpod/pkg/resolvconf" "github.com/containers/libpod/pkg/rootless" "github.com/containers/libpod/pkg/util" + "github.com/containers/libpod/utils" "github.com/containers/storage/pkg/archive" securejoin "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/user" @@ -1505,8 +1506,17 @@ func (c *Container) getOCICgroupPath() (string, error) { switch { case (rootless.IsRootless() && !unified) || c.config.NoCgroups: return "", nil + case c.config.CgroupsMode == cgroupSplit: + if c.config.CgroupParent != "" { + return c.config.CgroupParent, nil + } + selfCgroup, err := utils.GetOwnCgroup() + if err != nil { + return "", err + } + return filepath.Join(selfCgroup, "container"), nil case c.runtime.config.Engine.CgroupManager == config.SystemdCgroupsManager: - // When runc is set to use Systemd as a cgroup manager, it + // When the OCI runtime is set to use Systemd as a cgroup manager, it // expects cgroups to be passed as follows: // slice:prefix:name systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID()) diff --git a/libpod/container_validate.go b/libpod/container_validate.go index b7f0aadff..a53a1839d 
100644 --- a/libpod/container_validate.go +++ b/libpod/container_validate.go @@ -34,6 +34,10 @@ func (c *Container) validate() error { return errors.Wrapf(define.ErrInvalidArg, "cannot both create a network namespace and join another container's network namespace") } + if c.config.CgroupsMode == cgroupSplit && c.config.CgroupParent != "" { + return errors.Wrapf(define.ErrInvalidArg, "cannot specify --cgroup-mode=split with a cgroup-parent") + } + // Not creating cgroups has a number of requirements, mostly related to // the PID namespace. if c.config.NoCgroups || c.config.CgroupsMode == "disabled" { diff --git a/libpod/oci_conmon.go b/libpod/oci_conmon.go new file mode 100644 index 000000000..74060b357 --- /dev/null +++ b/libpod/oci_conmon.go @@ -0,0 +1,7 @@ +package libpod + +const ( + // cgroupSplit is the cgroup mode for reusing the current cgroup both + // for conmon and for the container payload. + cgroupSplit = "split" +) diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index d8a89047e..26e5d70b0 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -881,6 +881,12 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co return err } + if ctr.config.CgroupsMode == cgroupSplit { + if err := utils.MoveUnderCgroupSubtree("supervisor"); err != nil { + return err + } + } + args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), filepath.Join(ctr.state.RunDir, "pidfile"), ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) if ctr.config.Spec.Process.Terminal { @@ -1173,7 +1179,7 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p "--socket-dir-path", r.socketsDir, } - if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups { + if r.cgroupManager == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit { args = append(args, "-s") } @@ -1275,7 +1281,7 @@ func (r *ConmonOCIRuntime) 
moveConmonToCgroupAndSignal(ctr *Container, cmd *exec // If cgroup creation is disabled - just signal. switch ctr.config.CgroupsMode { - case "disabled", "no-conmon": + case "disabled", "no-conmon", cgroupSplit: mustCreateCgroup = false } diff --git a/libpod/options.go b/libpod/options.go index 7a60870a0..28be1bc03 100644 --- a/libpod/options.go +++ b/libpod/options.go @@ -1049,7 +1049,7 @@ func WithCgroupsMode(mode string) CtrCreateOption { case "disabled": ctr.config.NoCgroups = true ctr.config.CgroupsMode = mode - case "enabled", "no-conmon": + case "enabled", "no-conmon", cgroupSplit: ctr.config.CgroupsMode = mode default: return errors.Wrapf(define.ErrInvalidArg, "Invalid cgroup mode %q", mode) diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index f1752cbeb..dd6602acb 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -233,9 +233,9 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai return nil, errors.Wrapf(err, "error retrieving pod %s cgroup", pod.ID()) } ctr.config.CgroupParent = podCgroup - case rootless.IsRootless(): + case rootless.IsRootless() && ctr.config.CgroupsMode != cgroupSplit: ctr.config.CgroupParent = SystemdDefaultRootlessCgroupParent - default: + case ctr.config.CgroupsMode != cgroupSplit: ctr.config.CgroupParent = SystemdDefaultCgroupParent } } else if len(ctr.config.CgroupParent) < 6 || !strings.HasSuffix(path.Base(ctr.config.CgroupParent), ".slice") { diff --git a/pkg/api/handlers/compat/networks.go b/pkg/api/handlers/compat/networks.go index 0f1eca5e5..7209255d7 100644 --- a/pkg/api/handlers/compat/networks.go +++ b/pkg/api/handlers/compat/networks.go @@ -285,7 +285,7 @@ func RemoveNetwork(w http.ResponseWriter, r *http.Request) { return } if !exists { - utils.Error(w, "network not found", http.StatusNotFound, err) + utils.Error(w, "network not found", http.StatusNotFound, network.ErrNetworkNotFound) return } if err := network.RemoveNetwork(config, name); err != nil { diff --git 
a/pkg/api/server/server.go b/pkg/api/server/server.go index bd6a99b96..5b2f8bea2 100644 --- a/pkg/api/server/server.go +++ b/pkg/api/server/server.go @@ -173,6 +173,10 @@ func (s *APIServer) Serve() error { }() } + // Before we start serving, ensure umask is properly set for container + // creation. + _ = syscall.Umask(0022) + go func() { err := s.Server.Serve(s.Listener) if err != nil && err != http.ErrServerClosed { diff --git a/pkg/systemd/generate/common.go b/pkg/systemd/generate/common.go index fe56dc874..d6d18a810 100644 --- a/pkg/systemd/generate/common.go +++ b/pkg/systemd/generate/common.go @@ -1,6 +1,8 @@ package generate import ( + "strings" + "github.com/pkg/errors" ) @@ -44,6 +46,9 @@ func filterPodFlags(command []string) []string { i++ continue } + if strings.HasPrefix(s, "--pod=") || strings.HasPrefix(s, "--pod-id-file=") { + continue + } processed = append(processed, s) } return processed diff --git a/pkg/systemd/generate/common_test.go b/pkg/systemd/generate/common_test.go index f53bb7828..389c30f59 100644 --- a/pkg/systemd/generate/common_test.go +++ b/pkg/systemd/generate/common_test.go @@ -1,6 +1,7 @@ package generate import ( + "strings" "testing" "github.com/stretchr/testify/assert" @@ -14,12 +15,16 @@ func TestFilterPodFlags(t *testing.T) { {[]string{"podman", "pod", "create"}}, {[]string{"podman", "pod", "create", "--name", "foo"}}, {[]string{"podman", "pod", "create", "--pod-id-file", "foo"}}, + {[]string{"podman", "pod", "create", "--pod-id-file=foo"}}, {[]string{"podman", "run", "--pod", "foo"}}, + {[]string{"podman", "run", "--pod=foo"}}, } for _, test := range tests { processed := filterPodFlags(test.input) - assert.NotContains(t, processed, "--pod-id-file") - assert.NotContains(t, processed, "--pod") + for _, s := range processed { + assert.False(t, strings.HasPrefix(s, "--pod-id-file")) + assert.False(t, strings.HasPrefix(s, "--pod")) + } } } diff --git a/test/system/200-pod.bats b/test/system/200-pod.bats index 9a6b39057..0e9d9132e 
100644 --- a/test/system/200-pod.bats +++ b/test/system/200-pod.bats @@ -150,6 +150,18 @@ function random_ip() { pod_id_file=${PODMAN_TMPDIR}/pod-id-file + # Randomly-assigned ports in the 5xxx and 6xxx range + for port_in in $(shuf -i 5000-5999);do + if ! { exec 3<> /dev/tcp/127.0.0.1/$port_in; } &>/dev/null; then + break + fi + done + for port_out in $(shuf -i 6000-6999);do + if ! { exec 3<> /dev/tcp/127.0.0.1/$port_out; } &>/dev/null; then + break + fi + done + # Create a pod with all the desired options # FIXME: --ip=$ip fails: # Error adding network: failed to allocate all requested IPs @@ -161,6 +173,7 @@ function random_ip() { --dns "$dns_server" \ --dns-search "$dns_search" \ --dns-opt "$dns_opt" \ + --publish "$port_out:$port_in" \ --label "${labelname}=${labelvalue}" pod_id="$output" @@ -199,6 +212,34 @@ function random_ip() { run_podman pod ps --no-trunc --filter "label=${labelname}=${labelvalue}" --format '{{.ID}}' is "$output" "$pod_id" "pod ps --filter label=..." + # Test local port forwarding, as well as 'ps' output showing ports + # Run 'nc' in a container, waiting for input on the published port. + c_name=$(random_string 15) + run_podman run -d --pod mypod --name $c_name $IMAGE nc -l -p $port_in + cid="$output" + + # Try running another container also listening on the same port. + run_podman 1 run --pod mypod --name dsfsdfsdf $IMAGE nc -l -p $port_in + is "$output" "nc: bind: Address in use" \ + "two containers cannot bind to same port" + + # While the container is still running, run 'podman ps' (no --format) + # and confirm that the output includes the published port + run_podman ps --filter id=$cid + is "${lines[1]}" "${cid:0:12} $IMAGE nc -l -p $port_in .* 0.0.0.0:$port_out->$port_in/tcp $c_name" \ + "output of 'podman ps'" + + # send a random string to the container. This will cause the container + # to output the string to its logs, then exit. 
+ teststring=$(random_string 30) + echo "$teststring" | nc 127.0.0.1 $port_out + + # Confirm that the container log output is the string we sent it. + run_podman logs $cid + is "$output" "$teststring" "test string received on container" + + # Clean up + run_podman rm $cid run_podman pod rm -f mypod } diff --git a/test/system/410-selinux.bats b/test/system/410-selinux.bats index 8a0477eff..1769730f0 100644 --- a/test/system/410-selinux.bats +++ b/test/system/410-selinux.bats @@ -63,4 +63,23 @@ function check_label() { check_label "--security-opt label=level:s0:c1,c2" "container_t" "s0:c1,c2" } +# pr #6752 +@test "podman selinux: inspect multiple labels" { + if [ ! -e /usr/sbin/selinuxenabled ] || ! /usr/sbin/selinuxenabled; then + skip "selinux disabled or not available" + fi + + run_podman run -d --name myc \ + --security-opt seccomp=unconfined \ + --security-opt label=type:spc_t \ + --security-opt label=level:s0 \ + $IMAGE sh -c 'while test ! -e /stop; do sleep 0.1; done' + run_podman inspect --format='{{ .HostConfig.SecurityOpt }}' myc + is "$output" "\[label=type:spc_t,label=level:s0 seccomp=unconfined]" \ + "'podman inspect' preserves all --security-opts" + + run_podman exec myc touch /stop + run_podman rm -f myc +} + # vim: filetype=sh diff --git a/utils/utils_supported.go b/utils/utils_supported.go index ce9fd5604..201ddb57b 100644 --- a/utils/utils_supported.go +++ b/utils/utils_supported.go @@ -3,10 +3,20 @@ package utils import ( + "bufio" + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + "github.com/containers/libpod/pkg/cgroups" "github.com/containers/libpod/pkg/rootless" systemdDbus "github.com/coreos/go-systemd/v22/dbus" "github.com/godbus/dbus/v5" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" ) // RunUnderSystemdScope adds the specified pid to a systemd scope @@ -43,6 +53,118 @@ func RunUnderSystemdScope(pid int, slice string, unitName string) error { return nil } +func getCgroupProcess(procFile string) (string, error) 
{ + f, err := os.Open(procFile) + if err != nil { + return "", errors.Wrapf(err, "open file %q", procFile) + } + defer f.Close() + + scanner := bufio.NewScanner(f) + cgroup := "/" + for scanner.Scan() { + line := scanner.Text() + parts := strings.Split(line, ":") + if len(parts) != 3 { + return "", errors.Errorf("cannot parse cgroup line %q", line) + } + if strings.HasPrefix(line, "0::") { + cgroup = line[3:] + break + } + // root cgroup, skip it + if parts[2] == "/" { + continue + } + // The process must have the same cgroup path for all controllers + // The OCI runtime spec file allows us to specify only one path. + if cgroup != "/" && cgroup != parts[2] { + return "", errors.Errorf("cgroup configuration not supported, the process is in two different cgroups") + } + cgroup = parts[2] + } + if cgroup == "/" { + return "", errors.Errorf("could not find cgroup mount in %q", procFile) + } + return cgroup, nil +} + +// GetOwnCgroup returns the cgroup for the current process. +func GetOwnCgroup() (string, error) { + return getCgroupProcess("/proc/self/cgroup") +} + +// GetCgroupProcess returns the cgroup for the specified process. +func GetCgroupProcess(pid int) (string, error) { + return getCgroupProcess(fmt.Sprintf("/proc/%d/cgroup", pid)) +} + +// MoveUnderCgroupSubtree moves the PID under a cgroup subtree. 
+func MoveUnderCgroupSubtree(subtree string) error { + procFile := "/proc/self/cgroup" + f, err := os.Open(procFile) + if err != nil { + return errors.Wrapf(err, "open file %q", procFile) + } + defer f.Close() + + unifiedMode, err := cgroups.IsCgroup2UnifiedMode() + if err != nil { + return err + } + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Split(line, ":") + if len(parts) != 3 { + return errors.Errorf("cannot parse cgroup line %q", line) + } + + // root cgroup, skip it + if parts[2] == "/" { + continue + } + + cgroupRoot := "/sys/fs/cgroup" + // Special case the unified mount on hybrid cgroup and named hierarchies. + // This works on Fedora 31, but we should really parse the mounts to see + // where the cgroup hierarchy is mounted. + if parts[1] == "" && !unifiedMode { + // If it is not using unified mode, the cgroup v2 hierarchy is + // usually mounted under /sys/fs/cgroup/unified + cgroupRoot = filepath.Join(cgroupRoot, "unified") + } else if parts[1] != "" { + // Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER. 
+ controller := strings.TrimPrefix(parts[1], "name=") + cgroupRoot = filepath.Join(cgroupRoot, controller) + } + + processes, err := ioutil.ReadFile(filepath.Join(cgroupRoot, parts[2], "cgroup.procs")) + if err != nil { + return err + } + + newCgroup := filepath.Join(cgroupRoot, parts[2], subtree) + if err := os.Mkdir(newCgroup, 0755); err != nil { + return err + } + + f, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0755) + if err != nil { + return err + } + defer f.Close() + + for _, pid := range bytes.Split(processes, []byte("\n")) { + if _, err := f.Write(pid); err != nil { + logrus.Warnf("Cannot move process %s to cgroup %q", pid, newCgroup) + } + } + } + return nil +} + func newProp(name string, units interface{}) systemdDbus.Property { return systemdDbus.Property{ Name: name, diff --git a/utils/utils_windows.go b/utils/utils_windows.go index db27877d9..1a2196029 100644 --- a/utils/utils_windows.go +++ b/utils/utils_windows.go @@ -7,3 +7,15 @@ import "github.com/pkg/errors" func RunUnderSystemdScope(pid int, slice string, unitName string) error { return errors.New("not implemented for windows") } + +func MoveUnderCgroupSubtree(subtree string) error { + return errors.New("not implemented for windows") +} + +func GetOwnCgroup() (string, error) { + return "", errors.New("not implemented for windows") +} + +func GetCgroupProcess(pid int) (string, error) { + return "", errors.New("not implemented for windows") +} |