diff options
Diffstat (limited to 'libpod/networking_common.go')
-rw-r--r-- | libpod/networking_common.go | 719 |
1 files changed, 719 insertions, 0 deletions
diff --git a/libpod/networking_common.go b/libpod/networking_common.go new file mode 100644 index 000000000..fa444e26a --- /dev/null +++ b/libpod/networking_common.go @@ -0,0 +1,719 @@ +//go:build linux || freebsd +// +build linux freebsd + +package libpod + +import ( + "errors" + "fmt" + "regexp" + "sort" + + "github.com/containers/common/libnetwork/etchosts" + "github.com/containers/common/libnetwork/types" + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/machine" + "github.com/containers/common/pkg/util" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/libpod/events" + "github.com/containers/podman/v4/pkg/namespaces" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/storage/pkg/lockfile" + "github.com/sirupsen/logrus" +) + +// convertPortMappings will remove the HostIP part from the ports when running inside podman machine. +// This is need because a HostIP of 127.0.0.1 would now allow the gvproxy forwarder to reach to open ports. +// For machine the HostIP must only be used by gvproxy and never in the VM. +func (c *Container) convertPortMappings() []types.PortMapping { + if !machine.IsGvProxyBased() || len(c.config.PortMappings) == 0 { + return c.config.PortMappings + } + // if we run in a machine VM we have to ignore the host IP part + newPorts := make([]types.PortMapping, 0, len(c.config.PortMappings)) + for _, port := range c.config.PortMappings { + port.HostIP = "" + newPorts = append(newPorts, port) + } + return newPorts +} + +func (c *Container) getNetworkOptions(networkOpts map[string]types.PerNetworkOptions) types.NetworkOptions { + opts := types.NetworkOptions{ + ContainerID: c.config.ID, + ContainerName: getCNIPodName(c), + } + opts.PortMappings = c.convertPortMappings() + + // If the container requested special network options use this instead of the config. + // This is the case for container restore or network reload. + if c.perNetworkOpts != nil { + opts.Networks = c.perNetworkOpts + } else { + opts.Networks = networkOpts + } + return opts +} + +// setUpNetwork will set up the the networks, on error it will also tear down the cni +// networks. If rootless it will join/create the rootless network namespace. +func (r *Runtime) setUpNetwork(ns string, opts types.NetworkOptions) (map[string]types.StatusBlock, error) { + rootlessNetNS, err := r.GetRootlessNetNs(true) + if err != nil { + return nil, err + } + var results map[string]types.StatusBlock + setUpPod := func() error { + results, err = r.network.Setup(ns, types.SetupOptions{NetworkOptions: opts}) + return err + } + // rootlessNetNS is nil if we are root + if rootlessNetNS != nil { + // execute the setup in the rootless net ns + err = rootlessNetNS.Do(setUpPod) + rootlessNetNS.Lock.Unlock() + } else { + err = setUpPod() + } + return results, err +} + +// getCNIPodName return the pod name (hostname) used by CNI and the dnsname plugin. +// If we are in the pod network namespace use the pod name otherwise the container name +func getCNIPodName(c *Container) string { + if c.config.NetMode.IsPod() || c.IsInfra() { + pod, err := c.runtime.state.Pod(c.PodID()) + if err == nil { + return pod.Name() + } + } + return c.Name() +} + +// Tear down a container's network configuration and joins the +// rootless net ns as rootless user +func (r *Runtime) teardownNetwork(ns string, opts types.NetworkOptions) error { + rootlessNetNS, err := r.GetRootlessNetNs(false) + if err != nil { + return err + } + tearDownPod := func() error { + if err := r.network.Teardown(ns, types.TeardownOptions{NetworkOptions: opts}); err != nil { + return fmt.Errorf("tearing down network namespace configuration for container %s: %w", opts.ContainerID, err) + } + return nil + } + + // rootlessNetNS is nil if we are root + if rootlessNetNS != nil { + // execute the cni setup in the rootless net ns + err = rootlessNetNS.Do(tearDownPod) + if cerr := rootlessNetNS.Cleanup(r); cerr != nil { + logrus.WithError(err).Error("failed to clean up rootless netns") + } + rootlessNetNS.Lock.Unlock() + } else { + err = tearDownPod() + } + return err +} + +// Tear down a container's CNI network configuration, but do not tear down the +// namespace itself. +func (r *Runtime) teardownCNI(ctr *Container) error { + if ctr.state.NetNS == nil { + // The container has no network namespace, we're set + return nil + } + + logrus.Debugf("Tearing down network namespace at %s for container %s", ctr.state.NetNS.Path(), ctr.ID()) + + networks, err := ctr.networks() + if err != nil { + return err + } + + if !ctr.config.NetMode.IsSlirp4netns() && len(networks) > 0 { + netOpts := ctr.getNetworkOptions(networks) + return r.teardownNetwork(ctr.state.NetNS.Path(), netOpts) + } + return nil +} + +// isBridgeNetMode checks if the given network mode is bridge. +// It returns nil when it is set to bridge and an error otherwise. +func isBridgeNetMode(n namespaces.NetworkMode) error { + if !n.IsBridge() { + return fmt.Errorf("%q is not supported: %w", n, define.ErrNetworkModeInvalid) + } + return nil +} + +// Reload only works with containers with a configured network. +// It will tear down, and then reconfigure, the network of the container. +// This is mainly used when a reload of firewall rules wipes out existing +// firewall configuration. +// Efforts will be made to preserve MAC and IP addresses, but this only works if +// the container only joined a single CNI network, and was only assigned a +// single MAC or IP. +// Only works on root containers at present, though in the future we could +// extend this to stop + restart slirp4netns +func (r *Runtime) reloadContainerNetwork(ctr *Container) (map[string]types.StatusBlock, error) { + if ctr.state.NetNS == nil { + return nil, fmt.Errorf("container %s network is not configured, refusing to reload: %w", ctr.ID(), define.ErrCtrStateInvalid) + } + if err := isBridgeNetMode(ctr.config.NetMode); err != nil { + return nil, err + } + logrus.Infof("Going to reload container %s network", ctr.ID()) + + err := r.teardownCNI(ctr) + if err != nil { + // teardownCNI will error if the iptables rules do not exists and this is the case after + // a firewall reload. The purpose of network reload is to recreate the rules if they do + // not exists so we should not log this specific error as error. This would confuse users otherwise. + // iptables-legacy and iptables-nft will create different errors make sure to match both. + b, rerr := regexp.MatchString("Couldn't load target `CNI-[a-f0-9]{24}':No such file or directory|Chain 'CNI-[a-f0-9]{24}' does not exist", err.Error()) + if rerr == nil && !b { + logrus.Error(err) + } else { + logrus.Info(err) + } + } + + networkOpts, err := ctr.networks() + if err != nil { + return nil, err + } + + // Set the same network settings as before.. + netStatus := ctr.getNetworkStatus() + for network, perNetOpts := range networkOpts { + for name, netInt := range netStatus[network].Interfaces { + perNetOpts.InterfaceName = name + perNetOpts.StaticMAC = netInt.MacAddress + for _, netAddress := range netInt.Subnets { + perNetOpts.StaticIPs = append(perNetOpts.StaticIPs, netAddress.IPNet.IP) + } + // Normally interfaces have a length of 1, only for some special cni configs we could get more. + // For now just use the first interface to get the ips this should be good enough for most cases. + break + } + networkOpts[network] = perNetOpts + } + ctr.perNetworkOpts = networkOpts + + return r.configureNetNS(ctr, ctr.state.NetNS) +} + +// Produce an InspectNetworkSettings containing information on the container +// network. +func (c *Container) getContainerNetworkInfo() (*define.InspectNetworkSettings, error) { + if c.config.NetNsCtr != "" { + netNsCtr, err := c.runtime.GetContainer(c.config.NetNsCtr) + if err != nil { + return nil, err + } + // see https://github.com/containers/podman/issues/10090 + // the container has to be locked for syncContainer() + netNsCtr.lock.Lock() + defer netNsCtr.lock.Unlock() + // Have to sync to ensure that state is populated + if err := netNsCtr.syncContainer(); err != nil { + return nil, err + } + logrus.Debugf("Container %s shares network namespace, retrieving network info of container %s", c.ID(), c.config.NetNsCtr) + + return netNsCtr.getContainerNetworkInfo() + } + + settings := new(define.InspectNetworkSettings) + settings.Ports = makeInspectPortBindings(c.config.PortMappings, c.config.ExposedPorts) + + networks, err := c.networks() + if err != nil { + return nil, err + } + + if c.state.NetNS == nil { + if networkNSPath := c.joinedNetworkNSPath(); networkNSPath != "" { + if result, err := c.inspectJoinedNetworkNS(networkNSPath); err == nil { + // fallback to dummy configuration + settings.InspectBasicNetworkConfig = resultToBasicNetworkConfig(result) + return settings, nil + } + // do not propagate error inspecting a joined network ns + logrus.Errorf("Inspecting network namespace: %s of container %s: %v", networkNSPath, c.ID(), err) + } + // We can't do more if the network is down. + + // We still want to make dummy configurations for each CNI net + // the container joined. + if len(networks) > 0 { + settings.Networks = make(map[string]*define.InspectAdditionalNetwork, len(networks)) + for net, opts := range networks { + cniNet := new(define.InspectAdditionalNetwork) + cniNet.NetworkID = net + cniNet.Aliases = opts.Aliases + settings.Networks[net] = cniNet + } + } + + return settings, nil + } + + // Set network namespace path + settings.SandboxKey = c.state.NetNS.Path() + + netStatus := c.getNetworkStatus() + // If this is empty, we're probably slirp4netns + if len(netStatus) == 0 { + return settings, nil + } + + // If we have networks - handle that here + if len(networks) > 0 { + if len(networks) != len(netStatus) { + return nil, fmt.Errorf("network inspection mismatch: asked to join %d network(s) %v, but have information on %d network(s): %w", len(networks), networks, len(netStatus), define.ErrInternal) + } + + settings.Networks = make(map[string]*define.InspectAdditionalNetwork) + + for name, opts := range networks { + result := netStatus[name] + addedNet := new(define.InspectAdditionalNetwork) + addedNet.NetworkID = name + addedNet.Aliases = opts.Aliases + addedNet.InspectBasicNetworkConfig = resultToBasicNetworkConfig(result) + + settings.Networks[name] = addedNet + } + + // if not only the default network is connected we can return here + // otherwise we have to populate the InspectBasicNetworkConfig settings + _, isDefaultNet := networks[c.runtime.config.Network.DefaultNetwork] + if !(len(networks) == 1 && isDefaultNet) { + return settings, nil + } + } + + // If not joining networks, we should have at most 1 result + if len(netStatus) > 1 { + return nil, fmt.Errorf("should have at most 1 network status result if not joining networks, instead got %d: %w", len(netStatus), define.ErrInternal) + } + + if len(netStatus) == 1 { + for _, status := range netStatus { + settings.InspectBasicNetworkConfig = resultToBasicNetworkConfig(status) + } + } + return settings, nil +} + +// resultToBasicNetworkConfig produces an InspectBasicNetworkConfig from a CNI +// result +func resultToBasicNetworkConfig(result types.StatusBlock) define.InspectBasicNetworkConfig { + config := define.InspectBasicNetworkConfig{} + interfaceNames := make([]string, 0, len(result.Interfaces)) + for interfaceName := range result.Interfaces { + interfaceNames = append(interfaceNames, interfaceName) + } + // ensure consistent inspect results by sorting + sort.Strings(interfaceNames) + for _, interfaceName := range interfaceNames { + netInt := result.Interfaces[interfaceName] + for _, netAddress := range netInt.Subnets { + size, _ := netAddress.IPNet.Mask.Size() + if netAddress.IPNet.IP.To4() != nil { + // ipv4 + if config.IPAddress == "" { + config.IPAddress = netAddress.IPNet.IP.String() + config.IPPrefixLen = size + config.Gateway = netAddress.Gateway.String() + } else { + config.SecondaryIPAddresses = append(config.SecondaryIPAddresses, define.Address{Addr: netAddress.IPNet.IP.String(), PrefixLength: size}) + } + } else { + // ipv6 + if config.GlobalIPv6Address == "" { + config.GlobalIPv6Address = netAddress.IPNet.IP.String() + config.GlobalIPv6PrefixLen = size + config.IPv6Gateway = netAddress.Gateway.String() + } else { + config.SecondaryIPv6Addresses = append(config.SecondaryIPv6Addresses, define.Address{Addr: netAddress.IPNet.IP.String(), PrefixLength: size}) + } + } + } + if config.MacAddress == "" { + config.MacAddress = netInt.MacAddress.String() + } else { + config.AdditionalMacAddresses = append(config.AdditionalMacAddresses, netInt.MacAddress.String()) + } + } + return config +} + +// NetworkDisconnect removes a container from the network +func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) error { + // only the bridge mode supports cni networks + if err := isBridgeNetMode(c.config.NetMode); err != nil { + return err + } + + c.lock.Lock() + defer c.lock.Unlock() + + networks, err := c.networks() + if err != nil { + return err + } + + // check if network exists and if the input is a ID we get the name + // CNI only uses names so it is important that we only use the name + netName, err = c.runtime.normalizeNetworkName(netName) + if err != nil { + return err + } + + _, nameExists := networks[netName] + if !nameExists && len(networks) > 0 { + return fmt.Errorf("container %s is not connected to network %s", nameOrID, netName) + } + + if err := c.syncContainer(); err != nil { + return err + } + // get network status before we disconnect + networkStatus := c.getNetworkStatus() + + if err := c.runtime.state.NetworkDisconnect(c, netName); err != nil { + return err + } + + c.newNetworkEvent(events.NetworkDisconnect, netName) + if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { + return nil + } + + if c.state.NetNS == nil { + return fmt.Errorf("unable to disconnect %s from %s: %w", nameOrID, netName, define.ErrNoNetwork) + } + + opts := types.NetworkOptions{ + ContainerID: c.config.ID, + ContainerName: getCNIPodName(c), + } + opts.PortMappings = c.convertPortMappings() + opts.Networks = map[string]types.PerNetworkOptions{ + netName: networks[netName], + } + + if err := c.runtime.teardownNetwork(c.state.NetNS.Path(), opts); err != nil { + return err + } + + // update network status if container is running + oldStatus, statusExist := networkStatus[netName] + delete(networkStatus, netName) + c.state.NetworkStatus = networkStatus + err = c.save() + if err != nil { + return err + } + + // Reload ports when there are still connected networks, maybe we removed the network interface with the child ip. + // Reloading without connected networks does not make sense, so we can skip this step. + if rootless.IsRootless() && len(networkStatus) > 0 { + if err := c.reloadRootlessRLKPortMapping(); err != nil { + return err + } + } + + // Update resolv.conf if required + if statusExist { + stringIPs := make([]string, 0, len(oldStatus.DNSServerIPs)) + for _, ip := range oldStatus.DNSServerIPs { + stringIPs = append(stringIPs, ip.String()) + } + if len(stringIPs) > 0 { + logrus.Debugf("Removing DNS Servers %v from resolv.conf", stringIPs) + if err := c.removeNameserver(stringIPs); err != nil { + return err + } + } + + // update /etc/hosts file + if file, ok := c.state.BindMounts[config.DefaultHostsFile]; ok { + // sync the names with c.getHostsEntries() + names := []string{c.Hostname(), c.config.Name} + rm := etchosts.GetNetworkHostEntries(map[string]types.StatusBlock{netName: oldStatus}, names...) + if len(rm) > 0 { + // make sure to lock this file to prevent concurrent writes when + // this is used a net dependency container + lock, err := lockfile.GetLockfile(file) + if err != nil { + return fmt.Errorf("failed to lock hosts file: %w", err) + } + logrus.Debugf("Remove /etc/hosts entries %v", rm) + lock.Lock() + err = etchosts.Remove(file, rm) + lock.Unlock() + if err != nil { + return err + } + } + } + } + return nil +} + +// ConnectNetwork connects a container to a given network +func (c *Container) NetworkConnect(nameOrID, netName string, netOpts types.PerNetworkOptions) error { + // only the bridge mode supports cni networks + if err := isBridgeNetMode(c.config.NetMode); err != nil { + return err + } + + c.lock.Lock() + defer c.lock.Unlock() + + networks, err := c.networks() + if err != nil { + return err + } + + // check if network exists and if the input is a ID we get the name + // CNI only uses names so it is important that we only use the name + netName, err = c.runtime.normalizeNetworkName(netName) + if err != nil { + return err + } + + if err := c.syncContainer(); err != nil { + return err + } + + // get network status before we connect + networkStatus := c.getNetworkStatus() + + // always add the short id as alias for docker compat + netOpts.Aliases = append(netOpts.Aliases, c.config.ID[:12]) + + if netOpts.InterfaceName == "" { + netOpts.InterfaceName = getFreeInterfaceName(networks) + if netOpts.InterfaceName == "" { + return errors.New("could not find free network interface name") + } + } + + if err := c.runtime.state.NetworkConnect(c, netName, netOpts); err != nil { + // Docker compat: treat requests to attach already attached networks as a no-op, ignoring opts + if errors.Is(err, define.ErrNetworkConnected) && c.ensureState(define.ContainerStateConfigured) { + return nil + } + + return err + } + c.newNetworkEvent(events.NetworkConnect, netName) + if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { + return nil + } + if c.state.NetNS == nil { + return fmt.Errorf("unable to connect %s to %s: %w", nameOrID, netName, define.ErrNoNetwork) + } + + opts := types.NetworkOptions{ + ContainerID: c.config.ID, + ContainerName: getCNIPodName(c), + } + opts.PortMappings = c.convertPortMappings() + opts.Networks = map[string]types.PerNetworkOptions{ + netName: netOpts, + } + + results, err := c.runtime.setUpNetwork(c.state.NetNS.Path(), opts) + if err != nil { + return err + } + if len(results) != 1 { + return errors.New("when adding aliases, results must be of length 1") + } + + // we need to get the old host entries before we add the new one to the status + // if we do not add do it here we will get the wrong existing entries which will throw of the logic + // we could also copy the map but this does not seem worth it + // sync the hostNames with c.getHostsEntries() + hostNames := []string{c.Hostname(), c.config.Name} + oldHostEntries := etchosts.GetNetworkHostEntries(networkStatus, hostNames...) + + // update network status + if networkStatus == nil { + networkStatus = make(map[string]types.StatusBlock, 1) + } + networkStatus[netName] = results[netName] + c.state.NetworkStatus = networkStatus + + err = c.save() + if err != nil { + return err + } + + // The first network needs a port reload to set the correct child ip for the rootlessport process. + // Adding a second network does not require a port reload because the child ip is still valid. + if rootless.IsRootless() && len(networks) == 0 { + if err := c.reloadRootlessRLKPortMapping(); err != nil { + return err + } + } + + ipv6, err := c.checkForIPv6(networkStatus) + if err != nil { + return err + } + + // Update resolv.conf if required + stringIPs := make([]string, 0, len(results[netName].DNSServerIPs)) + for _, ip := range results[netName].DNSServerIPs { + if (ip.To4() == nil) && !ipv6 { + continue + } + stringIPs = append(stringIPs, ip.String()) + } + if len(stringIPs) > 0 { + logrus.Debugf("Adding DNS Servers %v to resolv.conf", stringIPs) + if err := c.addNameserver(stringIPs); err != nil { + return err + } + } + + // update /etc/hosts file + if file, ok := c.state.BindMounts[config.DefaultHostsFile]; ok { + // make sure to lock this file to prevent concurrent writes when + // this is used a net dependency container + lock, err := lockfile.GetLockfile(file) + if err != nil { + return fmt.Errorf("failed to lock hosts file: %w", err) + } + new := etchosts.GetNetworkHostEntries(results, hostNames...) + logrus.Debugf("Add /etc/hosts entries %v", new) + // use special AddIfExists API to make sure we only add new entries if an old one exists + // see the AddIfExists() comment for more information + lock.Lock() + err = etchosts.AddIfExists(file, oldHostEntries, new) + lock.Unlock() + if err != nil { + return err + } + } + + return nil +} + +// get a free interface name for a new network +// return an empty string if no free name was found +func getFreeInterfaceName(networks map[string]types.PerNetworkOptions) string { + ifNames := make([]string, 0, len(networks)) + for _, opts := range networks { + ifNames = append(ifNames, opts.InterfaceName) + } + for i := 0; i < 100000; i++ { + ifName := fmt.Sprintf("eth%d", i) + if !util.StringInSlice(ifName, ifNames) { + return ifName + } + } + return "" +} + +// DisconnectContainerFromNetwork removes a container from its CNI network +func (r *Runtime) DisconnectContainerFromNetwork(nameOrID, netName string, force bool) error { + ctr, err := r.LookupContainer(nameOrID) + if err != nil { + return err + } + return ctr.NetworkDisconnect(nameOrID, netName, force) +} + +// ConnectContainerToNetwork connects a container to a CNI network +func (r *Runtime) ConnectContainerToNetwork(nameOrID, netName string, netOpts types.PerNetworkOptions) error { + ctr, err := r.LookupContainer(nameOrID) + if err != nil { + return err + } + return ctr.NetworkConnect(nameOrID, netName, netOpts) +} + +// normalizeNetworkName takes a network name, a partial or a full network ID and returns the network name. +// If the network is not found a errors is returned. +func (r *Runtime) normalizeNetworkName(nameOrID string) (string, error) { + net, err := r.network.NetworkInspect(nameOrID) + if err != nil { + return "", err + } + return net.Name, nil +} + +// ocicniPortsToNetTypesPorts convert the old port format to the new one +// while deduplicating ports into ranges +func ocicniPortsToNetTypesPorts(ports []types.OCICNIPortMapping) []types.PortMapping { + if len(ports) == 0 { + return nil + } + + newPorts := make([]types.PortMapping, 0, len(ports)) + + // first sort the ports + sort.Slice(ports, func(i, j int) bool { + return compareOCICNIPorts(ports[i], ports[j]) + }) + + // we already check if the slice is empty so we can use the first element + currentPort := types.PortMapping{ + HostIP: ports[0].HostIP, + HostPort: uint16(ports[0].HostPort), + ContainerPort: uint16(ports[0].ContainerPort), + Protocol: ports[0].Protocol, + Range: 1, + } + + for i := 1; i < len(ports); i++ { + if ports[i].HostIP == currentPort.HostIP && + ports[i].Protocol == currentPort.Protocol && + ports[i].HostPort-int32(currentPort.Range) == int32(currentPort.HostPort) && + ports[i].ContainerPort-int32(currentPort.Range) == int32(currentPort.ContainerPort) { + currentPort.Range++ + } else { + newPorts = append(newPorts, currentPort) + currentPort = types.PortMapping{ + HostIP: ports[i].HostIP, + HostPort: uint16(ports[i].HostPort), + ContainerPort: uint16(ports[i].ContainerPort), + Protocol: ports[i].Protocol, + Range: 1, + } + } + } + newPorts = append(newPorts, currentPort) + return newPorts +} + +// compareOCICNIPorts will sort the ocicni ports by +// 1) host ip +// 2) protocol +// 3) hostPort +// 4) container port +func compareOCICNIPorts(i, j types.OCICNIPortMapping) bool { + if i.HostIP != j.HostIP { + return i.HostIP < j.HostIP + } + + if i.Protocol != j.Protocol { + return i.Protocol < j.Protocol + } + + if i.HostPort != j.HostPort { + return i.HostPort < j.HostPort + } + + return i.ContainerPort < j.ContainerPort +} |