Diffstat (limited to 'server/sandbox_run.go')
-rw-r--r-- | server/sandbox_run.go | 615
1 file changed, 615 insertions, 0 deletions
diff --git a/server/sandbox_run.go b/server/sandbox_run.go
new file mode 100644
index 000000000..72afdb229
--- /dev/null
+++ b/server/sandbox_run.go
@@ -0,0 +1,615 @@
+package server
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path"
+	"path/filepath"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/containers/storage"
+	"github.com/kubernetes-incubator/cri-o/libkpod/sandbox"
+	"github.com/kubernetes-incubator/cri-o/oci"
+	"github.com/kubernetes-incubator/cri-o/pkg/annotations"
+	runtimespec "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/opencontainers/runtime-tools/generate"
+	"github.com/opencontainers/selinux/go-selinux/label"
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/net/context"
+	"golang.org/x/sys/unix"
+	"k8s.io/kubernetes/pkg/api/v1"
+	pb "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
+	"k8s.io/kubernetes/pkg/kubelet/leaky"
+	"k8s.io/kubernetes/pkg/kubelet/network/hostport"
+	"k8s.io/kubernetes/pkg/kubelet/types"
+)
+
+const (
+	// PodInfraOOMAdj is the value that we set for oom score adj for
+	// the pod infra container.
+	// TODO: Remove this const once this value is provided over CRI
+	// See https://github.com/kubernetes/kubernetes/issues/47938
+	PodInfraOOMAdj int = -998
+	// PodInfraCPUshares is default cpu shares for sandbox container.
+	PodInfraCPUshares = 2
+)
+
+// privilegedSandbox returns true if the sandbox configuration
+// requires additional host privileges for the sandbox.
+func (s *Server) privilegedSandbox(req *pb.RunPodSandboxRequest) bool {
+	securityContext := req.GetConfig().GetLinux().GetSecurityContext()
+	if securityContext == nil {
+		return false
+	}
+
+	if securityContext.Privileged {
+		return true
+	}
+
+	namespaceOptions := securityContext.GetNamespaceOptions()
+	if namespaceOptions == nil {
+		return false
+	}
+
+	if namespaceOptions.HostNetwork ||
+		namespaceOptions.HostPid ||
+		namespaceOptions.HostIpc {
+		return true
+	}
+
+	return false
+}
+
+// trustedSandbox returns true if the sandbox will run trusted workloads.
+func (s *Server) trustedSandbox(req *pb.RunPodSandboxRequest) bool {
+	kubeAnnotations := req.GetConfig().GetAnnotations()
+
+	trustedAnnotation, ok := kubeAnnotations[annotations.TrustedSandbox]
+	if !ok {
+		// A sandbox is trusted by default.
+		return true
+	}
+
+	return isTrue(trustedAnnotation)
+}
+
+func (s *Server) runContainer(container *oci.Container, cgroupParent string) error {
+	if err := s.Runtime().CreateContainer(container, cgroupParent); err != nil {
+		return err
+	}
+	return s.Runtime().StartContainer(container)
+}
+
+var (
+	conflictRE = regexp.MustCompile(`already reserved for pod "([0-9a-z]+)"`)
+)
+
+// RunPodSandbox creates and runs a pod-level sandbox.
+func (s *Server) RunPodSandbox(ctx context.Context, req *pb.RunPodSandboxRequest) (resp *pb.RunPodSandboxResponse, err error) {
+	s.updateLock.RLock()
+	defer s.updateLock.RUnlock()
+
+	logrus.Debugf("RunPodSandboxRequest %+v", req)
+	var processLabel, mountLabel, resolvPath string
+	// process req.Name
+	kubeName := req.GetConfig().GetMetadata().Name
+	if kubeName == "" {
+		return nil, fmt.Errorf("PodSandboxConfig.Name should not be empty")
+	}
+
+	namespace := req.GetConfig().GetMetadata().Namespace
+	attempt := req.GetConfig().GetMetadata().Attempt
+
+	id, name, err := s.generatePodIDandName(req.GetConfig())
+	if err != nil {
+		if strings.Contains(err.Error(), "already reserved for pod") {
+			matches := conflictRE.FindStringSubmatch(err.Error())
+			if len(matches) != 2 {
+				return nil, err
+			}
+			dupID := matches[1]
+			if _, err := s.StopPodSandbox(ctx, &pb.StopPodSandboxRequest{PodSandboxId: dupID}); err != nil {
+				return nil, err
+			}
+			if _, err := s.RemovePodSandbox(ctx, &pb.RemovePodSandboxRequest{PodSandboxId: dupID}); err != nil {
+				return nil, err
+			}
+			id, name, err = s.generatePodIDandName(req.GetConfig())
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			return nil, err
+		}
+	}
+
+	defer func() {
+		if err != nil {
+			s.ReleasePodName(name)
+		}
+	}()
+
+	_, containerName, err := s.generateContainerIDandNameForSandbox(req.GetConfig())
+	if err != nil {
+		return nil, err
+	}
+
+	defer func() {
+		if err != nil {
+			s.ReleaseContainerName(containerName)
+		}
+	}()
+
+	podContainer, err := s.StorageRuntimeServer().CreatePodSandbox(s.ImageContext(),
+		name, id,
+		s.config.PauseImage, "",
+		containerName,
+		req.GetConfig().GetMetadata().Name,
+		req.GetConfig().GetMetadata().Uid,
+		namespace,
+		attempt,
+		nil)
+	if errors.Cause(err) == storage.ErrDuplicateName {
+		return nil, fmt.Errorf("pod sandbox with name %q already exists", name)
+	}
+	if err != nil {
+		return nil, fmt.Errorf("error creating pod sandbox with name %q: %v", name, err)
+	}
+	defer func() {
+		if err != nil {
+			if err2 := s.StorageRuntimeServer().RemovePodSandbox(id); err2 != nil {
+				logrus.Warnf("couldn't cleanup pod sandbox %q: %v", id, err2)
+			}
+		}
+	}()
+
+	// TODO: factor generating/updating the spec into something other projects can vendor
+
+	// creates a spec Generator with the default spec.
+	g := generate.New()
+
+	// setup defaults for the pod sandbox
+	g.SetRootReadonly(true)
+	if s.config.PauseCommand == "" {
+		if podContainer.Config != nil {
+			g.SetProcessArgs(podContainer.Config.Config.Cmd)
+		} else {
+			g.SetProcessArgs([]string{sandbox.PodInfraCommand})
+		}
+	} else {
+		g.SetProcessArgs([]string{s.config.PauseCommand})
+	}
+
+	// set DNS options
+	if req.GetConfig().GetDnsConfig() != nil {
+		dnsServers := req.GetConfig().GetDnsConfig().Servers
+		dnsSearches := req.GetConfig().GetDnsConfig().Searches
+		dnsOptions := req.GetConfig().GetDnsConfig().Options
+		resolvPath = fmt.Sprintf("%s/resolv.conf", podContainer.RunDir)
+		err = parseDNSOptions(dnsServers, dnsSearches, dnsOptions, resolvPath)
+		if err != nil {
+			err1 := removeFile(resolvPath)
+			if err1 != nil {
+				err = err1
+				return nil, fmt.Errorf("%v; failed to remove %s: %v", err, resolvPath, err1)
+			}
+			return nil, err
+		}
+		if err := label.Relabel(resolvPath, mountLabel, true); err != nil && err != unix.ENOTSUP {
+			return nil, err
+		}
+
+		g.AddBindMount(resolvPath, "/etc/resolv.conf", []string{"ro"})
+	}
+
+	// add metadata
+	metadata := req.GetConfig().GetMetadata()
+	metadataJSON, err := json.Marshal(metadata)
+	if err != nil {
+		return nil, err
+	}
+
+	// add labels
+	labels := req.GetConfig().GetLabels()
+
+	// Add special container name label for the infra container
+	labelsJSON := []byte{}
+	if labels != nil {
+		labels[types.KubernetesContainerNameLabel] = leaky.PodInfraContainerName
+		labelsJSON, err = json.Marshal(labels)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	// add annotations
+	kubeAnnotations := req.GetConfig().GetAnnotations()
+	kubeAnnotationsJSON, err := json.Marshal(kubeAnnotations)
+	if err != nil {
+		return nil, err
+	}
+
+	// set log directory
+	logDir := req.GetConfig().LogDirectory
+	if logDir == "" {
+		logDir = filepath.Join(s.config.LogDir, id)
+	}
+	if err = os.MkdirAll(logDir, 0700); err != nil {
+		return nil, err
+	}
+	// This should always be absolute from k8s.
+	if !filepath.IsAbs(logDir) {
+		return nil, fmt.Errorf("requested logDir for sbox id %s is a relative path: %s", id, logDir)
+	}
+
+	privileged := s.privilegedSandbox(req)
+
+	securityContext := req.GetConfig().GetLinux().GetSecurityContext()
+	if securityContext == nil {
+		logrus.Warn("no security context found in config.")
+	}
+
+	processLabel, mountLabel, err = getSELinuxLabels(securityContext.GetSelinuxOptions(), privileged)
+	if err != nil {
+		return nil, err
+	}
+
+	// Don't use SELinux separation with Host Pid or IPC Namespace or privileged.
+	if securityContext.GetNamespaceOptions().GetHostPid() || securityContext.GetNamespaceOptions().GetHostIpc() {
+		processLabel, mountLabel = "", ""
+	}
+	g.SetProcessSelinuxLabel(processLabel)
+	g.SetLinuxMountLabel(mountLabel)
+
+	// create shm mount for the pod containers.
+	var shmPath string
+	if securityContext.GetNamespaceOptions().GetHostIpc() {
+		shmPath = "/dev/shm"
+	} else {
+		shmPath, err = setupShm(podContainer.RunDir, mountLabel)
+		if err != nil {
+			return nil, err
+		}
+		defer func() {
+			if err != nil {
+				if err2 := unix.Unmount(shmPath, unix.MNT_DETACH); err2 != nil {
+					logrus.Warnf("failed to unmount shm for pod: %v", err2)
+				}
+			}
+		}()
+	}
+
+	err = s.setPodSandboxMountLabel(id, mountLabel)
+	if err != nil {
+		return nil, err
+	}
+
+	if err = s.CtrIDIndex().Add(id); err != nil {
+		return nil, err
+	}
+
+	defer func() {
+		if err != nil {
+			if err2 := s.CtrIDIndex().Delete(id); err2 != nil {
+				logrus.Warnf("couldn't delete ctr id %s from idIndex", id)
+			}
+		}
+	}()
+
+	// set log path inside log directory
+	logPath := filepath.Join(logDir, id+".log")
+
+	// Handle https://issues.k8s.io/44043
+	if err := ensureSaneLogPath(logPath); err != nil {
+		return nil, err
+	}
+
+	hostNetwork := securityContext.GetNamespaceOptions().GetHostNetwork()
+
+	hostname, err := getHostname(id, req.GetConfig().Hostname, hostNetwork)
+	if err != nil {
+		return nil, err
+	}
+	g.SetHostname(hostname)
+
+	trusted := s.trustedSandbox(req)
+	g.AddAnnotation(annotations.Metadata, string(metadataJSON))
+	g.AddAnnotation(annotations.Labels, string(labelsJSON))
+	g.AddAnnotation(annotations.Annotations, string(kubeAnnotationsJSON))
+	g.AddAnnotation(annotations.LogPath, logPath)
+	g.AddAnnotation(annotations.Name, name)
+	g.AddAnnotation(annotations.ContainerType, annotations.ContainerTypeSandbox)
+	g.AddAnnotation(annotations.SandboxID, id)
+	g.AddAnnotation(annotations.ContainerName, containerName)
+	g.AddAnnotation(annotations.ContainerID, id)
+	g.AddAnnotation(annotations.ShmPath, shmPath)
+	g.AddAnnotation(annotations.PrivilegedRuntime, fmt.Sprintf("%v", privileged))
+	g.AddAnnotation(annotations.TrustedSandbox, fmt.Sprintf("%v", trusted))
+	g.AddAnnotation(annotations.ResolvPath, resolvPath)
+	g.AddAnnotation(annotations.HostName, hostname)
+	g.AddAnnotation(annotations.KubeName, kubeName)
+	if podContainer.Config.Config.StopSignal != "" {
+		// this key is defined in image-spec conversion document at https://github.com/opencontainers/image-spec/pull/492/files#diff-8aafbe2c3690162540381b8cdb157112R57
+		g.AddAnnotation("org.opencontainers.image.stopSignal", podContainer.Config.Config.StopSignal)
+	}
+
+	created := time.Now()
+	g.AddAnnotation(annotations.Created, created.Format(time.RFC3339Nano))
+
+	portMappings := convertPortMappings(req.GetConfig().GetPortMappings())
+
+	// setup cgroup settings
+	cgroupParent := req.GetConfig().GetLinux().GetCgroupParent()
+	if cgroupParent != "" {
+		if s.config.CgroupManager == oci.SystemdCgroupsManager {
+			if len(cgroupParent) <= 6 || !strings.HasSuffix(path.Base(cgroupParent), ".slice") {
+				return nil, fmt.Errorf("cri-o configured with systemd cgroup manager, but did not receive slice as parent: %s", cgroupParent)
+			}
+			cgPath, err := convertCgroupFsNameToSystemd(cgroupParent)
+			if err != nil {
+				return nil, err
+			}
+			g.SetLinuxCgroupsPath(cgPath + ":" + "crio" + ":" + id)
+			cgroupParent = cgPath
+		} else {
+			if strings.HasSuffix(path.Base(cgroupParent), ".slice") {
+				return nil, fmt.Errorf("cri-o configured with cgroupfs cgroup manager, but received systemd slice as parent: %s", cgroupParent)
+			}
+			cgPath := filepath.Join(cgroupParent, scopePrefix+"-"+id)
+			g.SetLinuxCgroupsPath(cgPath)
+		}
+	}
+
+	sb, err := sandbox.New(id, namespace, name, kubeName, logDir, labels, kubeAnnotations, processLabel, mountLabel, metadata, shmPath, cgroupParent, privileged, trusted, resolvPath, hostname, portMappings)
+	if err != nil {
+		return nil, err
+	}
+
+	s.addSandbox(sb)
+	defer func() {
+		if err != nil {
+			s.removeSandbox(id)
+		}
+	}()
+
+	if err = s.PodIDIndex().Add(id); err != nil {
+		return nil, err
+	}
+
+	defer func() {
+		if err != nil {
+			if err := s.PodIDIndex().Delete(id); err != nil {
+				logrus.Warnf("couldn't delete pod id %s from idIndex", id)
+			}
+		}
+	}()
+
+	for k, v := range kubeAnnotations {
+		g.AddAnnotation(k, v)
+	}
+	for k, v := range labels {
+		g.AddAnnotation(k, v)
+	}
+
+	// extract linux sysctls from annotations and pass down to oci runtime
+	safe, unsafe, err := SysctlsFromPodAnnotations(kubeAnnotations)
+	if err != nil {
+		return nil, err
+	}
+	for _, sysctl := range safe {
+		g.AddLinuxSysctl(sysctl.Name, sysctl.Value)
+	}
+	for _, sysctl := range unsafe {
+		g.AddLinuxSysctl(sysctl.Name, sysctl.Value)
+	}
+
+	// Set OOM score adjust of the infra container to be very low
+	// so it doesn't get killed.
+	g.SetProcessOOMScoreAdj(PodInfraOOMAdj)
+
+	g.SetLinuxResourcesCPUShares(PodInfraCPUshares)
+
+	// set up namespaces
+	if hostNetwork {
+		err = g.RemoveLinuxNamespace(string(runtimespec.NetworkNamespace))
+		if err != nil {
+			return nil, err
+		}
+	} else {
+		// Create the sandbox network namespace
+		if err = sb.NetNsCreate(); err != nil {
+			return nil, err
+		}
+
+		defer func() {
+			if err == nil {
+				return
+			}
+
+			if netnsErr := sb.NetNsRemove(); netnsErr != nil {
+				logrus.Warnf("Failed to remove networking namespace: %v", netnsErr)
+			}
+		}()
+
+		// Pass the created namespace path to the runtime
+		err = g.AddOrReplaceLinuxNamespace(string(runtimespec.NetworkNamespace), sb.NetNsPath())
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	if securityContext.GetNamespaceOptions().GetHostPid() {
+		err = g.RemoveLinuxNamespace(string(runtimespec.PIDNamespace))
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	if securityContext.GetNamespaceOptions().GetHostIpc() {
+		err = g.RemoveLinuxNamespace(string(runtimespec.IPCNamespace))
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	if !s.seccompEnabled {
+		g.Spec().Linux.Seccomp = nil
+	}
+
+	saveOptions := generate.ExportOptions{}
+	mountPoint, err := s.StorageRuntimeServer().StartContainer(id)
+	if err != nil {
+		return nil, fmt.Errorf("failed to mount container %s in pod sandbox %s(%s): %v", containerName, sb.Name(), id, err)
+	}
+	g.AddAnnotation(annotations.MountPoint, mountPoint)
+	g.SetRootPath(mountPoint)
+
+	hostnamePath := fmt.Sprintf("%s/hostname", podContainer.RunDir)
+	if err := ioutil.WriteFile(hostnamePath, []byte(hostname+"\n"), 0644); err != nil {
+		return nil, err
+	}
+	if err := label.Relabel(hostnamePath, mountLabel, true); err != nil && err != unix.ENOTSUP {
+		return nil, err
+	}
+	g.AddBindMount(hostnamePath, "/etc/hostname", []string{"ro"})
+	g.AddAnnotation(annotations.HostnamePath, hostnamePath)
+	sb.AddHostnamePath(hostnamePath)
+
+	container, err := oci.NewContainer(id, containerName, podContainer.RunDir, logPath, sb.NetNs(), labels, g.Spec().Annotations, kubeAnnotations, "", "", "", nil, id, false, false, false, sb.Privileged(), sb.Trusted(), podContainer.Dir, created, podContainer.Config.Config.StopSignal)
+	if err != nil {
+		return nil, err
+	}
+	container.SetSpec(g.Spec())
+	container.SetMountPoint(mountPoint)
+
+	sb.SetInfraContainer(container)
+
+	var ip string
+	ip, err = s.networkStart(hostNetwork, sb)
+	if err != nil {
+		return nil, err
+	}
+	defer func() {
+		if err != nil {
+			s.networkStop(hostNetwork, sb)
+		}
+	}()
+
+	g.AddAnnotation(annotations.IP, ip)
+	sb.AddIP(ip)
+
+	err = g.SaveToFile(filepath.Join(podContainer.Dir, "config.json"), saveOptions)
+	if err != nil {
+		return nil, fmt.Errorf("failed to save template configuration for pod sandbox %s(%s): %v", sb.Name(), id, err)
+	}
+	if err = g.SaveToFile(filepath.Join(podContainer.RunDir, "config.json"), saveOptions); err != nil {
+		return nil, fmt.Errorf("failed to write runtime configuration for pod sandbox %s(%s): %v", sb.Name(), id, err)
+	}
+
+	if err = s.runContainer(container, sb.CgroupParent()); err != nil {
+		return nil, err
+	}
+
+	s.addInfraContainer(container)
+
+	s.ContainerStateToDisk(container)
+
+	resp = &pb.RunPodSandboxResponse{PodSandboxId: id}
+	logrus.Debugf("RunPodSandboxResponse: %+v", resp)
+	return resp, nil
+}
+
+func convertPortMappings(in []*pb.PortMapping) []*hostport.PortMapping {
+	if in == nil {
+		return nil
+	}
+	out := make([]*hostport.PortMapping, len(in))
+	for i, v := range in {
+		out[i] = &hostport.PortMapping{
+			HostPort:      v.HostPort,
+			ContainerPort: v.ContainerPort,
+			Protocol:      v1.Protocol(v.Protocol.String()),
+			HostIP:        v.HostIp,
+		}
+	}
+	return out
+}
+
+func getHostname(id, hostname string, hostNetwork bool) (string, error) {
+	if hostNetwork {
+		if hostname == "" {
+			h, err := os.Hostname()
+			if err != nil {
+				return "", err
+			}
+			hostname = h
+		}
+	} else {
+		if hostname == "" {
+			hostname = id[:12]
+		}
+	}
+	return hostname, nil
+}
+
+func (s *Server) setPodSandboxMountLabel(id, mountLabel string) error {
+	storageMetadata, err := s.StorageRuntimeServer().GetContainerMetadata(id)
+	if err != nil {
+		return err
+	}
+	storageMetadata.SetMountLabel(mountLabel)
+	return s.StorageRuntimeServer().SetContainerMetadata(id, storageMetadata)
+}
+
+func getSELinuxLabels(selinuxOptions *pb.SELinuxOption, privileged bool) (processLabel string, mountLabel string, err error) {
+	if privileged {
+		return "", "", nil
+	}
+	labels := []string{}
+	if selinuxOptions != nil {
+		if selinuxOptions.User != "" {
+			labels = append(labels, "user:"+selinuxOptions.User)
+		}
+		if selinuxOptions.Role != "" {
+			labels = append(labels, "role:"+selinuxOptions.Role)
+		}
+		if selinuxOptions.Type != "" {
+			labels = append(labels, "type:"+selinuxOptions.Type)
+		}
+		if selinuxOptions.Level != "" {
+			labels = append(labels, "level:"+selinuxOptions.Level)
+		}
+	}
+	return label.InitLabels(labels)
+}
+
+func setupShm(podSandboxRunDir, mountLabel string) (shmPath string, err error) {
+	shmPath = filepath.Join(podSandboxRunDir, "shm")
+	if err = os.Mkdir(shmPath, 0700); err != nil {
+		return "", err
+	}
+	shmOptions := "mode=1777,size=" + strconv.Itoa(sandbox.DefaultShmSize)
+	if err = unix.Mount("shm", shmPath, "tmpfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV,
+		label.FormatMountLabel(shmOptions, mountLabel)); err != nil {
+		return "", fmt.Errorf("failed to mount shm tmpfs for pod: %v", err)
+	}
+	return shmPath, nil
+}
+
+// convertCgroupFsNameToSystemd converts an expanded cgroupfs name to its systemd name.
+// For example, it will convert test.slice/test-a.slice/test-a-b.slice to become test-a-b.slice
+// NOTE: this is public right now to allow its usage in dockermanager and dockershim, ideally both those
+// code areas could use something from libcontainer if we get this style function upstream.
+func convertCgroupFsNameToSystemd(cgroupfsName string) (string, error) {
+	// TODO: see if libcontainer systemd implementation could use something similar, and if so, move
+	// this function up to that library. At that time, it would most likely do validation specific to systemd
+	// above and beyond the simple assumption here that the base of the path encodes the hierarchy
+	// per systemd convention.
+	return path.Base(cgroupfsName), nil
+}