diff options
Diffstat (limited to 'libpod')
105 files changed, 9435 insertions, 7368 deletions
diff --git a/libpod/boltdb_state.go b/libpod/boltdb_state.go index 81f11410b..4fd95a3cf 100644 --- a/libpod/boltdb_state.go +++ b/libpod/boltdb_state.go @@ -85,7 +85,7 @@ func NewBoltState(path string, runtime *Runtime) (State, error) { db, err := bolt.Open(path, 0600, nil) if err != nil { - return nil, fmt.Errorf("error opening database %s: %w", path, err) + return nil, fmt.Errorf("opening database %s: %w", path, err) } // Everywhere else, we use s.deferredCloseDBCon(db) to ensure the state's DB // mutex is also unlocked. @@ -123,7 +123,7 @@ func NewBoltState(path string, runtime *Runtime) (State, error) { return nil }) if err != nil { - return nil, fmt.Errorf("error checking DB schema: %w", err) + return nil, fmt.Errorf("checking DB schema: %w", err) } if !needsUpdate { @@ -135,13 +135,13 @@ func NewBoltState(path string, runtime *Runtime) (State, error) { err = db.Update(func(tx *bolt.Tx) error { for _, bkt := range createBuckets { if _, err := tx.CreateBucketIfNotExists(bkt); err != nil { - return fmt.Errorf("error creating bucket %s: %w", string(bkt), err) + return fmt.Errorf("creating bucket %s: %w", string(bkt), err) } } return nil }) if err != nil { - return nil, fmt.Errorf("error creating buckets for DB: %w", err) + return nil, fmt.Errorf("creating buckets for DB: %w", err) } state.valid = true @@ -220,11 +220,11 @@ func (s *BoltState) Refresh() error { return nil }) if err != nil { - return fmt.Errorf("error reading exit codes bucket: %w", err) + return fmt.Errorf("reading exit codes bucket: %w", err) } for _, id := range toRemoveExitCodes { if err := exitCodeBucket.Delete([]byte(id)); err != nil { - return fmt.Errorf("error removing exit code for ID %s: %w", id, err) + return fmt.Errorf("removing exit code for ID %s: %w", id, err) } } @@ -276,7 +276,7 @@ func (s *BoltState) Refresh() error { state := new(podState) if err := json.Unmarshal(stateBytes, state); err != nil { - return fmt.Errorf("error unmarshalling state for pod %s: %w", string(id), err) + return fmt.Errorf("unmarshalling state for pod %s: %w", string(id), err) } // Clear the Cgroup path @@ -284,11 +284,11 @@ func (s *BoltState) Refresh() error { newStateBytes, err := json.Marshal(state) if err != nil { - return fmt.Errorf("error marshalling modified state for pod %s: %w", string(id), err) + return fmt.Errorf("marshalling modified state for pod %s: %w", string(id), err) } if err := podBkt.Put(stateKey, newStateBytes); err != nil { - return fmt.Errorf("error updating state for pod %s in DB: %w", string(id), err) + return fmt.Errorf("updating state for pod %s in DB: %w", string(id), err) } // It's not a container, nothing to do @@ -297,7 +297,7 @@ func (s *BoltState) Refresh() error { // First, delete the network namespace if err := ctrBkt.Delete(netNSKey); err != nil { - return fmt.Errorf("error removing network namespace for container %s: %w", string(id), err) + return fmt.Errorf("removing network namespace for container %s: %w", string(id), err) } stateBytes := ctrBkt.Get(stateKey) @@ -309,18 +309,18 @@ func (s *BoltState) Refresh() error { state := new(ContainerState) if err := json.Unmarshal(stateBytes, state); err != nil { - return fmt.Errorf("error unmarshalling state for container %s: %w", string(id), err) + return fmt.Errorf("unmarshalling state for container %s: %w", string(id), err) } resetState(state) newStateBytes, err := json.Marshal(state) if err != nil { - return fmt.Errorf("error marshalling modified state for container %s: %w", string(id), err) + return fmt.Errorf("marshalling modified state for container %s: %w", string(id), err) } if err := ctrBkt.Put(stateKey, newStateBytes); err != nil { - return fmt.Errorf("error updating state for container %s in DB: %w", string(id), err) + return fmt.Errorf("updating state for container %s in DB: %w", string(id), err) } // Delete all exec sessions, if there are any @@ -338,7 +338,7 @@ func (s *BoltState) Refresh() error { } for _, execID := range toRemove { if err := ctrExecBkt.Delete([]byte(execID)); err != nil { - return fmt.Errorf("error removing exec session %s from container %s: %w", execID, string(id), err) + return fmt.Errorf("removing exec session %s from container %s: %w", execID, string(id), err) } } } @@ -358,12 +358,12 @@ func (s *BoltState) Refresh() error { if testID := namesBucket.Get(name); testID != nil { logrus.Infof("Found dangling name %s (ID %s) in database", string(name), id) if err := namesBucket.Delete(name); err != nil { - return fmt.Errorf("error removing dangling name %s (ID %s) from database: %w", string(name), id, err) + return fmt.Errorf("removing dangling name %s (ID %s) from database: %w", string(name), id, err) } } } if err := idBucket.Delete([]byte(id)); err != nil { - return fmt.Errorf("error removing dangling ID %s from database: %w", id, err) + return fmt.Errorf("removing dangling ID %s from database: %w", id, err) } } @@ -384,7 +384,7 @@ func (s *BoltState) Refresh() error { oldState := new(VolumeState) if err := json.Unmarshal(volStateBytes, oldState); err != nil { - return fmt.Errorf("error unmarshalling state for volume %s: %w", string(id), err) + return fmt.Errorf("unmarshalling state for volume %s: %w", string(id), err) } // Reset mount count to 0 @@ -393,11 +393,11 @@ func (s *BoltState) Refresh() error { newState, err := json.Marshal(oldState) if err != nil { - return fmt.Errorf("error marshalling state for volume %s: %w", string(id), err) + return fmt.Errorf("marshalling state for volume %s: %w", string(id), err) } if err := dbVol.Put(stateKey, newState); err != nil { - return fmt.Errorf("error storing new state for volume %s: %w", string(id), err) + return fmt.Errorf("storing new state for volume %s: %w", string(id), err) } return nil @@ -421,7 +421,7 @@ func (s *BoltState) Refresh() error { for _, execSession := range toRemoveExec { if err := execBucket.Delete([]byte(execSession)); err != nil { - return fmt.Errorf("error deleting exec session %s registry from database: %w", execSession, err) + return fmt.Errorf("deleting exec session %s registry from database: %w", execSession, err) } } @@ -841,7 +841,7 @@ func (s *BoltState) UpdateContainer(ctr *Container) error { } if err := json.Unmarshal(newStateBytes, newState); err != nil { - return fmt.Errorf("error unmarshalling container %s state: %w", ctr.ID(), err) + return fmt.Errorf("unmarshalling container %s state: %w", ctr.ID(), err) } netNSBytes := ctrToUpdate.Get(netNSKey) @@ -886,7 +886,7 @@ func (s *BoltState) SaveContainer(ctr *Container) error { stateJSON, err := json.Marshal(ctr.state) if err != nil { - return fmt.Errorf("error marshalling container %s state to JSON: %w", ctr.ID(), err) + return fmt.Errorf("marshalling container %s state to JSON: %w", ctr.ID(), err) } netNSPath := getNetNSPath(ctr) @@ -912,17 +912,17 @@ func (s *BoltState) SaveContainer(ctr *Container) error { // Update the state if err := ctrToSave.Put(stateKey, stateJSON); err != nil { - return fmt.Errorf("error updating container %s state in DB: %w", ctr.ID(), err) + return fmt.Errorf("updating container %s state in DB: %w", ctr.ID(), err) } if netNSPath != "" { if err := ctrToSave.Put(netNSKey, []byte(netNSPath)); err != nil { - return fmt.Errorf("error updating network namespace path for container %s in DB: %w", ctr.ID(), err) + return fmt.Errorf("updating network namespace path for container %s in DB: %w", ctr.ID(), err) } } else { // Delete the existing network namespace if err := ctrToSave.Delete(netNSKey); err != nil { - return fmt.Errorf("error removing network namespace path for container %s in DB: %w", ctr.ID(), err) + return fmt.Errorf("removing network namespace path for container %s in DB: %w", ctr.ID(), err) } } @@ -1142,7 +1142,7 @@ func (s *BoltState) GetNetworks(ctr *Container) (map[string]types.PerNetworkOpti if ctrNetworkBkt == nil { ctrNetworkBkt, err = dbCtr.CreateBucket(networksBkt) if err != nil { - return fmt.Errorf("error creating networks bucket for container %s: %w", ctr.ID(), err) + return fmt.Errorf("creating networks bucket for container %s: %w", ctr.ID(), err) } // the container has no networks in the db lookup config and write to the db networkList = ctr.config.NetworksDeprecated @@ -1249,7 +1249,7 @@ func (s *BoltState) NetworkConnect(ctr *Container, network string, opts types.Pe optBytes, err := json.Marshal(opts) if err != nil { - return fmt.Errorf("error marshalling network options JSON for container %s: %w", ctr.ID(), err) + return fmt.Errorf("marshalling network options JSON for container %s: %w", ctr.ID(), err) } ctrID := []byte(ctr.ID()) @@ -1278,12 +1278,12 @@ func (s *BoltState) NetworkConnect(ctr *Container, network string, opts types.Pe } netConnected := ctrNetworksBkt.Get([]byte(network)) if netConnected != nil { - return fmt.Errorf("container %s is already connected to network %q: %w", ctr.ID(), network, define.ErrNetworkExists) + return fmt.Errorf("container %s is already connected to network %q: %w", ctr.ID(), network, define.ErrNetworkConnected) } // Add the network if err := ctrNetworksBkt.Put([]byte(network), optBytes); err != nil { - return fmt.Errorf("error adding container %s to network %s in DB: %w", ctr.ID(), network, err) + return fmt.Errorf("adding container %s to network %s in DB: %w", ctr.ID(), network, err) } return nil @@ -1340,7 +1340,7 @@ func (s *BoltState) NetworkDisconnect(ctr *Container, network string) error { } if err := ctrNetworksBkt.Delete([]byte(network)); err != nil { - return fmt.Errorf("error removing container %s from network %s: %w", ctr.ID(), network, err) + return fmt.Errorf("removing container %s from network %s: %w", ctr.ID(), network, err) } if ctrAliasesBkt != nil { @@ -1350,7 +1350,7 @@ func (s *BoltState) NetworkDisconnect(ctr *Container, network string) error { } if err := ctrAliasesBkt.DeleteBucket([]byte(network)); err != nil { - return fmt.Errorf("error removing container %s network aliases for network %s: %w", ctr.ID(), network, err) + return fmt.Errorf("removing container %s network aliases for network %s: %w", ctr.ID(), network, err) } } @@ -1626,7 +1626,7 @@ func (s *BoltState) AddExecSession(ctr *Container, session *ExecSession) error { ctrExecSessionBucket, err := dbCtr.CreateBucketIfNotExists(execBkt) if err != nil { - return fmt.Errorf("error creating exec sessions bucket for container %s: %w", ctr.ID(), err) + return fmt.Errorf("creating exec sessions bucket for container %s: %w", ctr.ID(), err) } execExists := execBucket.Get(sessionID) @@ -1635,11 +1635,11 @@ func (s *BoltState) AddExecSession(ctr *Container, session *ExecSession) error { } if err := execBucket.Put(sessionID, ctrID); err != nil { - return fmt.Errorf("error adding exec session %s to DB: %w", session.ID(), err) + return fmt.Errorf("adding exec session %s to DB: %w", session.ID(), err) } if err := ctrExecSessionBucket.Put(sessionID, ctrID); err != nil { - return fmt.Errorf("error adding exec session %s to container %s in DB: %w", session.ID(), ctr.ID(), err) + return fmt.Errorf("adding exec session %s to container %s in DB: %w", session.ID(), ctr.ID(), err) } return nil @@ -1716,7 +1716,7 @@ func (s *BoltState) RemoveExecSession(session *ExecSession) error { } if err := execBucket.Delete(sessionID); err != nil { - return fmt.Errorf("error removing exec session %s from database: %w", session.ID(), err) + return fmt.Errorf("removing exec session %s from database: %w", session.ID(), err) } dbCtr := ctrBucket.Bucket(containerID) @@ -1739,7 +1739,7 @@ func (s *BoltState) RemoveExecSession(session *ExecSession) error { ctrSessionExists := ctrExecBucket.Get(sessionID) if ctrSessionExists != nil { if err := ctrExecBucket.Delete(sessionID); err != nil { - return fmt.Errorf("error removing exec session %s from container %s in database: %w", session.ID(), session.ContainerID(), err) + return fmt.Errorf("removing exec session %s from container %s in database: %w", session.ID(), session.ContainerID(), err) } } @@ -1847,7 +1847,7 @@ func (s *BoltState) RemoveContainerExecSessions(ctr *Container) error { for _, session := range sessions { if err := ctrExecSessions.Delete([]byte(session)); err != nil { - return fmt.Errorf("error removing container %s exec session %s from database: %w", ctr.ID(), session, err) + return fmt.Errorf("removing container %s exec session %s from database: %w", ctr.ID(), session, err) } // Check if the session exists in the global table // before removing. It should, but in cases where the DB @@ -1861,7 +1861,7 @@ func (s *BoltState) RemoveContainerExecSessions(ctr *Container) error { return fmt.Errorf("database mismatch: exec session %s is associated with containers %s and %s: %w", session, ctr.ID(), string(sessionExists), define.ErrInternal) } if err := execBucket.Delete([]byte(session)); err != nil { - return fmt.Errorf("error removing container %s exec session %s from exec sessions: %w", ctr.ID(), session, err) + return fmt.Errorf("removing container %s exec session %s from exec sessions: %w", ctr.ID(), session, err) } } @@ -1884,7 +1884,7 @@ func (s *BoltState) RewriteContainerConfig(ctr *Container, newCfg *ContainerConf newCfgJSON, err := json.Marshal(newCfg) if err != nil { - return fmt.Errorf("error marshalling new configuration JSON for container %s: %w", ctr.ID(), err) + return fmt.Errorf("marshalling new configuration JSON for container %s: %w", ctr.ID(), err) } db, err := s.getDBCon() @@ -1906,7 +1906,7 @@ func (s *BoltState) RewriteContainerConfig(ctr *Container, newCfg *ContainerConf } if err := ctrDB.Put(configKey, newCfgJSON); err != nil { - return fmt.Errorf("error updating container %s config JSON: %w", ctr.ID(), err) + return fmt.Errorf("updating container %s config JSON: %w", ctr.ID(), err) } return nil @@ -1937,7 +1937,7 @@ func (s *BoltState) SafeRewriteContainerConfig(ctr *Container, oldName, newName newCfgJSON, err := json.Marshal(newCfg) if err != nil { - return fmt.Errorf("error marshalling new configuration JSON for container %s: %w", ctr.ID(), err) + return fmt.Errorf("marshalling new configuration JSON for container %s: %w", ctr.ID(), err) } db, err := s.getDBCon() @@ -1978,16 +1978,16 @@ func (s *BoltState) SafeRewriteContainerConfig(ctr *Container, oldName, newName // buckets are ID-indexed so we just need to // overwrite the values there. if err := namesBkt.Delete([]byte(oldName)); err != nil { - return fmt.Errorf("error deleting container %s old name from DB for rename: %w", ctr.ID(), err) + return fmt.Errorf("deleting container %s old name from DB for rename: %w", ctr.ID(), err) } if err := idBkt.Put([]byte(ctr.ID()), []byte(newName)); err != nil { - return fmt.Errorf("error renaming container %s in ID bucket in DB: %w", ctr.ID(), err) + return fmt.Errorf("renaming container %s in ID bucket in DB: %w", ctr.ID(), err) } if err := namesBkt.Put([]byte(newName), []byte(ctr.ID())); err != nil { - return fmt.Errorf("error adding new name %s for container %s in DB: %w", newName, ctr.ID(), err) + return fmt.Errorf("adding new name %s for container %s in DB: %w", newName, ctr.ID(), err) } if err := allCtrsBkt.Put([]byte(ctr.ID()), []byte(newName)); err != nil { - return fmt.Errorf("error renaming container %s in all containers bucket in DB: %w", ctr.ID(), err) + return fmt.Errorf("renaming container %s in all containers bucket in DB: %w", ctr.ID(), err) } if ctr.config.Pod != "" { podsBkt, err := getPodBucket(tx) @@ -2003,7 +2003,7 @@ func (s *BoltState) SafeRewriteContainerConfig(ctr *Container, oldName, newName return fmt.Errorf("pod %s does not have a containers bucket: %w", ctr.config.Pod, define.ErrInternal) } if err := podCtrBkt.Put([]byte(ctr.ID()), []byte(newName)); err != nil { - return fmt.Errorf("error renaming container %s in pod %s members bucket: %w", ctr.ID(), ctr.config.Pod, err) + return fmt.Errorf("renaming container %s in pod %s members bucket: %w", ctr.ID(), ctr.config.Pod, err) } } } @@ -2021,7 +2021,7 @@ func (s *BoltState) SafeRewriteContainerConfig(ctr *Container, oldName, newName } if err := ctrDB.Put(configKey, newCfgJSON); err != nil { - return fmt.Errorf("error updating container %s config JSON: %w", ctr.ID(), err) + return fmt.Errorf("updating container %s config JSON: %w", ctr.ID(), err) } return nil @@ -2043,7 +2043,7 @@ func (s *BoltState) RewritePodConfig(pod *Pod, newCfg *PodConfig) error { newCfgJSON, err := json.Marshal(newCfg) if err != nil { - return fmt.Errorf("error marshalling new configuration JSON for pod %s: %w", pod.ID(), err) + return fmt.Errorf("marshalling new configuration JSON for pod %s: %w", pod.ID(), err) } db, err := s.getDBCon() @@ -2065,7 +2065,7 @@ func (s *BoltState) RewritePodConfig(pod *Pod, newCfg *PodConfig) error { } if err := podDB.Put(configKey, newCfgJSON); err != nil { - return fmt.Errorf("error updating pod %s config JSON: %w", pod.ID(), err) + return fmt.Errorf("updating pod %s config JSON: %w", pod.ID(), err) } return nil @@ -2087,7 +2087,7 @@ func (s *BoltState) RewriteVolumeConfig(volume *Volume, newCfg *VolumeConfig) er newCfgJSON, err := json.Marshal(newCfg) if err != nil { - return fmt.Errorf("error marshalling new configuration JSON for volume %q: %w", volume.Name(), err) + return fmt.Errorf("marshalling new configuration JSON for volume %q: %w", volume.Name(), err) } db, err := s.getDBCon() @@ -2109,7 +2109,7 @@ func (s *BoltState) RewriteVolumeConfig(volume *Volume, newCfg *VolumeConfig) er } if err := volDB.Put(configKey, newCfgJSON); err != nil { - return fmt.Errorf("error updating volume %q config JSON: %w", volume.Name(), err) + return fmt.Errorf("updating volume %q config JSON: %w", volume.Name(), err) } return nil @@ -2522,7 +2522,7 @@ func (s *BoltState) AddVolume(volume *Volume) error { volConfigJSON, err := json.Marshal(volume.config) if err != nil { - return fmt.Errorf("error marshalling volume %s config to JSON: %w", volume.Name(), err) + return fmt.Errorf("marshalling volume %s config to JSON: %w", volume.Name(), err) } // Volume state is allowed to not exist @@ -2530,7 +2530,7 @@ func (s *BoltState) AddVolume(volume *Volume) error { if volume.state != nil { volStateJSON, err = json.Marshal(volume.state) if err != nil { - return fmt.Errorf("error marshalling volume %s state to JSON: %w", volume.Name(), err) + return fmt.Errorf("marshalling volume %s state to JSON: %w", volume.Name(), err) } } @@ -2561,27 +2561,27 @@ func (s *BoltState) AddVolume(volume *Volume) error { // Make a bucket for it newVol, err := volBkt.CreateBucket(volName) if err != nil { - return fmt.Errorf("error creating bucket for volume %s: %w", volume.Name(), err) + return fmt.Errorf("creating bucket for volume %s: %w", volume.Name(), err) } // Make a subbucket for the containers using the volume. Dependent container IDs will be addedremoved to // this bucket in addcontainer/removeContainer if _, err := newVol.CreateBucket(volDependenciesBkt); err != nil { - return fmt.Errorf("error creating bucket for containers using volume %s: %w", volume.Name(), err) + return fmt.Errorf("creating bucket for containers using volume %s: %w", volume.Name(), err) } if err := newVol.Put(configKey, volConfigJSON); err != nil { - return fmt.Errorf("error storing volume %s configuration in DB: %w", volume.Name(), err) + return fmt.Errorf("storing volume %s configuration in DB: %w", volume.Name(), err) } if volStateJSON != nil { if err := newVol.Put(stateKey, volStateJSON); err != nil { - return fmt.Errorf("error storing volume %s state in DB: %w", volume.Name(), err) + return fmt.Errorf("storing volume %s state in DB: %w", volume.Name(), err) } } if err := allVolsBkt.Put(volName, volName); err != nil { - return fmt.Errorf("error storing volume %s in all volumes bucket in DB: %w", volume.Name(), err) + return fmt.Errorf("storing volume %s in all volumes bucket in DB: %w", volume.Name(), err) } return nil @@ -2650,7 +2650,7 @@ func (s *BoltState) RemoveVolume(volume *Volume) error { return nil }) if err != nil { - return fmt.Errorf("error getting list of dependencies from dependencies bucket for volumes %q: %w", volume.Name(), err) + return fmt.Errorf("getting list of dependencies from dependencies bucket for volumes %q: %w", volume.Name(), err) } if len(deps) > 0 { return fmt.Errorf("volume %s is being used by container(s) %s: %w", volume.Name(), strings.Join(deps, ","), define.ErrVolumeBeingUsed) @@ -2660,10 +2660,10 @@ func (s *BoltState) RemoveVolume(volume *Volume) error { // volume is ready for removal // Let's kick it out if err := allVolsBkt.Delete(volName); err != nil { - return fmt.Errorf("error removing volume %s from all volumes bucket in DB: %w", volume.Name(), err) + return fmt.Errorf("removing volume %s from all volumes bucket in DB: %w", volume.Name(), err) } if err := volBkt.DeleteBucket(volName); err != nil { - return fmt.Errorf("error removing volume %s from DB: %w", volume.Name(), err) + return fmt.Errorf("removing volume %s from DB: %w", volume.Name(), err) } return nil @@ -2710,7 +2710,7 @@ func (s *BoltState) UpdateVolume(volume *Volume) error { } if err := json.Unmarshal(stateBytes, newState); err != nil { - return fmt.Errorf("error unmarshalling volume %s state: %w", volume.Name(), err) + return fmt.Errorf("unmarshalling volume %s state: %w", volume.Name(), err) } return nil @@ -2740,7 +2740,7 @@ func (s *BoltState) SaveVolume(volume *Volume) error { if volume.state != nil { stateJSON, err := json.Marshal(volume.state) if err != nil { - return fmt.Errorf("error marshalling volume %s state to JSON: %w", volume.Name(), err) + return fmt.Errorf("marshalling volume %s state to JSON: %w", volume.Name(), err) } newStateJSON = stateJSON } @@ -3060,12 +3060,12 @@ func (s *BoltState) AddPod(pod *Pod) error { podConfigJSON, err := json.Marshal(pod.config) if err != nil { - return fmt.Errorf("error marshalling pod %s config to JSON: %w", pod.ID(), err) + return fmt.Errorf("marshalling pod %s config to JSON: %w", pod.ID(), err) } podStateJSON, err := json.Marshal(pod.state) if err != nil { - return fmt.Errorf("error marshalling pod %s state to JSON: %w", pod.ID(), err) + return fmt.Errorf("marshalling pod %s state to JSON: %w", pod.ID(), err) } db, err := s.getDBCon() @@ -3122,40 +3122,40 @@ func (s *BoltState) AddPod(pod *Pod) error { // Make a bucket for it newPod, err := podBkt.CreateBucket(podID) if err != nil { - return fmt.Errorf("error creating bucket for pod %s: %w", pod.ID(), err) + return fmt.Errorf("creating bucket for pod %s: %w", pod.ID(), err) } // Make a subbucket for pod containers if _, err := newPod.CreateBucket(containersBkt); err != nil { - return fmt.Errorf("error creating bucket for pod %s containers: %w", pod.ID(), err) + return fmt.Errorf("creating bucket for pod %s containers: %w", pod.ID(), err) } if err := newPod.Put(configKey, podConfigJSON); err != nil { - return fmt.Errorf("error storing pod %s configuration in DB: %w", pod.ID(), err) + return fmt.Errorf("storing pod %s configuration in DB: %w", pod.ID(), err) } if err := newPod.Put(stateKey, podStateJSON); err != nil { - return fmt.Errorf("error storing pod %s state JSON in DB: %w", pod.ID(), err) + return fmt.Errorf("storing pod %s state JSON in DB: %w", pod.ID(), err) } if podNamespace != nil { if err := newPod.Put(namespaceKey, podNamespace); err != nil { - return fmt.Errorf("error storing pod %s namespace in DB: %w", pod.ID(), err) + return fmt.Errorf("storing pod %s namespace in DB: %w", pod.ID(), err) } if err := nsBkt.Put(podID, podNamespace); err != nil { - return fmt.Errorf("error storing pod %s namespace in DB: %w", pod.ID(), err) + return fmt.Errorf("storing pod %s namespace in DB: %w", pod.ID(), err) } } // Add us to the ID and names buckets if err := idsBkt.Put(podID, podName); err != nil { - return fmt.Errorf("error storing pod %s ID in DB: %w", pod.ID(), err) + return fmt.Errorf("storing pod %s ID in DB: %w", pod.ID(), err) } if err := namesBkt.Put(podName, podID); err != nil { - return fmt.Errorf("error storing pod %s name in DB: %w", pod.Name(), err) + return fmt.Errorf("storing pod %s name in DB: %w", pod.Name(), err) } if err := allPodsBkt.Put(podID, podName); err != nil { - return fmt.Errorf("error storing pod %s in all pods bucket in DB: %w", pod.ID(), err) + return fmt.Errorf("storing pod %s in all pods bucket in DB: %w", pod.ID(), err) } return nil @@ -3240,19 +3240,19 @@ func (s *BoltState) RemovePod(pod *Pod) error { // Pod is empty, and ready for removal // Let's kick it out if err := idsBkt.Delete(podID); err != nil { - return fmt.Errorf("error removing pod %s ID from DB: %w", pod.ID(), err) + return fmt.Errorf("removing pod %s ID from DB: %w", pod.ID(), err) } if err := namesBkt.Delete(podName); err != nil { - return fmt.Errorf("error removing pod %s name (%s) from DB: %w", pod.ID(), pod.Name(), err) + return fmt.Errorf("removing pod %s name (%s) from DB: %w", pod.ID(), pod.Name(), err) } if err := nsBkt.Delete(podID); err != nil { - return fmt.Errorf("error removing pod %s namespace from DB: %w", pod.ID(), err) + return fmt.Errorf("removing pod %s namespace from DB: %w", pod.ID(), err) } if err := allPodsBkt.Delete(podID); err != nil { - return fmt.Errorf("error removing pod %s ID from all pods bucket in DB: %w", pod.ID(), err) + return fmt.Errorf("removing pod %s ID from all pods bucket in DB: %w", pod.ID(), err) } if err := podBkt.DeleteBucket(podID); err != nil { - return fmt.Errorf("error removing pod %s from DB: %w", pod.ID(), err) + return fmt.Errorf("removing pod %s from DB: %w", pod.ID(), err) } return nil @@ -3353,19 +3353,19 @@ func (s *BoltState) RemovePodContainers(pod *Pod) error { // Dependencies are set, we're clear to remove if err := ctrBkt.DeleteBucket(id); err != nil { - return fmt.Errorf("error deleting container %s from DB: %w", string(id), define.ErrInternal) + return fmt.Errorf("deleting container %s from DB: %w", string(id), define.ErrInternal) } if err := idsBkt.Delete(id); err != nil { - return fmt.Errorf("error deleting container %s ID in DB: %w", string(id), err) + return fmt.Errorf("deleting container %s ID in DB: %w", string(id), err) } if err := namesBkt.Delete(name); err != nil { - return fmt.Errorf("error deleting container %s name in DB: %w", string(id), err) + return fmt.Errorf("deleting container %s name in DB: %w", string(id), err) } if err := allCtrsBkt.Delete(id); err != nil { - return fmt.Errorf("error deleting container %s ID from all containers bucket in DB: %w", string(id), err) + return fmt.Errorf("deleting container %s ID from all containers bucket in DB: %w", string(id), err) } return nil @@ -3376,10 +3376,10 @@ func (s *BoltState) RemovePodContainers(pod *Pod) error { // Delete and recreate the bucket to empty it if err := podDB.DeleteBucket(containersBkt); err != nil { - return fmt.Errorf("error removing pod %s containers bucket: %w", pod.ID(), err) + return fmt.Errorf("removing pod %s containers bucket: %w", pod.ID(), err) } if _, err := podDB.CreateBucket(containersBkt); err != nil { - return fmt.Errorf("error recreating pod %s containers bucket: %w", pod.ID(), err) + return fmt.Errorf("recreating pod %s containers bucket: %w", pod.ID(), err) } return nil @@ -3496,7 +3496,7 @@ func (s *BoltState) UpdatePod(pod *Pod) error { } if err := json.Unmarshal(podStateBytes, newState); err != nil { - return fmt.Errorf("error unmarshalling pod %s state JSON: %w", pod.ID(), err) + return fmt.Errorf("unmarshalling pod %s state JSON: %w", pod.ID(), err) } return nil @@ -3526,7 +3526,7 @@ func (s *BoltState) SavePod(pod *Pod) error { stateJSON, err := json.Marshal(pod.state) if err != nil { - return fmt.Errorf("error marshalling pod %s state to JSON: %w", pod.ID(), err) + return fmt.Errorf("marshalling pod %s state to JSON: %w", pod.ID(), err) } db, err := s.getDBCon() @@ -3551,7 +3551,7 @@ func (s *BoltState) SavePod(pod *Pod) error { // Set the pod state JSON if err := podDB.Put(stateKey, stateJSON); err != nil { - return fmt.Errorf("error updating pod %s state in database: %w", pod.ID(), err) + return fmt.Errorf("updating pod %s state in database: %w", pod.ID(), err) } return nil diff --git a/libpod/boltdb_state_freebsd.go b/libpod/boltdb_state_freebsd.go new file mode 100644 index 000000000..d0a2d4f28 --- /dev/null +++ b/libpod/boltdb_state_freebsd.go @@ -0,0 +1,25 @@ +//go:build freebsd +// +build freebsd + +package libpod + +// replaceNetNS handle network namespace transitions after updating a +// container's state. +func replaceNetNS(netNSPath string, ctr *Container, newState *ContainerState) error { + if netNSPath != "" { + // On FreeBSD, we just record the network jail's name in our state. + newState.NetNS = &jailNetNS{Name: netNSPath} + } else { + newState.NetNS = nil + } + return nil +} + +// getNetNSPath retrieves the netns path to be stored in the database +func getNetNSPath(ctr *Container) string { + if ctr.state.NetNS != nil { + return ctr.state.NetNS.Name + } else { + return "" + } +} diff --git a/libpod/boltdb_state_internal.go b/libpod/boltdb_state_internal.go index f28fadfa9..87f1fa4eb 100644 --- a/libpod/boltdb_state_internal.go +++ b/libpod/boltdb_state_internal.go @@ -195,7 +195,7 @@ func checkRuntimeConfig(db *bolt.DB, rt *Runtime) error { } if err := configBkt.Put(missing.key, dbValue); err != nil { - return fmt.Errorf("error updating %s in DB runtime config: %w", missing.name, err) + return fmt.Errorf("updating %s in DB runtime config: %w", missing.name, err) } } @@ -254,7 +254,7 @@ func (s *BoltState) getDBCon() (*bolt.DB, error) { db, err := bolt.Open(s.dbPath, 0600, nil) if err != nil { - return nil, fmt.Errorf("error opening database %s: %w", s.dbPath, err) + return nil, fmt.Errorf("opening database %s: %w", s.dbPath, err) } return db, nil @@ -403,7 +403,7 @@ func (s *BoltState) getContainerConfigFromDB(id []byte, config *ContainerConfig, } if err := json.Unmarshal(configBytes, config); err != nil { - return fmt.Errorf("error unmarshalling container %s config: %w", string(id), err) + return fmt.Errorf("unmarshalling container %s config: %w", string(id), err) } // convert ports to the new format if needed @@ -426,7 +426,7 @@ func (s *BoltState) getContainerFromDB(id []byte, ctr *Container, ctrsBkt *bolt. // Get the lock lock, err := s.runtime.lockManager.RetrieveLock(ctr.config.LockID) if err != nil { - return fmt.Errorf("error retrieving lock for container %s: %w", string(id), err) + return fmt.Errorf("retrieving lock for container %s: %w", string(id), err) } ctr.lock = lock @@ -489,13 +489,13 @@ func (s *BoltState) getPodFromDB(id []byte, pod *Pod, podBkt *bolt.Bucket) error } if err := json.Unmarshal(podConfigBytes, pod.config); err != nil { - return fmt.Errorf("error unmarshalling pod %s config from DB: %w", string(id), err) + return fmt.Errorf("unmarshalling pod %s config from DB: %w", string(id), err) } // Get the lock lock, err := s.runtime.lockManager.RetrieveLock(pod.config.LockID) if err != nil { - return fmt.Errorf("error retrieving lock for pod %s: %w", string(id), err) + return fmt.Errorf("retrieving lock for pod %s: %w", string(id), err) } pod.lock = lock @@ -517,14 +517,14 @@ func (s *BoltState) getVolumeFromDB(name []byte, volume *Volume, volBkt *bolt.Bu } if err := json.Unmarshal(volConfigBytes, volume.config); err != nil { - return fmt.Errorf("error unmarshalling volume %s config from DB: %w", string(name), err) + return fmt.Errorf("unmarshalling volume %s config from DB: %w", string(name), err) } // Volume state is allowed to be nil for legacy compatibility volStateBytes := volDB.Get(stateKey) if volStateBytes != nil { if err := json.Unmarshal(volStateBytes, volume.state); err != nil { - return fmt.Errorf("error unmarshalling volume %s state from DB: %w", string(name), err) + return fmt.Errorf("unmarshalling volume %s state from DB: %w", string(name), err) } } @@ -546,7 +546,7 @@ func (s *BoltState) getVolumeFromDB(name []byte, volume *Volume, volBkt *bolt.Bu // Get the lock lock, err := s.runtime.lockManager.RetrieveLock(volume.config.LockID) if err != nil { - return fmt.Errorf("error retrieving lock for volume %q: %w", string(name), err) + return fmt.Errorf("retrieving lock for volume %q: %w", string(name), err) } volume.lock = lock @@ -572,11 +572,11 @@ func (s *BoltState) addContainer(ctr *Container, pod *Pod) error { // JSON container structs to insert into DB configJSON, err := json.Marshal(ctr.config) if err != nil { - return fmt.Errorf("error marshalling container %s config to JSON: %w", ctr.ID(), err) + return fmt.Errorf("marshalling container %s config to JSON: %w", ctr.ID(), err) } stateJSON, err := json.Marshal(ctr.state) if err != nil { - return fmt.Errorf("error marshalling container %s state to JSON: %w", ctr.ID(), err) + return fmt.Errorf("marshalling container %s state to JSON: %w", ctr.ID(), err) } netNSPath := getNetNSPath(ctr) dependsCtrs := ctr.Dependencies() @@ -603,7 +603,7 @@ func (s *BoltState) addContainer(ctr *Container, pod *Pod) error { opts.Aliases = append(opts.Aliases, ctr.config.ID[:12]) optBytes, err := json.Marshal(opts) if err != nil { - return fmt.Errorf("error marshalling network options JSON for container %s: %w", ctr.ID(), err) + return fmt.Errorf("marshalling network options JSON for container %s: %w", ctr.ID(), err) } networks[net] = optBytes } @@ -694,60 +694,60 @@ func (s *BoltState) addContainer(ctr *Container, pod *Pod) error { // No overlapping containers // Add the new container to the DB if err := idsBucket.Put(ctrID, ctrName); err != nil { - return fmt.Errorf("error adding container %s ID to DB: %w", ctr.ID(), err) + return fmt.Errorf("adding container %s ID to DB: %w", ctr.ID(), err) } if err := namesBucket.Put(ctrName, ctrID); err != nil { - return fmt.Errorf("error adding container %s name (%s) to DB: %w", ctr.ID(), ctr.Name(), err) + return fmt.Errorf("adding container %s name (%s) to DB: %w", ctr.ID(), ctr.Name(), err) } if ctrNamespace != nil { if err := nsBucket.Put(ctrID, ctrNamespace); err != nil { - return fmt.Errorf("error adding container %s namespace (%q) to DB: %w", ctr.ID(), ctr.Namespace(), err) + return fmt.Errorf("adding container %s namespace (%q) to DB: %w", ctr.ID(), ctr.Namespace(), err) } } if err := allCtrsBucket.Put(ctrID, ctrName); err != nil { - return fmt.Errorf("error adding container %s to all containers bucket in DB: %w", ctr.ID(), err) + return fmt.Errorf("adding container %s to all containers bucket in DB: %w", ctr.ID(), err) } newCtrBkt, err := ctrBucket.CreateBucket(ctrID) if err != nil { - return fmt.Errorf("error adding container %s bucket to DB: %w", ctr.ID(), err) + return fmt.Errorf("adding container %s bucket to DB: %w", ctr.ID(), err) } if err := newCtrBkt.Put(configKey, configJSON); err != nil { - return fmt.Errorf("error adding container %s config to DB: %w", ctr.ID(), err) + return fmt.Errorf("adding container %s config to DB: %w", ctr.ID(), err) } if err := newCtrBkt.Put(stateKey, stateJSON); err != nil { - return fmt.Errorf("error adding container %s state to DB: %w", ctr.ID(), err) + return fmt.Errorf("adding container %s state to DB: %w", ctr.ID(), err) } if ctrNamespace != nil { if err := newCtrBkt.Put(namespaceKey, ctrNamespace); err != nil { - return fmt.Errorf("error adding container %s namespace to DB: %w", ctr.ID(), err) + return fmt.Errorf("adding container %s namespace to DB: %w", ctr.ID(), err) } } if pod != nil { if err := newCtrBkt.Put(podIDKey, []byte(pod.ID())); err != nil { - return fmt.Errorf("error adding container %s pod to DB: %w", ctr.ID(), err) + return fmt.Errorf("adding container %s pod to DB: %w", ctr.ID(), err) } } if netNSPath != "" { if err := newCtrBkt.Put(netNSKey, []byte(netNSPath)); err != nil { - return fmt.Errorf("error adding container %s netns path to DB: %w", ctr.ID(), err) + return fmt.Errorf("adding container %s netns path to DB: %w", ctr.ID(), err) } } if len(networks) > 0 { ctrNetworksBkt, err := newCtrBkt.CreateBucket(networksBkt) if err != nil { - return fmt.Errorf("error creating networks bucket for container %s: %w", ctr.ID(), err) + return fmt.Errorf("creating networks bucket for container %s: %w", ctr.ID(), err) } for network, opts := range networks { if err := ctrNetworksBkt.Put([]byte(network), opts); err != nil { - return fmt.Errorf("error adding network %q to networks bucket for container %s: %w", network, ctr.ID(), err) + return fmt.Errorf("adding network %q to networks bucket for container %s: %w", network, ctr.ID(), err) } } } if _, err := newCtrBkt.CreateBucket(dependenciesBkt); err != nil { - return fmt.Errorf("error creating dependencies bucket for container %s: %w", ctr.ID(), err) + return fmt.Errorf("creating dependencies bucket for container %s: %w", ctr.ID(), err) } // Add dependencies for the container @@ -784,14 +784,14 @@ func (s *BoltState) addContainer(ctr *Container, pod *Pod) error { return fmt.Errorf("container %s does not have a dependencies bucket: %w", dependsCtr, define.ErrInternal) } if err := depCtrDependsBkt.Put(ctrID, ctrName); err != nil { - return fmt.Errorf("error adding ctr %s as dependency of container %s: %w", ctr.ID(), dependsCtr, err) + return fmt.Errorf("adding ctr %s as dependency of container %s: %w", ctr.ID(), dependsCtr, err) } } // Add ctr to pod if pod != nil && podCtrs != nil { if err := podCtrs.Put(ctrID, ctrName); err != nil { - return fmt.Errorf("error adding container %s to pod %s: %w", ctr.ID(), pod.ID(), err) + return fmt.Errorf("adding container %s to pod %s: %w", ctr.ID(), pod.ID(), err) } } @@ -804,11 +804,11 @@ func (s *BoltState) addContainer(ctr *Container, pod *Pod) error { ctrDepsBkt, err := volDB.CreateBucketIfNotExists(volDependenciesBkt) if err != nil { - return fmt.Errorf("error creating volume %s dependencies bucket to add container %s: %w", vol.Name, ctr.ID(), err) + return fmt.Errorf("creating volume %s dependencies bucket to add container %s: %w", vol.Name, ctr.ID(), err) } if depExists := ctrDepsBkt.Get(ctrID); depExists == nil { if err := ctrDepsBkt.Put(ctrID, ctrID); err != nil { - return fmt.Errorf("error adding container %s to volume %s dependencies: %w", ctr.ID(), vol.Name, err) + return fmt.Errorf("adding container %s to volume %s dependencies: %w", ctr.ID(), vol.Name, err) } } } @@ -902,7 +902,7 @@ func (s *BoltState) removeContainer(ctr *Container, pod *Pod, tx *bolt.Tx) error return fmt.Errorf("container %s is not in pod %s: %w", ctr.ID(), pod.ID(), define.ErrNoSuchCtr) } if err := podCtrs.Delete(ctrID); err != nil { - return fmt.Errorf("error removing container %s from pod %s: %w", ctr.ID(), pod.ID(), err) + return fmt.Errorf("removing container %s from pod %s: %w", ctr.ID(), pod.ID(), err) } } } @@ -943,21 +943,21 @@ func (s *BoltState) removeContainer(ctr *Container, pod *Pod, tx *bolt.Tx) error } if err := ctrBucket.DeleteBucket(ctrID); err != nil { - return fmt.Errorf("error deleting container %s from DB: %w", ctr.ID(), define.ErrInternal) + return fmt.Errorf("deleting container %s from DB: %w", ctr.ID(), define.ErrInternal) } if err := idsBucket.Delete(ctrID); err != nil { - return fmt.Errorf("error deleting container %s ID in DB: %w", ctr.ID(), err) + return fmt.Errorf("deleting container %s ID in DB: %w", ctr.ID(), err) } if err := namesBucket.Delete(ctrName); err != nil { - return fmt.Errorf("error deleting container %s name in DB: %w", ctr.ID(), err) + return fmt.Errorf("deleting container %s name in DB: %w", ctr.ID(), err) } if err := nsBucket.Delete(ctrID); err != nil { - return fmt.Errorf("error deleting container %s namespace in DB: %w", ctr.ID(), err) + return fmt.Errorf("deleting container %s namespace in DB: %w", ctr.ID(), err) } if err := allCtrsBucket.Delete(ctrID); err != nil { - return fmt.Errorf("error deleting container %s from all containers bucket in DB: %w", ctr.ID(), err) + return fmt.Errorf("deleting container %s from all containers bucket in DB: %w", ctr.ID(), err) } depCtrs := ctr.Dependencies() @@ -986,7 +986,7 @@ func (s *BoltState) removeContainer(ctr *Container, pod *Pod, tx *bolt.Tx) error } if err := depCtrDependsBkt.Delete(ctrID); err != nil { - return fmt.Errorf("error removing container %s as a dependency of container %s: %w", ctr.ID(), depCtr, err) + return fmt.Errorf("removing container %s as a dependency of container %s: %w", ctr.ID(), depCtr, err) } } @@ -1005,7 +1005,7 @@ func (s *BoltState) removeContainer(ctr *Container, pod *Pod, tx *bolt.Tx) error } if depExists := ctrDepsBkt.Get(ctrID); depExists == nil { if err := ctrDepsBkt.Delete(ctrID); err != nil { - return fmt.Errorf("error deleting container %s dependency on volume %s: %w", ctr.ID(), vol.Name, err) + return fmt.Errorf("deleting container %s dependency on volume %s: %w", ctr.ID(), vol.Name, err) } } } diff --git a/libpod/boltdb_state_linux.go b/libpod/boltdb_state_linux.go index 813afd8bf..3d9f5b6a2 100644 --- a/libpod/boltdb_state_linux.go +++ b/libpod/boltdb_state_linux.go @@ -30,7 +30,7 @@ func replaceNetNS(netNSPath string, ctr *Container, newState *ContainerState) er newState.NetNS = ns } else { if ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) { - return fmt.Errorf("error joining network namespace of container %s: %w", ctr.ID(), err) + return fmt.Errorf("joining network namespace of container %s: %w", ctr.ID(), err) } logrus.Errorf("Joining network namespace for container %s: %v", ctr.ID(), err) diff --git a/libpod/boltdb_state_unsupported.go b/libpod/boltdb_state_unsupported.go new file mode 100644 index 000000000..9db1e3c4b --- /dev/null +++ b/libpod/boltdb_state_unsupported.go @@ -0,0 +1,19 @@ +//go:build !linux && !freebsd +// +build !linux,!freebsd + +package libpod + +import ( + "errors" +) + +// replaceNetNS handle network namespace transitions after updating a +// container's state. +func replaceNetNS(netNSPath string, ctr *Container, newState *ContainerState) error { + return errors.New("replaceNetNS not supported on this platform") +} + +// getNetNSPath retrieves the netns path to be stored in the database +func getNetNSPath(ctr *Container) string { + return "" +} diff --git a/libpod/common/common.go b/libpod/common/common.go deleted file mode 100644 index 34cabeadc..000000000 --- a/libpod/common/common.go +++ /dev/null @@ -1,16 +0,0 @@ -package common - -// IsTrue determines whether the given string equals "true". -func IsTrue(str string) bool { - return str == "true" -} - -// IsFalse determines whether the given string equals "false". -func IsFalse(str string) bool { - return str == "false" -} - -// IsValidBool determines whether the given string equals "true" or "false". -func IsValidBool(str string) bool { - return IsTrue(str) || IsFalse(str) -} diff --git a/libpod/common/signing_options.go b/libpod/common/signing_options.go deleted file mode 100644 index b7e14be82..000000000 --- a/libpod/common/signing_options.go +++ /dev/null @@ -1,10 +0,0 @@ -package common - -// SigningOptions encapsulates settings that control whether or not we strip or -// add signatures to images when writing them. -type SigningOptions struct { - // RemoveSignatures directs us to remove any signatures which are already present. - RemoveSignatures bool - // SignBy is a key identifier of some kind, indicating that a signature should be generated using the specified private key and stored with the image. - SignBy string -} diff --git a/libpod/container.go b/libpod/container.go index 4e2d93860..cfffd8ea1 100644 --- a/libpod/container.go +++ b/libpod/container.go @@ -124,10 +124,6 @@ type Container struct { // This is true if a container is restored from a checkpoint. restoreFromCheckpoint bool - // Used to query the NOTIFY_SOCKET once along with setting up - // mounts etc. - notifySocket string - slirp4netnsSubnet *net.IPNet } @@ -241,6 +237,9 @@ type ContainerNamedVolume struct { Dest string `json:"dest"` // Options are fstab style mount options Options []string `json:"options,omitempty"` + // IsAnonymous sets the named volume as anonymous even if it has a name + // This is used for emptyDir volumes from a kube yaml + IsAnonymous bool `json:"setAnonymous,omitempty"` } // ContainerOverlayVolume is a overlay volume that will be mounted into the @@ -354,14 +353,16 @@ func (c *Container) specFromState() (*spec.Spec, error) { returnSpec = new(spec.Spec) content, err := ioutil.ReadAll(f) if err != nil { - return nil, fmt.Errorf("error reading container config: %w", err) + return nil, fmt.Errorf("reading container config: %w", err) } if err := json.Unmarshal(content, &returnSpec); err != nil { - return nil, fmt.Errorf("error unmarshalling container config: %w", err) + // Malformed spec, just use c.config.Spec instead + logrus.Warnf("Error unmarshalling container %s config: %v", c.ID(), err) + return c.config.Spec, nil } } else if !os.IsNotExist(err) { // ignore when the file does not exist - return nil, fmt.Errorf("error opening container config: %w", err) + return nil, fmt.Errorf("opening container config: %w", err) } return returnSpec, nil @@ -704,7 +705,7 @@ func (c *Container) Mounted() (bool, string, error) { c.lock.Lock() defer c.lock.Unlock() if err := c.syncContainer(); err != nil { - return false, "", fmt.Errorf("error updating container %s state: %w", c.ID(), err) + return false, "", fmt.Errorf("updating container %s state: %w", c.ID(), err) } } // We cannot directly return c.state.Mountpoint as it is not guaranteed @@ -734,7 +735,7 @@ func (c *Container) StartedTime() (time.Time, error) { c.lock.Lock() defer c.lock.Unlock() if err := c.syncContainer(); err != nil { - return time.Time{}, fmt.Errorf("error updating container %s state: %w", c.ID(), err) + return time.Time{}, fmt.Errorf("updating container %s state: %w", c.ID(), err) } } return c.state.StartedTime, nil @@ -746,7 +747,7 @@ func (c *Container) FinishedTime() (time.Time, error) { c.lock.Lock() defer c.lock.Unlock() if err := c.syncContainer(); err != nil { - return time.Time{}, fmt.Errorf("error updating container %s state: %w", c.ID(), err) + return time.Time{}, fmt.Errorf("updating container %s state: %w", c.ID(), err) } } return c.state.FinishedTime, nil @@ -761,7 +762,7 @@ func (c *Container) ExitCode() (int32, bool, error) { c.lock.Lock() defer c.lock.Unlock() if err := c.syncContainer(); err != nil { - return 0, false, fmt.Errorf("error updating container %s state: %w", c.ID(), err) + return 0, false, fmt.Errorf("updating container %s state: %w", c.ID(), err) } } return c.state.ExitCode, c.state.Exited, nil @@ -773,7 +774,7 @@ func (c *Container) OOMKilled() (bool, error) { c.lock.Lock() defer c.lock.Unlock() if err := c.syncContainer(); err != nil { - return false, fmt.Errorf("error updating container %s state: %w", c.ID(), err) + return false, fmt.Errorf("updating container %s state: %w", c.ID(), err) } } return c.state.OOMKilled, nil @@ -860,7 +861,7 @@ func (c *Container) ExecSession(id string) (*ExecSession, error) { returnSession := new(ExecSession) if err := JSONDeepCopy(session, returnSession); err != nil { - return nil, fmt.Errorf("error copying contents of container %s exec session %s: %w", c.ID(), session.ID(), err) + return nil, fmt.Errorf("copying contents of container %s exec session %s: %w", c.ID(), session.ID(), err) } return returnSession, nil @@ -920,7 +921,7 @@ func (c *Container) NamespacePath(linuxNS LinuxNS) (string, error) { //nolint:in c.lock.Lock() defer c.lock.Unlock() if err := c.syncContainer(); err != nil { - return "", fmt.Errorf("error updating container %s state: %w", c.ID(), err) + return "", fmt.Errorf("updating container %s state: %w", c.ID(), err) } } @@ -958,7 +959,7 @@ func (c *Container) CgroupPath() (string, error) { c.lock.Lock() defer c.lock.Unlock() if err := c.syncContainer(); err != nil { - return "", fmt.Errorf("error updating container %s state: %w", c.ID(), err) + return "", fmt.Errorf("updating container %s state: %w", c.ID(), err) } } return c.cGroupPath() @@ -1058,7 +1059,7 @@ func (c *Container) RootFsSize() (int64, error) { c.lock.Lock() defer c.lock.Unlock() if err := c.syncContainer(); err != nil { - return -1, fmt.Errorf("error updating container %s state: %w", c.ID(), err) + return -1, fmt.Errorf("updating container %s state: %w", c.ID(), err) } } return c.rootFsSize() @@ -1070,7 +1071,7 @@ func (c *Container) RWSize() (int64, error) { c.lock.Lock() defer c.lock.Unlock() if err := c.syncContainer(); err != nil { - return -1, fmt.Errorf("error updating container %s state: %w", c.ID(), err) + return -1, fmt.Errorf("updating container %s state: %w", c.ID(), err) } } return c.rwSize() @@ -1134,20 +1135,6 @@ func (c *Container) NetworkDisabled() (bool, error) { return networkDisabled(c) } -func networkDisabled(c *Container) (bool, error) { - if c.config.CreateNetNS { - return false, nil - } - if !c.config.PostConfigureNetNS { - for _, ns := range c.config.Spec.Linux.Namespaces { - if ns.Type == spec.NetworkNamespace { - return ns.Path == "", nil - } - } - } - return false, nil -} - func (c *Container) HostNetwork() bool { if c.config.CreateNetNS || c.config.NetNsCtr != "" { return false @@ -1172,7 +1159,7 @@ func (c *Container) ContainerState() (*ContainerState, error) { } returnConfig := new(ContainerState) if err := JSONDeepCopy(c.state, returnConfig); err != nil { - return nil, fmt.Errorf("error copying container %s state: %w", c.ID(), err) + return nil, fmt.Errorf("copying container %s state: %w", c.ID(), err) } return c.state, nil } diff --git a/libpod/container_api.go b/libpod/container_api.go index 2ff4bfe08..dd47b4d12 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -16,6 +16,7 @@ import ( "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/signal" "github.com/containers/storage/pkg/archive" + spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" ) @@ -98,6 +99,15 @@ func (c *Container) Start(ctx context.Context, recursive bool) error { return c.start() } +// Update updates the given container. +// only the cgroup config can be updated and therefore only a linux resource spec is passed. +func (c *Container) Update(res *spec.LinuxResources) error { + if err := c.syncContainer(); err != nil { + return err + } + return c.update(res) +} + // StartAndAttach starts a container and attaches to it. // This acts as a combination of the Start and Attach APIs, ensuring proper // ordering of the two such that no output from the container is lost (e.g. the @@ -674,7 +684,7 @@ func (c *Container) Cleanup(ctx context.Context) error { // When the container has already been removed, the OCI runtime directory remain. if errors.Is(err, define.ErrNoSuchCtr) || errors.Is(err, define.ErrCtrRemoved) { if err := c.cleanupRuntime(ctx); err != nil { - return fmt.Errorf("error cleaning up container %s from OCI runtime: %w", c.ID(), err) + return fmt.Errorf("cleaning up container %s from OCI runtime: %w", c.ID(), err) } return nil } diff --git a/libpod/container_commit.go b/libpod/container_commit.go index c93c9c7bb..f447816e3 100644 --- a/libpod/container_commit.go +++ b/libpod/container_commit.go @@ -48,7 +48,7 @@ func (c *Container) Commit(ctx context.Context, destImage string, options Contai if c.state.State == define.ContainerStateRunning && options.Pause { if err := c.pause(); err != nil { - return nil, fmt.Errorf("error pausing container %q to commit: %w", c.ID(), err) + return nil, fmt.Errorf("pausing container %q to commit: %w", c.ID(), err) } defer func() { if err := c.unpause(); err != nil { @@ -202,7 +202,7 @@ func (c *Container) Commit(ctx context.Context, destImage string, options Contai imageRef, err := is.Transport.ParseStoreReference(c.runtime.store, resolvedImageName) if err != nil { - return nil, fmt.Errorf("error parsing target image name %q: %w", destImage, err) + return nil, fmt.Errorf("parsing target image name %q: %w", destImage, err) } commitRef = imageRef } diff --git a/libpod/container_config.go b/libpod/container_config.go index 544c45a8c..f3585d22c 100644 --- a/libpod/container_config.go +++ b/libpod/container_config.go @@ -7,6 +7,7 @@ import ( "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/secrets" "github.com/containers/image/v5/manifest" + "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/pkg/namespaces" "github.com/containers/podman/v4/pkg/specgen" "github.com/containers/storage" @@ -386,10 +387,14 @@ type ContainerMiscConfig struct { IsService bool `json:"isService"` // SdNotifyMode tells libpod what to do with a NOTIFY_SOCKET if passed SdNotifyMode string `json:"sdnotifyMode,omitempty"` + // SdNotifySocket stores NOTIFY_SOCKET in use by the container + SdNotifySocket string `json:"sdnotifySocket,omitempty"` // Systemd tells libpod to set up the container in systemd mode, a value of nil denotes false Systemd *bool `json:"systemd,omitempty"` // HealthCheckConfig has the health check command and related timings HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"` + // HealthCheckOnFailureAction defines an action to take once the container turns unhealthy. + HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"healthcheck_on_failure_action"` // PreserveFDs is a number of additional file descriptors (in addition // to 0, 1, 2) that will be passed to the executed process. The total FDs // passed will be 3 + PreserveFDs. diff --git a/libpod/container_copy_unsupported.go b/libpod/container_copy_unsupported.go new file mode 100644 index 000000000..62937279a --- /dev/null +++ b/libpod/container_copy_unsupported.go @@ -0,0 +1,17 @@ +//go:build !linux +// +build !linux + +package libpod + +import ( + "errors" + "io" +) + +func (c *Container) copyFromArchive(path string, chown, noOverwriteDirNonDir bool, rename map[string]string, reader io.Reader) (func() error, error) { + return nil, errors.New("not implemented (*Container) copyFromArchive") +} + +func (c *Container) copyToArchive(path string, writer io.Writer) (func() error, error) { + return nil, errors.New("not implemented (*Container) copyToArchive") +} diff --git a/libpod/container_exec.go b/libpod/container_exec.go index d3c80e896..3a2cba52f 100644 --- a/libpod/container_exec.go +++ b/libpod/container_exec.go @@ -182,7 +182,7 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) { } // Generate an ID for our new exec session - sessionID := stringid.GenerateNonCryptoID() + sessionID := stringid.GenerateRandomID() found := true // This really ought to be a do-while, but Go doesn't have those... for found { @@ -194,7 +194,7 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) { } } if found { - sessionID = stringid.GenerateNonCryptoID() + sessionID = stringid.GenerateRandomID() } } @@ -205,7 +205,7 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) { session.State = define.ExecStateCreated session.Config = new(ExecConfig) if err := JSONDeepCopy(config, session.Config); err != nil { - return "", fmt.Errorf("error copying exec configuration into exec session: %w", err) + return "", fmt.Errorf("copying exec configuration into exec session: %w", err) } if len(session.Config.ExitCommand) > 0 { @@ -372,7 +372,7 @@ func (c *Container) execStartAndAttach(sessionID string, streams *define.AttachS if lastErr != nil { logrus.Errorf("Container %s exec session %s error: %v", c.ID(), session.ID(), lastErr) } - return fmt.Errorf("error syncing container %s state to update exec session %s: %w", c.ID(), sessionID, err) + return fmt.Errorf("syncing container %s state to update exec session %s: %w", c.ID(), sessionID, err) } // Now handle the error from readExecExitCode above. @@ -809,7 +809,7 @@ func (c *Container) exec(config *ExecConfig, streams *define.AttachStreams, resi // streaming. diedEvent, err := c.runtime.GetExecDiedEvent(context.Background(), c.ID(), sessionID) if err != nil { - return -1, fmt.Errorf("error retrieving exec session %s exit code: %w", sessionID, err) + return -1, fmt.Errorf("retrieving exec session %s exit code: %w", sessionID, err) } return diedEvent.ContainerExitCode, nil } @@ -911,7 +911,7 @@ func (c *Container) createExecBundle(sessionID string) (retErr error) { if err := os.MkdirAll(c.execExitFileDir(sessionID), execDirPermission); err != nil { // The directory is allowed to exist if !os.IsExist(err) { - return fmt.Errorf("error creating OCI runtime exit file path %s: %w", c.execExitFileDir(sessionID), err) + return fmt.Errorf("creating OCI runtime exit file path %s: %w", c.execExitFileDir(sessionID), err) } } return nil @@ -1121,7 +1121,7 @@ func writeExecExitCode(c *Container, sessionID string, exitCode int) error { // need to. Exit without error. return nil } - return fmt.Errorf("error syncing container %s state to remove exec session %s: %w", c.ID(), sessionID, err) + return fmt.Errorf("syncing container %s state to remove exec session %s: %w", c.ID(), sessionID, err) } return justWriteExecExitCode(c, sessionID, exitCode) diff --git a/libpod/container_freebsd.go b/libpod/container_freebsd.go new file mode 100644 index 000000000..87fb494dd --- /dev/null +++ b/libpod/container_freebsd.go @@ -0,0 +1,31 @@ +//go:build freebsd +// +build freebsd + +package libpod + +type containerPlatformState struct { + // NetNS is the name of the container's network VNET + // jail. Will only be set if config.CreateNetNS is true, or + // the container was told to join another container's network + // namespace. + NetNS *jailNetNS `json:"-"` +} + +type jailNetNS struct { + Name string `json:"-"` +} + +func (ns *jailNetNS) Path() string { + // The jail name approximately corresponds to the Linux netns path + return ns.Name +} + +func networkDisabled(c *Container) (bool, error) { + if c.config.CreateNetNS { + return false, nil + } + if !c.config.PostConfigureNetNS { + return c.state.NetNS != nil, nil + } + return false, nil +} diff --git a/libpod/container_graph.go b/libpod/container_graph.go index 67b1abc34..d43579e4a 100644 --- a/libpod/container_graph.go +++ b/libpod/container_graph.go @@ -160,7 +160,7 @@ func detectCycles(graph *ContainerGraph) (bool, error) { // Popped item is no longer on the stack, mark as such topInfo, ok := nodes[topOfStack.id] if !ok { - return false, fmt.Errorf("error finding node info for %s: %w", topOfStack.id, define.ErrInternal) + return false, fmt.Errorf("finding node info for %s: %w", topOfStack.id, define.ErrInternal) } topInfo.onStack = false @@ -281,3 +281,94 @@ func startNode(ctx context.Context, node *containerNode, setError bool, ctrError startNode(ctx, successor, ctrErrored, ctrErrors, ctrsVisited, restart) } } + +// Visit a node on the container graph and remove it, or set an error if it +// failed to remove. Only intended for use in pod removal; do *not* use when +// removing individual containers. +// All containers are assumed to be *UNLOCKED* on running this function. +// Container locks will be acquired as necessary. +// Pod and infraID are optional. If a pod is given it must be *LOCKED*. +func removeNode(ctx context.Context, node *containerNode, pod *Pod, force bool, timeout *uint, setError bool, ctrErrors map[string]error, ctrsVisited map[string]bool, ctrNamedVolumes map[string]*ContainerNamedVolume) { + // If we already visited this node, we're done. + if ctrsVisited[node.id] { + return + } + + // Someone who depends on us failed. + // Mark us as failed and recurse. + if setError { + ctrsVisited[node.id] = true + ctrErrors[node.id] = fmt.Errorf("a container that depends on container %s could not be removed: %w", node.id, define.ErrCtrStateInvalid) + + // Hit anyone who depends on us, set errors there as well. + for _, successor := range node.dependsOn { + removeNode(ctx, successor, pod, force, timeout, true, ctrErrors, ctrsVisited, ctrNamedVolumes) + } + } + + // Does anyone still depend on us? + // Cannot remove if true. Once all our dependencies have been removed, + // we will be removed. + for _, dep := range node.dependedOn { + // The container that depends on us hasn't been removed yet. + // OK to continue on + if ok := ctrsVisited[dep.id]; !ok { + return + } + } + + // Going to try to remove the node, mark us as visited + ctrsVisited[node.id] = true + + ctrErrored := false + + // Verify that all that depend on us are gone. + // Graph traversal should guarantee this is true, but this isn't that + // expensive, and it's better to be safe. + for _, dep := range node.dependedOn { + if _, err := node.container.runtime.GetContainer(dep.id); err == nil { + ctrErrored = true + ctrErrors[node.id] = fmt.Errorf("a container that depends on container %s still exists: %w", node.id, define.ErrDepExists) + } + } + + // Lock the container + node.container.lock.Lock() + + // Gate all subsequent bits behind a ctrErrored check - we don't want to + // proceed if a previous step failed. + if !ctrErrored { + if err := node.container.syncContainer(); err != nil { + ctrErrored = true + ctrErrors[node.id] = err + } + } + + if !ctrErrored { + for _, vol := range node.container.config.NamedVolumes { + ctrNamedVolumes[vol.Name] = vol + } + + if pod != nil && pod.state.InfraContainerID == node.id { + pod.state.InfraContainerID = "" + if err := pod.save(); err != nil { + ctrErrored = true + ctrErrors[node.id] = fmt.Errorf("error removing infra container %s from pod %s: %w", node.id, pod.ID(), err) + } + } + } + + if !ctrErrored { + if err := node.container.runtime.removeContainer(ctx, node.container, force, false, true, false, timeout); err != nil { + ctrErrored = true + ctrErrors[node.id] = err + } + } + + node.container.lock.Unlock() + + // Recurse to anyone who we depend on and remove them + for _, successor := range node.dependsOn { + removeNode(ctx, successor, pod, force, timeout, ctrErrored, ctrErrors, ctrsVisited, ctrNamedVolumes) + } +} diff --git a/libpod/container_inspect.go b/libpod/container_inspect.go index fa2130a28..e4089efa6 100644 --- a/libpod/container_inspect.go +++ b/libpod/container_inspect.go @@ -3,20 +3,15 @@ package libpod import ( "errors" "fmt" - "sort" "strings" - "github.com/containers/common/pkg/config" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/libpod/driver" "github.com/containers/podman/v4/pkg/util" "github.com/containers/storage/types" units "github.com/docker/go-units" spec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/runtime-tools/generate" - "github.com/opencontainers/runtime-tools/validate" "github.com/sirupsen/logrus" - "github.com/syndtr/gocapability/capability" ) // inspectLocked inspects a container for low-level information. @@ -24,15 +19,15 @@ import ( func (c *Container) inspectLocked(size bool) (*define.InspectContainerData, error) { storeCtr, err := c.runtime.store.Container(c.ID()) if err != nil { - return nil, fmt.Errorf("error getting container from store %q: %w", c.ID(), err) + return nil, fmt.Errorf("getting container from store %q: %w", c.ID(), err) } layer, err := c.runtime.store.Layer(storeCtr.LayerID) if err != nil { - return nil, fmt.Errorf("error reading information about layer %q: %w", storeCtr.LayerID, err) + return nil, fmt.Errorf("reading information about layer %q: %w", storeCtr.LayerID, err) } driverData, err := driver.GetDriverData(c.runtime.store, layer.ID) if err != nil { - return nil, fmt.Errorf("error getting graph driver info %q: %w", c.ID(), err) + return nil, fmt.Errorf("getting graph driver info %q: %w", c.ID(), err) } return c.getContainerInspectData(size, driverData) } @@ -163,8 +158,6 @@ func (c *Container) getContainerInspectData(size bool, driverData *define.Driver Driver: driverData.Name, MountLabel: config.MountLabel, ProcessLabel: config.ProcessLabel, - EffectiveCaps: ctrSpec.Process.Capabilities.Effective, - BoundingCaps: ctrSpec.Process.Capabilities.Bounding, AppArmorProfile: ctrSpec.Process.ApparmorProfile, ExecIDs: execIDs, GraphDriver: driverData, @@ -173,6 +166,10 @@ func (c *Container) getContainerInspectData(size bool, driverData *define.Driver IsInfra: c.IsInfra(), IsService: c.IsService(), } + if ctrSpec.Process.Capabilities != nil { + data.EffectiveCaps = ctrSpec.Process.Capabilities.Effective + data.BoundingCaps = ctrSpec.Process.Capabilities.Bounding + } if c.state.ConfigPath != "" { data.OCIConfigPath = c.state.ConfigPath @@ -241,7 +238,7 @@ func (c *Container) GetMounts(namedVolumes []*ContainerNamedVolume, imageVolumes // volume. volFromDB, err := c.runtime.state.Volume(volume.Name) if err != nil { - return nil, fmt.Errorf("error looking up volume %s in container %s config: %w", volume.Name, c.ID(), err) + return nil, fmt.Errorf("looking up volume %s in container %s config: %w", volume.Name, c.ID(), err) } mountStruct.Driver = volFromDB.Driver() @@ -390,6 +387,8 @@ func (c *Container) generateInspectContainerConfig(spec *spec.Spec) *define.Insp // leak. ctrConfig.Healthcheck = c.config.HealthCheckConfig + ctrConfig.HealthcheckOnFailureAction = c.config.HealthCheckOnFailureAction.String() + ctrConfig.CreateCommand = c.config.CreateCommand ctrConfig.Timezone = c.config.Timezone @@ -414,6 +413,8 @@ func (c *Container) generateInspectContainerConfig(spec *spec.Spec) *define.Insp ctrConfig.Passwd = c.config.Passwd ctrConfig.ChrootDirs = append(ctrConfig.ChrootDirs, c.config.ChrootDirs...) + ctrConfig.SdNotifyMode = c.config.SdNotifyMode + ctrConfig.SdNotifySocket = c.config.SdNotifySocket return ctrConfig } @@ -480,11 +481,6 @@ func (c *Container) generateInspectContainerHostConfig(ctrSpec *spec.Spec, named hostConfig.ShmSize = c.config.ShmSize hostConfig.Runtime = "oci" - // This is very expensive to initialize. - // So we don't want to initialize it unless we absolutely have to - IE, - // there are things that require a major:minor to path translation. - var deviceNodes map[string]string - // Annotations if ctrSpec.Annotations != nil { hostConfig.ContainerIDFile = ctrSpec.Annotations[define.InspectAnnotationCIDFile] @@ -502,109 +498,8 @@ func (c *Container) generateInspectContainerHostConfig(ctrSpec *spec.Spec, named } } - // Resource limits - if ctrSpec.Linux != nil { - if ctrSpec.Linux.Resources != nil { - if ctrSpec.Linux.Resources.CPU != nil { - if ctrSpec.Linux.Resources.CPU.Shares != nil { - hostConfig.CpuShares = *ctrSpec.Linux.Resources.CPU.Shares - } - if ctrSpec.Linux.Resources.CPU.Period != nil { - hostConfig.CpuPeriod = *ctrSpec.Linux.Resources.CPU.Period - } - if ctrSpec.Linux.Resources.CPU.Quota != nil { - hostConfig.CpuQuota = *ctrSpec.Linux.Resources.CPU.Quota - } - if ctrSpec.Linux.Resources.CPU.RealtimePeriod != nil { - hostConfig.CpuRealtimePeriod = *ctrSpec.Linux.Resources.CPU.RealtimePeriod - } - if ctrSpec.Linux.Resources.CPU.RealtimeRuntime != nil { - hostConfig.CpuRealtimeRuntime = *ctrSpec.Linux.Resources.CPU.RealtimeRuntime - } - hostConfig.CpusetCpus = ctrSpec.Linux.Resources.CPU.Cpus - hostConfig.CpusetMems = ctrSpec.Linux.Resources.CPU.Mems - } - if ctrSpec.Linux.Resources.Memory != nil { - if ctrSpec.Linux.Resources.Memory.Limit != nil { - hostConfig.Memory = *ctrSpec.Linux.Resources.Memory.Limit - } - if ctrSpec.Linux.Resources.Memory.Reservation != nil { - hostConfig.MemoryReservation = *ctrSpec.Linux.Resources.Memory.Reservation - } - if ctrSpec.Linux.Resources.Memory.Swap != nil { - hostConfig.MemorySwap = *ctrSpec.Linux.Resources.Memory.Swap - } - if ctrSpec.Linux.Resources.Memory.Swappiness != nil { - hostConfig.MemorySwappiness = int64(*ctrSpec.Linux.Resources.Memory.Swappiness) - } else { - // Swappiness has a default of -1 - hostConfig.MemorySwappiness = -1 - } - if ctrSpec.Linux.Resources.Memory.DisableOOMKiller != nil { - hostConfig.OomKillDisable = *ctrSpec.Linux.Resources.Memory.DisableOOMKiller - } - } - if ctrSpec.Linux.Resources.Pids != nil { - hostConfig.PidsLimit = ctrSpec.Linux.Resources.Pids.Limit - } - hostConfig.CgroupConf = ctrSpec.Linux.Resources.Unified - if ctrSpec.Linux.Resources.BlockIO != nil { - if ctrSpec.Linux.Resources.BlockIO.Weight != nil { - hostConfig.BlkioWeight = *ctrSpec.Linux.Resources.BlockIO.Weight - } - hostConfig.BlkioWeightDevice = []define.InspectBlkioWeightDevice{} - for _, dev := range ctrSpec.Linux.Resources.BlockIO.WeightDevice { - key := fmt.Sprintf("%d:%d", dev.Major, dev.Minor) - // TODO: how do we handle LeafWeight vs - // Weight? For now, ignore anything - // without Weight set. - if dev.Weight == nil { - logrus.Infof("Ignoring weight device %s as it lacks a weight", key) - continue - } - if deviceNodes == nil { - nodes, err := util.FindDeviceNodes() - if err != nil { - return nil, err - } - deviceNodes = nodes - } - path, ok := deviceNodes[key] - if !ok { - logrus.Infof("Could not locate weight device %s in system devices", key) - continue - } - weightDev := define.InspectBlkioWeightDevice{} - weightDev.Path = path - weightDev.Weight = *dev.Weight - hostConfig.BlkioWeightDevice = append(hostConfig.BlkioWeightDevice, weightDev) - } - - readBps, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleReadBpsDevice) - if err != nil { - return nil, err - } - hostConfig.BlkioDeviceReadBps = readBps - - writeBps, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleWriteBpsDevice) - if err != nil { - return nil, err - } - hostConfig.BlkioDeviceWriteBps = writeBps - - readIops, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleReadIOPSDevice) - if err != nil { - return nil, err - } - hostConfig.BlkioDeviceReadIOps = readIops - - writeIops, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleWriteIOPSDevice) - if err != nil { - return nil, err - } - hostConfig.BlkioDeviceWriteIOps = writeIops - } - } + if err := c.platformInspectContainerHostConfig(ctrSpec, hostConfig); err != nil { + return nil, err } // NanoCPUs. @@ -655,182 +550,6 @@ func (c *Container) generateInspectContainerHostConfig(ctrSpec *spec.Spec, named hostConfig.PortBindings = make(map[string][]define.InspectHostPort) } - // Cap add and cap drop. - // We need a default set of capabilities to compare against. - // The OCI generate package has one, and is commonly used, so we'll - // use it. - // Problem: there are 5 sets of capabilities. - // Use the bounding set for this computation, it's the most encompassing - // (but still not perfect). - capAdd := []string{} - capDrop := []string{} - // No point in continuing if we got a spec without a Process block... - if ctrSpec.Process != nil { - // Max an O(1) lookup table for default bounding caps. - boundingCaps := make(map[string]bool) - g, err := generate.New("linux") - if err != nil { - return nil, err - } - if !hostConfig.Privileged { - for _, cap := range g.Config.Process.Capabilities.Bounding { - boundingCaps[cap] = true - } - } else { - // If we are privileged, use all caps. - for _, cap := range capability.List() { - if g.HostSpecific && cap > validate.LastCap() { - continue - } - boundingCaps[fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))] = true - } - } - // Iterate through spec caps. - // If it's not in default bounding caps, it was added. - // If it is, delete from the default set. Whatever remains after - // we finish are the dropped caps. - for _, cap := range ctrSpec.Process.Capabilities.Bounding { - if _, ok := boundingCaps[cap]; ok { - delete(boundingCaps, cap) - } else { - capAdd = append(capAdd, cap) - } - } - for cap := range boundingCaps { - capDrop = append(capDrop, cap) - } - // Sort CapDrop so it displays in consistent order (GH #9490) - sort.Strings(capDrop) - } - hostConfig.CapAdd = capAdd - hostConfig.CapDrop = capDrop - switch { - case c.config.IPCNsCtr != "": - hostConfig.IpcMode = fmt.Sprintf("container:%s", c.config.IPCNsCtr) - case ctrSpec.Linux != nil: - // Locate the spec's IPC namespace. - // If there is none, it's ipc=host. - // If there is one and it has a path, it's "ns:". - // If no path, it's default - the empty string. - for _, ns := range ctrSpec.Linux.Namespaces { - if ns.Type == spec.IPCNamespace { - if ns.Path != "" { - hostConfig.IpcMode = fmt.Sprintf("ns:%s", ns.Path) - } else { - break - } - } - } - case c.config.NoShm: - hostConfig.IpcMode = "none" - case c.config.NoShmShare: - hostConfig.IpcMode = "private" - } - if hostConfig.IpcMode == "" { - hostConfig.IpcMode = "shareable" - } - - // Cgroup namespace mode - cgroupMode := "" - if c.config.CgroupNsCtr != "" { - cgroupMode = fmt.Sprintf("container:%s", c.config.CgroupNsCtr) - } else if ctrSpec.Linux != nil { - // Locate the spec's cgroup namespace - // If there is none, it's cgroup=host. - // If there is one and it has a path, it's "ns:". - // If there is no path, it's private. - for _, ns := range ctrSpec.Linux.Namespaces { - if ns.Type == spec.CgroupNamespace { - if ns.Path != "" { - cgroupMode = fmt.Sprintf("ns:%s", ns.Path) - } else { - cgroupMode = "private" - } - } - } - if cgroupMode == "" { - cgroupMode = "host" - } - } - hostConfig.CgroupMode = cgroupMode - - // Cgroup parent - // Need to check if it's the default, and not print if so. - defaultCgroupParent := "" - switch c.CgroupManager() { - case config.CgroupfsCgroupsManager: - defaultCgroupParent = CgroupfsDefaultCgroupParent - case config.SystemdCgroupsManager: - defaultCgroupParent = SystemdDefaultCgroupParent - } - if c.config.CgroupParent != defaultCgroupParent { - hostConfig.CgroupParent = c.config.CgroupParent - } - hostConfig.CgroupManager = c.CgroupManager() - - // PID namespace mode - pidMode := "" - if c.config.PIDNsCtr != "" { - pidMode = fmt.Sprintf("container:%s", c.config.PIDNsCtr) - } else if ctrSpec.Linux != nil { - // Locate the spec's PID namespace. - // If there is none, it's pid=host. - // If there is one and it has a path, it's "ns:". - // If there is no path, it's default - the empty string. - for _, ns := range ctrSpec.Linux.Namespaces { - if ns.Type == spec.PIDNamespace { - if ns.Path != "" { - pidMode = fmt.Sprintf("ns:%s", ns.Path) - } else { - pidMode = "private" - } - break - } - } - if pidMode == "" { - pidMode = "host" - } - } - hostConfig.PidMode = pidMode - - // UTS namespace mode - utsMode := c.NamespaceMode(spec.UTSNamespace, ctrSpec) - - hostConfig.UTSMode = utsMode - - // User namespace mode - usernsMode := "" - if c.config.UserNsCtr != "" { - usernsMode = fmt.Sprintf("container:%s", c.config.UserNsCtr) - } else if ctrSpec.Linux != nil { - // Locate the spec's user namespace. - // If there is none, it's default - the empty string. - // If there is one, it's "private" if no path, or "ns:" if - // there's a path. - - for _, ns := range ctrSpec.Linux.Namespaces { - if ns.Type == spec.UserNamespace { - if ns.Path != "" { - usernsMode = fmt.Sprintf("ns:%s", ns.Path) - } else { - usernsMode = "private" - } - } - } - } - hostConfig.UsernsMode = usernsMode - if c.config.IDMappings.UIDMap != nil && c.config.IDMappings.GIDMap != nil { - hostConfig.IDMappings = generateIDMappings(c.config.IDMappings) - } - // Devices - // Do not include if privileged - assumed that all devices will be - // included. - var err error - hostConfig.Devices, err = c.GetDevices(hostConfig.Privileged, *ctrSpec, deviceNodes) - if err != nil { - return nil, err - } - // Ulimits hostConfig.Ulimits = []define.InspectUlimit{} if ctrSpec.Process != nil { diff --git a/libpod/container_inspect_freebsd.go b/libpod/container_inspect_freebsd.go new file mode 100644 index 000000000..8b4e8df87 --- /dev/null +++ b/libpod/container_inspect_freebsd.go @@ -0,0 +1,17 @@ +package libpod + +import ( + "github.com/containers/podman/v4/libpod/define" + spec "github.com/opencontainers/runtime-spec/specs-go" +) + +func (c *Container) platformInspectContainerHostConfig(ctrSpec *spec.Spec, hostConfig *define.InspectContainerHostConfig) error { + // Not sure what to put here. FreeBSD jails use pids from the + // global pool but can only see their own pids. + hostConfig.PidMode = "host" + + // UTS namespace mode + hostConfig.UTSMode = c.NamespaceMode(spec.UTSNamespace, ctrSpec) + + return nil +} diff --git a/libpod/container_inspect_linux.go b/libpod/container_inspect_linux.go new file mode 100644 index 000000000..355690d70 --- /dev/null +++ b/libpod/container_inspect_linux.go @@ -0,0 +1,306 @@ +package libpod + +import ( + "fmt" + "sort" + "strings" + + "github.com/containers/common/pkg/config" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/pkg/util" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/runtime-tools/generate" + "github.com/opencontainers/runtime-tools/validate" + "github.com/sirupsen/logrus" + "github.com/syndtr/gocapability/capability" +) + +func (c *Container) platformInspectContainerHostConfig(ctrSpec *spec.Spec, hostConfig *define.InspectContainerHostConfig) error { + // This is very expensive to initialize. + // So we don't want to initialize it unless we absolutely have to - IE, + // there are things that require a major:minor to path translation. + var deviceNodes map[string]string + + // Resource limits + if ctrSpec.Linux != nil { + if ctrSpec.Linux.Resources != nil { + if ctrSpec.Linux.Resources.CPU != nil { + if ctrSpec.Linux.Resources.CPU.Shares != nil { + hostConfig.CpuShares = *ctrSpec.Linux.Resources.CPU.Shares + } + if ctrSpec.Linux.Resources.CPU.Period != nil { + hostConfig.CpuPeriod = *ctrSpec.Linux.Resources.CPU.Period + } + if ctrSpec.Linux.Resources.CPU.Quota != nil { + hostConfig.CpuQuota = *ctrSpec.Linux.Resources.CPU.Quota + } + if ctrSpec.Linux.Resources.CPU.RealtimePeriod != nil { + hostConfig.CpuRealtimePeriod = *ctrSpec.Linux.Resources.CPU.RealtimePeriod + } + if ctrSpec.Linux.Resources.CPU.RealtimeRuntime != nil { + hostConfig.CpuRealtimeRuntime = *ctrSpec.Linux.Resources.CPU.RealtimeRuntime + } + hostConfig.CpusetCpus = ctrSpec.Linux.Resources.CPU.Cpus + hostConfig.CpusetMems = ctrSpec.Linux.Resources.CPU.Mems + } + if ctrSpec.Linux.Resources.Memory != nil { + if ctrSpec.Linux.Resources.Memory.Limit != nil { + hostConfig.Memory = *ctrSpec.Linux.Resources.Memory.Limit + } + if ctrSpec.Linux.Resources.Memory.Reservation != nil { + hostConfig.MemoryReservation = *ctrSpec.Linux.Resources.Memory.Reservation + } + if ctrSpec.Linux.Resources.Memory.Swap != nil { + hostConfig.MemorySwap = *ctrSpec.Linux.Resources.Memory.Swap + } + if ctrSpec.Linux.Resources.Memory.Swappiness != nil { + hostConfig.MemorySwappiness = int64(*ctrSpec.Linux.Resources.Memory.Swappiness) + } else { + // Swappiness has a default of -1 + hostConfig.MemorySwappiness = -1 + } + if ctrSpec.Linux.Resources.Memory.DisableOOMKiller != nil { + hostConfig.OomKillDisable = *ctrSpec.Linux.Resources.Memory.DisableOOMKiller + } + } + if ctrSpec.Linux.Resources.Pids != nil { + hostConfig.PidsLimit = ctrSpec.Linux.Resources.Pids.Limit + } + hostConfig.CgroupConf = ctrSpec.Linux.Resources.Unified + if ctrSpec.Linux.Resources.BlockIO != nil { + if ctrSpec.Linux.Resources.BlockIO.Weight != nil { + hostConfig.BlkioWeight = *ctrSpec.Linux.Resources.BlockIO.Weight + } + hostConfig.BlkioWeightDevice = []define.InspectBlkioWeightDevice{} + for _, dev := range ctrSpec.Linux.Resources.BlockIO.WeightDevice { + key := fmt.Sprintf("%d:%d", dev.Major, dev.Minor) + // TODO: how do we handle LeafWeight vs + // Weight? For now, ignore anything + // without Weight set. + if dev.Weight == nil { + logrus.Infof("Ignoring weight device %s as it lacks a weight", key) + continue + } + if deviceNodes == nil { + nodes, err := util.FindDeviceNodes() + if err != nil { + return err + } + deviceNodes = nodes + } + path, ok := deviceNodes[key] + if !ok { + logrus.Infof("Could not locate weight device %s in system devices", key) + continue + } + weightDev := define.InspectBlkioWeightDevice{} + weightDev.Path = path + weightDev.Weight = *dev.Weight + hostConfig.BlkioWeightDevice = append(hostConfig.BlkioWeightDevice, weightDev) + } + + readBps, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleReadBpsDevice) + if err != nil { + return err + } + hostConfig.BlkioDeviceReadBps = readBps + + writeBps, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleWriteBpsDevice) + if err != nil { + return err + } + hostConfig.BlkioDeviceWriteBps = writeBps + + readIops, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleReadIOPSDevice) + if err != nil { + return err + } + hostConfig.BlkioDeviceReadIOps = readIops + + writeIops, err := blkioDeviceThrottle(deviceNodes, ctrSpec.Linux.Resources.BlockIO.ThrottleWriteIOPSDevice) + if err != nil { + return err + } + hostConfig.BlkioDeviceWriteIOps = writeIops + } + } + } + + // Cap add and cap drop. + // We need a default set of capabilities to compare against. + // The OCI generate package has one, and is commonly used, so we'll + // use it. + // Problem: there are 5 sets of capabilities. + // Use the bounding set for this computation, it's the most encompassing + // (but still not perfect). + capAdd := []string{} + capDrop := []string{} + // No point in continuing if we got a spec without a Process block... + if ctrSpec.Process != nil { + // Max an O(1) lookup table for default bounding caps. + boundingCaps := make(map[string]bool) + g, err := generate.New("linux") + if err != nil { + return err + } + if !hostConfig.Privileged { + for _, cap := range g.Config.Process.Capabilities.Bounding { + boundingCaps[cap] = true + } + } else { + // If we are privileged, use all caps. + for _, cap := range capability.List() { + if g.HostSpecific && cap > validate.LastCap() { + continue + } + boundingCaps[fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))] = true + } + } + // Iterate through spec caps. + // If it's not in default bounding caps, it was added. + // If it is, delete from the default set. Whatever remains after + // we finish are the dropped caps. + for _, cap := range ctrSpec.Process.Capabilities.Bounding { + if _, ok := boundingCaps[cap]; ok { + delete(boundingCaps, cap) + } else { + capAdd = append(capAdd, cap) + } + } + for cap := range boundingCaps { + capDrop = append(capDrop, cap) + } + // Sort CapDrop so it displays in consistent order (GH #9490) + sort.Strings(capDrop) + } + hostConfig.CapAdd = capAdd + hostConfig.CapDrop = capDrop + switch { + case c.config.IPCNsCtr != "": + hostConfig.IpcMode = fmt.Sprintf("container:%s", c.config.IPCNsCtr) + case ctrSpec.Linux != nil: + // Locate the spec's IPC namespace. + // If there is none, it's ipc=host. + // If there is one and it has a path, it's "ns:". + // If no path, it's default - the empty string. + for _, ns := range ctrSpec.Linux.Namespaces { + if ns.Type == spec.IPCNamespace { + if ns.Path != "" { + hostConfig.IpcMode = fmt.Sprintf("ns:%s", ns.Path) + } else { + break + } + } + } + case c.config.NoShm: + hostConfig.IpcMode = "none" + case c.config.NoShmShare: + hostConfig.IpcMode = "private" + } + if hostConfig.IpcMode == "" { + hostConfig.IpcMode = "shareable" + } + + // Cgroup namespace mode + cgroupMode := "" + if c.config.CgroupNsCtr != "" { + cgroupMode = fmt.Sprintf("container:%s", c.config.CgroupNsCtr) + } else if ctrSpec.Linux != nil { + // Locate the spec's cgroup namespace + // If there is none, it's cgroup=host. + // If there is one and it has a path, it's "ns:". + // If there is no path, it's private. + for _, ns := range ctrSpec.Linux.Namespaces { + if ns.Type == spec.CgroupNamespace { + if ns.Path != "" { + cgroupMode = fmt.Sprintf("ns:%s", ns.Path) + } else { + cgroupMode = "private" + } + } + } + if cgroupMode == "" { + cgroupMode = "host" + } + } + hostConfig.CgroupMode = cgroupMode + + // Cgroup parent + // Need to check if it's the default, and not print if so. + defaultCgroupParent := "" + switch c.CgroupManager() { + case config.CgroupfsCgroupsManager: + defaultCgroupParent = CgroupfsDefaultCgroupParent + case config.SystemdCgroupsManager: + defaultCgroupParent = SystemdDefaultCgroupParent + } + if c.config.CgroupParent != defaultCgroupParent { + hostConfig.CgroupParent = c.config.CgroupParent + } + hostConfig.CgroupManager = c.CgroupManager() + + // PID namespace mode + pidMode := "" + if c.config.PIDNsCtr != "" { + pidMode = fmt.Sprintf("container:%s", c.config.PIDNsCtr) + } else if ctrSpec.Linux != nil { + // Locate the spec's PID namespace. + // If there is none, it's pid=host. + // If there is one and it has a path, it's "ns:". + // If there is no path, it's default - the empty string. + for _, ns := range ctrSpec.Linux.Namespaces { + if ns.Type == spec.PIDNamespace { + if ns.Path != "" { + pidMode = fmt.Sprintf("ns:%s", ns.Path) + } else { + pidMode = "private" + } + break + } + } + if pidMode == "" { + pidMode = "host" + } + } + hostConfig.PidMode = pidMode + + // UTS namespace mode + utsMode := c.NamespaceMode(spec.UTSNamespace, ctrSpec) + + hostConfig.UTSMode = utsMode + + // User namespace mode + usernsMode := "" + if c.config.UserNsCtr != "" { + usernsMode = fmt.Sprintf("container:%s", c.config.UserNsCtr) + } else if ctrSpec.Linux != nil { + // Locate the spec's user namespace. + // If there is none, it's default - the empty string. + // If there is one, it's "private" if no path, or "ns:" if + // there's a path. + + for _, ns := range ctrSpec.Linux.Namespaces { + if ns.Type == spec.UserNamespace { + if ns.Path != "" { + usernsMode = fmt.Sprintf("ns:%s", ns.Path) + } else { + usernsMode = "private" + } + } + } + } + hostConfig.UsernsMode = usernsMode + if c.config.IDMappings.UIDMap != nil && c.config.IDMappings.GIDMap != nil { + hostConfig.IDMappings = generateIDMappings(c.config.IDMappings) + } + // Devices + // Do not include if privileged - assumed that all devices will be + // included. + var err error + hostConfig.Devices, err = c.GetDevices(hostConfig.Privileged, *ctrSpec, deviceNodes) + if err != nil { + return err + } + + return nil +} diff --git a/libpod/container_internal.go b/libpod/container_internal.go index bad68991b..994243805 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -27,10 +27,12 @@ import ( cutil "github.com/containers/common/pkg/util" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/libpod/events" + "github.com/containers/podman/v4/libpod/shutdown" "github.com/containers/podman/v4/pkg/ctime" "github.com/containers/podman/v4/pkg/lookup" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/selinux" + "github.com/containers/podman/v4/pkg/systemd/notifyproxy" "github.com/containers/podman/v4/pkg/util" "github.com/containers/storage" "github.com/containers/storage/pkg/archive" @@ -205,7 +207,7 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error { } statusCode, err := strconv.Atoi(string(statusCodeStr)) if err != nil { - return fmt.Errorf("error converting exit status code (%q, err) for container %s to int: %w", + return fmt.Errorf("converting exit status code (%q, err) for container %s to int: %w", c.ID(), statusCodeStr, err) } c.state.ExitCode = int32(statusCode) @@ -292,20 +294,8 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err } // set up slirp4netns again because slirp4netns will die when conmon exits - if c.config.NetMode.IsSlirp4netns() { - err := c.runtime.setupSlirp4netns(c, c.state.NetNS) - if err != nil { - return false, err - } - } - - // set up rootlesskit port forwarder again since it dies when conmon exits - // we use rootlesskit port forwarder only as rootless and when bridge network is used - if rootless.IsRootless() && c.config.NetMode.IsBridge() && len(c.config.PortMappings) > 0 { - err := c.runtime.setupRootlessPortMappingViaRLK(c, c.state.NetNS.Path(), c.state.NetworkStatus) - if err != nil { - return false, err - } + if err := c.setupRootlessNetwork(); err != nil { + return false, err } if c.state.State == define.ContainerStateStopped { @@ -475,7 +465,7 @@ func (c *Container) setupStorage(ctx context.Context) error { defOptions, err := storage.GetMountOptions(c.runtime.store.GraphDriverName(), c.runtime.store.GraphOptions()) if err != nil { - return fmt.Errorf("error getting default mount options: %w", err) + return fmt.Errorf("getting default mount options: %w", err) } var newOptions []string for _, opt := range defOptions { @@ -510,7 +500,7 @@ func (c *Container) setupStorage(ctx context.Context) error { } } if containerInfoErr != nil { - return fmt.Errorf("error creating container storage: %w", containerInfoErr) + return fmt.Errorf("creating container storage: %w", containerInfoErr) } // Only reconfig IDMappings if layer was mounted from storage. @@ -552,7 +542,7 @@ func (c *Container) setupStorage(ctx context.Context) error { artifacts := filepath.Join(c.config.StaticDir, artifactsDir) if err := os.MkdirAll(artifacts, 0755); err != nil { - return fmt.Errorf("error creating artifacts directory: %w", err) + return fmt.Errorf("creating artifacts directory: %w", err) } return nil @@ -586,7 +576,7 @@ func (c *Container) teardownStorage() error { artifacts := filepath.Join(c.config.StaticDir, artifactsDir) if err := os.RemoveAll(artifacts); err != nil { - return fmt.Errorf("error removing container %s artifacts %q: %w", c.ID(), artifacts, err) + return fmt.Errorf("removing container %s artifacts %q: %w", c.ID(), artifacts, err) } if err := c.cleanupStorage(); err != nil { @@ -603,7 +593,7 @@ func (c *Container) teardownStorage() error { return nil } - return fmt.Errorf("error removing container %s root filesystem: %w", c.ID(), err) + return fmt.Errorf("removing container %s root filesystem: %w", c.ID(), err) } return nil @@ -654,7 +644,7 @@ func (c *Container) refresh() error { // It was lost in the reboot and must be recreated dir, err := c.runtime.storageService.GetRunDir(c.ID()) if err != nil { - return fmt.Errorf("error retrieving temporary directory for container %s: %w", c.ID(), err) + return fmt.Errorf("retrieving temporary directory for container %s: %w", c.ID(), err) } c.state.RunDir = dir @@ -668,7 +658,7 @@ func (c *Container) refresh() error { } root := filepath.Join(c.runtime.config.Engine.TmpDir, "containers-root", c.ID()) if err := os.MkdirAll(root, 0755); err != nil { - return fmt.Errorf("error creating userNS tmpdir for container %s: %w", c.ID(), err) + return fmt.Errorf("creating userNS tmpdir for container %s: %w", c.ID(), err) } if err := os.Chown(root, c.RootUID(), c.RootGID()); err != nil { return err @@ -678,7 +668,7 @@ func (c *Container) refresh() error { // We need to pick up a new lock lock, err := c.runtime.lockManager.AllocateAndRetrieveLock(c.config.LockID) if err != nil { - return fmt.Errorf("error acquiring lock %d for container %s: %w", c.config.LockID, c.ID(), err) + return fmt.Errorf("acquiring lock %d for container %s: %w", c.config.LockID, c.ID(), err) } c.lock = lock @@ -699,7 +689,7 @@ func (c *Container) refresh() error { } if err := c.save(); err != nil { - return fmt.Errorf("error refreshing state for container %s: %w", c.ID(), err) + return fmt.Errorf("refreshing state for container %s: %w", c.ID(), err) } // Remove ctl and attach files, which may persist across reboot @@ -720,22 +710,22 @@ func (c *Container) removeConmonFiles() error { } if err := os.Remove(attachFile); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("error removing container %s attach file: %w", c.ID(), err) + return fmt.Errorf("removing container %s attach file: %w", c.ID(), err) } ctlFile := filepath.Join(c.bundlePath(), "ctl") if err := os.Remove(ctlFile); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("error removing container %s ctl file: %w", c.ID(), err) + return fmt.Errorf("removing container %s ctl file: %w", c.ID(), err) } winszFile := filepath.Join(c.bundlePath(), "winsz") if err := os.Remove(winszFile); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("error removing container %s winsz file: %w", c.ID(), err) + return fmt.Errorf("removing container %s winsz file: %w", c.ID(), err) } oomFile := filepath.Join(c.bundlePath(), "oom") if err := os.Remove(oomFile); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("error removing container %s OOM file: %w", c.ID(), err) + return fmt.Errorf("removing container %s OOM file: %w", c.ID(), err) } // Remove the exit file so we don't leak memory in tmpfs @@ -744,7 +734,7 @@ func (c *Container) removeConmonFiles() error { return err } if err := os.Remove(exitFile); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("error removing container %s exit file: %w", c.ID(), err) + return fmt.Errorf("removing container %s exit file: %w", c.ID(), err) } return nil @@ -767,12 +757,12 @@ func (c *Container) export(path string) error { input, err := archive.Tar(mountPoint, archive.Uncompressed) if err != nil { - return fmt.Errorf("error reading container directory %q: %w", c.ID(), err) + return fmt.Errorf("reading container directory %q: %w", c.ID(), err) } outFile, err := os.Create(path) if err != nil { - return fmt.Errorf("error creating file %q: %w", path, err) + return fmt.Errorf("creating file %q: %w", path, err) } defer outFile.Close() @@ -788,7 +778,7 @@ func (c *Container) getArtifactPath(name string) string { // save container state to the database func (c *Container) save() error { if err := c.runtime.state.SaveContainer(c); err != nil { - return fmt.Errorf("error saving container %s state: %w", c.ID(), err) + return fmt.Errorf("saving container %s state: %w", c.ID(), err) } return nil } @@ -842,7 +832,7 @@ func (c *Container) prepareToStart(ctx context.Context, recursive bool) (retErr func (c *Container) checkDependenciesAndHandleError() error { notRunning, err := c.checkDependenciesRunning() if err != nil { - return fmt.Errorf("error checking dependencies for container %s: %w", c.ID(), err) + return fmt.Errorf("checking dependencies for container %s: %w", c.ID(), err) } if len(notRunning) > 0 { depString := strings.Join(notRunning, ",") @@ -861,7 +851,7 @@ func (c *Container) startDependencies(ctx context.Context) error { depVisitedCtrs := make(map[string]*Container) if err := c.getAllDependencies(depVisitedCtrs); err != nil { - return fmt.Errorf("error starting dependency for container %s: %w", c.ID(), err) + return fmt.Errorf("starting dependency for container %s: %w", c.ID(), err) } // Because of how Go handles passing slices through functions, a slice cannot grow between function calls @@ -874,7 +864,7 @@ func (c *Container) startDependencies(ctx context.Context) error { // Build a dependency graph of containers graph, err := BuildContainerGraph(depCtrs) if err != nil { - return fmt.Errorf("error generating dependency graph for container %s: %w", c.ID(), err) + return fmt.Errorf("generating dependency graph for container %s: %w", c.ID(), err) } // If there are no containers without dependencies, we can't start @@ -900,7 +890,7 @@ func (c *Container) startDependencies(ctx context.Context) error { for _, e := range ctrErrors { logrus.Errorf("%q", e) } - return fmt.Errorf("error starting some containers: %w", define.ErrInternal) + return fmt.Errorf("starting some containers: %w", define.ErrInternal) } return nil } @@ -956,13 +946,13 @@ func (c *Container) checkDependenciesRunning() ([]string, error) { // Get the dependency container depCtr, err := c.runtime.state.Container(dep) if err != nil { - return nil, fmt.Errorf("error retrieving dependency %s of container %s from state: %w", dep, c.ID(), err) + return nil, fmt.Errorf("retrieving dependency %s of container %s from state: %w", dep, c.ID(), err) } // Check the status state, err := depCtr.State() if err != nil { - return nil, fmt.Errorf("error retrieving state of dependency %s of container %s: %w", dep, c.ID(), err) + return nil, fmt.Errorf("retrieving state of dependency %s of container %s: %w", dep, c.ID(), err) } if state != define.ContainerStateRunning && !depCtr.config.IsInfra { notRunning = append(notRunning, dep) @@ -1049,6 +1039,13 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { } } + // To ensure that we don't lose track of Conmon if hit by a SIGTERM + // in the middle of setting up the container, inhibit shutdown signals + // until after we save Conmon's PID to the state. + // TODO: This can likely be removed once conmon-rs support merges. + shutdown.Inhibit() + defer shutdown.Uninhibit() + // With the spec complete, do an OCI create if _, err = c.ociRuntime.CreateContainer(c, nil); err != nil { return err @@ -1084,6 +1081,7 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { if err := c.save(); err != nil { return err } + if c.config.HealthCheckConfig != nil { if err := c.createTimer(); err != nil { logrus.Error(err) @@ -1224,9 +1222,9 @@ func (c *Container) start() error { payload += "\n" payload += daemon.SdNotifyReady } - if sent, err := daemon.SdNotify(false, payload); err != nil { + if err := notifyproxy.SendMessage(c.config.SdNotifySocket, payload); err != nil { logrus.Errorf("Notifying systemd of Conmon PID: %s", err.Error()) - } else if sent { + } else { logrus.Debugf("Notify sent successfully") } } @@ -1282,12 +1280,18 @@ func (c *Container) stop(timeout uint) error { // is held when busy-waiting for the container to be stopped. c.state.State = define.ContainerStateStopping if err := c.save(); err != nil { - return fmt.Errorf("error saving container %s state before stopping: %w", c.ID(), err) + return fmt.Errorf("saving container %s state before stopping: %w", c.ID(), err) } if !c.batched { c.lock.Unlock() } + if c.config.HealthCheckConfig != nil { + if err := c.removeTransientFiles(context.Background()); err != nil { + logrus.Error(err.Error()) + } + } + stopErr := c.ociRuntime.StopContainer(c, timeout, all) if !c.batched { @@ -1342,7 +1346,7 @@ func (c *Container) stop(timeout uint) error { } if err := c.save(); err != nil { - return fmt.Errorf("error saving container %s state after stopping: %w", c.ID(), err) + return fmt.Errorf("saving container %s state after stopping: %w", c.ID(), err) } // Wait until we have an exit file, and sync once we do @@ -1556,7 +1560,7 @@ func (c *Container) mountStorage() (_ string, deferredErr error) { rootUID, rootGID := c.RootUID(), c.RootGID() - dirfd, err := unix.Open(mountPoint, unix.O_RDONLY|unix.O_PATH, 0) + dirfd, err := openDirectory(mountPoint) if err != nil { return "", fmt.Errorf("open mount point: %w", err) } @@ -1579,7 +1583,7 @@ func (c *Container) mountStorage() (_ string, deferredErr error) { return "", fmt.Errorf("resolve /etc in the container: %w", err) } - etcInTheContainerFd, err := unix.Open(etcInTheContainerPath, unix.O_RDONLY|unix.O_PATH, 0) + etcInTheContainerFd, err := openDirectory(etcInTheContainerPath) if err != nil { return "", fmt.Errorf("open /etc in the container: %w", err) } @@ -1629,7 +1633,7 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) logrus.Debugf("Going to mount named volume %s", v.Name) vol, err := c.runtime.state.Volume(v.Name) if err != nil { - return nil, fmt.Errorf("error retrieving named volume %s for container %s: %w", v.Name, c.ID(), err) + return nil, fmt.Errorf("retrieving named volume %s for container %s: %w", v.Name, c.ID(), err) } if vol.config.LockID == c.config.LockID { @@ -1639,7 +1643,7 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) defer vol.lock.Unlock() if vol.needsMount() { if err := vol.mount(); err != nil { - return nil, fmt.Errorf("error mounting volume %s for container %s: %w", vol.Name(), c.ID(), err) + return nil, fmt.Errorf("mounting volume %s for container %s: %w", vol.Name(), c.ID(), err) } } // The volume may need a copy-up. Check the state. @@ -1652,7 +1656,7 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) srcDir, err := securejoin.SecureJoin(mountpoint, v.Dest) if err != nil { - return nil, fmt.Errorf("error calculating destination path to copy up container %s volume %s: %w", c.ID(), vol.Name(), err) + return nil, fmt.Errorf("calculating destination path to copy up container %s volume %s: %w", c.ID(), vol.Name(), err) } // Do a manual stat on the source directory to verify existence. // Skip the rest if it exists. @@ -1663,7 +1667,7 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) // up. return vol, nil } - return nil, fmt.Errorf("error identifying source directory for copy up into volume %s: %w", vol.Name(), err) + return nil, fmt.Errorf("identifying source directory for copy up into volume %s: %w", vol.Name(), err) } // If it's not a directory we're mounting over it. if !srcStat.IsDir() { @@ -1673,9 +1677,9 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) // a bizarre issue where something copier.Get will ENOENT on // empty directories and sometimes it will not. // RHBZ#1928643 - srcContents, err := ioutil.ReadDir(srcDir) + srcContents, err := os.ReadDir(srcDir) if err != nil { - return nil, fmt.Errorf("error reading contents of source directory for copy up into volume %s: %w", vol.Name(), err) + return nil, fmt.Errorf("reading contents of source directory for copy up into volume %s: %w", vol.Name(), err) } if len(srcContents) == 0 { return vol, nil @@ -1683,9 +1687,9 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) // If the volume is not empty, we should not copy up. volMount := vol.mountPoint() - contents, err := ioutil.ReadDir(volMount) + contents, err := os.ReadDir(volMount) if err != nil { - return nil, fmt.Errorf("error listing contents of volume %s mountpoint when copying up from container %s: %w", vol.Name(), c.ID(), err) + return nil, fmt.Errorf("listing contents of volume %s mountpoint when copying up from container %s: %w", vol.Name(), c.ID(), err) } if len(contents) > 0 { // The volume is not empty. It was likely modified @@ -1727,11 +1731,11 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) if err2 != nil { logrus.Errorf("Streaming contents of container %s directory for volume copy-up: %v", c.ID(), err2) } - return nil, fmt.Errorf("error copying up to volume %s: %w", vol.Name(), err) + return nil, fmt.Errorf("copying up to volume %s: %w", vol.Name(), err) } if err := <-errChan; err != nil { - return nil, fmt.Errorf("error streaming container content for copy up into volume %s: %w", vol.Name(), err) + return nil, fmt.Errorf("streaming container content for copy up into volume %s: %w", vol.Name(), err) } } return vol, nil @@ -1814,7 +1818,7 @@ func (c *Container) cleanupStorage() error { if cleanupErr != nil { logrus.Errorf("Unmounting container %s: %v", c.ID(), cleanupErr) } - cleanupErr = fmt.Errorf("error retrieving named volume %s for container %s: %w", v.Name, c.ID(), err) + cleanupErr = fmt.Errorf("retrieving named volume %s for container %s: %w", v.Name, c.ID(), err) // We need to try and unmount every volume, so continue // if they fail. @@ -1827,7 +1831,7 @@ func (c *Container) cleanupStorage() error { if cleanupErr != nil { logrus.Errorf("Unmounting container %s: %v", c.ID(), cleanupErr) } - cleanupErr = fmt.Errorf("error unmounting volume %s for container %s: %w", vol.Name(), c.ID(), err) + cleanupErr = fmt.Errorf("unmounting volume %s for container %s: %w", vol.Name(), c.ID(), err) } vol.lock.Unlock() } @@ -1852,7 +1856,7 @@ func (c *Container) cleanup(ctx context.Context) error { // Clean up network namespace, if present if err := c.cleanupNetwork(); err != nil { - lastError = fmt.Errorf("error removing container %s network: %w", c.ID(), err) + lastError = fmt.Errorf("removing container %s network: %w", c.ID(), err) } // cleanup host entry if it is shared @@ -1890,7 +1894,7 @@ func (c *Container) cleanup(ctx context.Context) error { if lastError != nil { logrus.Errorf("Unmounting container %s storage: %v", c.ID(), err) } else { - lastError = fmt.Errorf("error unmounting container %s storage: %w", c.ID(), err) + lastError = fmt.Errorf("unmounting container %s storage: %w", c.ID(), err) } } @@ -1974,7 +1978,7 @@ func (c *Container) stopPodIfNeeded(ctx context.Context) error { // hooks. func (c *Container) delete(ctx context.Context) error { if err := c.ociRuntime.DeleteContainer(c); err != nil { - return fmt.Errorf("error removing container %s from runtime: %w", c.ID(), err) + return fmt.Errorf("removing container %s from runtime: %w", c.ID(), err) } if err := c.postDeleteHooks(ctx); err != nil { @@ -2037,7 +2041,7 @@ func (c *Container) writeStringToRundir(destFile, contents string) (string, erro destFileName := filepath.Join(c.state.RunDir, destFile) if err := os.Remove(destFileName); err != nil && !os.IsNotExist(err) { - return "", fmt.Errorf("error removing %s for container %s: %w", destFile, c.ID(), err) + return "", fmt.Errorf("removing %s for container %s: %w", destFile, c.ID(), err) } if err := writeStringToPath(destFileName, contents, c.config.MountLabel, c.RootUID(), c.RootGID()); err != nil { @@ -2071,22 +2075,22 @@ func (c *Container) saveSpec(spec *spec.Spec) error { jsonPath := filepath.Join(c.bundlePath(), "config.json") if _, err := os.Stat(jsonPath); err != nil { if !os.IsNotExist(err) { - return fmt.Errorf("error doing stat on container %s spec: %w", c.ID(), err) + return fmt.Errorf("doing stat on container %s spec: %w", c.ID(), err) } // The spec does not exist, we're fine } else { // The spec exists, need to remove it if err := os.Remove(jsonPath); err != nil { - return fmt.Errorf("error replacing runtime spec for container %s: %w", c.ID(), err) + return fmt.Errorf("replacing runtime spec for container %s: %w", c.ID(), err) } } fileJSON, err := json.Marshal(spec) if err != nil { - return fmt.Errorf("error exporting runtime spec for container %s to JSON: %w", c.ID(), err) + return fmt.Errorf("exporting runtime spec for container %s to JSON: %w", c.ID(), err) } if err := ioutil.WriteFile(jsonPath, fileJSON, 0644); err != nil { - return fmt.Errorf("error writing runtime spec JSON for container %s to disk: %w", c.ID(), err) + return fmt.Errorf("writing runtime spec JSON for container %s to disk: %w", c.ID(), err) } logrus.Debugf("Created OCI spec for container %s at %s", c.ID(), jsonPath) @@ -2154,11 +2158,11 @@ func (c *Container) mount() (string, error) { mountPoint, err := c.runtime.storageService.MountContainerImage(c.ID()) if err != nil { - return "", fmt.Errorf("error mounting storage for container %s: %w", c.ID(), err) + return "", fmt.Errorf("mounting storage for container %s: %w", c.ID(), err) } mountPoint, err = filepath.EvalSymlinks(mountPoint) if err != nil { - return "", fmt.Errorf("error resolving storage path for container %s: %w", c.ID(), err) + return "", fmt.Errorf("resolving storage path for container %s: %w", c.ID(), err) } if err := os.Chown(mountPoint, c.RootUID(), c.RootGID()); err != nil { return "", fmt.Errorf("cannot chown %s to %d:%d: %w", mountPoint, c.RootUID(), c.RootGID(), err) @@ -2170,7 +2174,7 @@ func (c *Container) mount() (string, error) { func (c *Container) unmount(force bool) error { // Also unmount storage if _, err := c.runtime.storageService.UnmountContainerImage(c.ID(), force); err != nil { - return fmt.Errorf("error unmounting container %s root filesystem: %w", c.ID(), err) + return fmt.Errorf("unmounting container %s root filesystem: %w", c.ID(), err) } return nil @@ -2299,7 +2303,7 @@ func (c *Container) checkExitFile() error { return nil } - return fmt.Errorf("error running stat on container %s exit file: %w", c.ID(), err) + return fmt.Errorf("running stat on container %s exit file: %w", c.ID(), err) } // Alright, it exists. Transition to Stopped state. @@ -2354,3 +2358,12 @@ func (c *Container) extractSecretToCtrStorage(secr *ContainerSecret) error { } return nil } + +// update calls the ociRuntime update function to modify a cgroup config after container creation +func (c *Container) update(resources *spec.LinuxResources) error { + if err := c.ociRuntime.UpdateContainer(c, resources); err != nil { + return err + } + logrus.Debugf("updated container %s", c.ID()) + return nil +} diff --git a/libpod/container_internal_common.go b/libpod/container_internal_common.go new file mode 100644 index 000000000..9c4a3bb67 --- /dev/null +++ b/libpod/container_internal_common.go @@ -0,0 +1,2694 @@ +//go:build linux || freebsd +// +build linux freebsd + +package libpod + +import ( + "context" + "errors" + "fmt" + "io" + "io/ioutil" + "math" + "os" + "os/user" + "path" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" + + metadata "github.com/checkpoint-restore/checkpointctl/lib" + "github.com/checkpoint-restore/go-criu/v5/stats" + cdi "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" + "github.com/containers/buildah" + "github.com/containers/buildah/pkg/chrootuser" + "github.com/containers/buildah/pkg/overlay" + butil "github.com/containers/buildah/util" + "github.com/containers/common/libnetwork/etchosts" + "github.com/containers/common/libnetwork/resolvconf" + "github.com/containers/common/libnetwork/types" + "github.com/containers/common/pkg/apparmor" + "github.com/containers/common/pkg/chown" + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/subscriptions" + "github.com/containers/common/pkg/umask" + cutil "github.com/containers/common/pkg/util" + is "github.com/containers/image/v5/storage" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/libpod/events" + "github.com/containers/podman/v4/pkg/annotations" + "github.com/containers/podman/v4/pkg/checkpoint/crutils" + "github.com/containers/podman/v4/pkg/criu" + "github.com/containers/podman/v4/pkg/lookup" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/podman/v4/pkg/util" + "github.com/containers/podman/v4/version" + "github.com/containers/storage/pkg/archive" + "github.com/containers/storage/pkg/idtools" + "github.com/containers/storage/pkg/lockfile" + securejoin "github.com/cyphar/filepath-securejoin" + runcuser "github.com/opencontainers/runc/libcontainer/user" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/runtime-tools/generate" + "github.com/opencontainers/selinux/go-selinux" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" +) + +// Internal only function which returns upper and work dir from +// overlay options. +func getOverlayUpperAndWorkDir(options []string) (string, string, error) { + upperDir := "" + workDir := "" + for _, o := range options { + if strings.HasPrefix(o, "upperdir") { + splitOpt := strings.SplitN(o, "=", 2) + if len(splitOpt) > 1 { + upperDir = splitOpt[1] + if upperDir == "" { + return "", "", errors.New("cannot accept empty value for upperdir") + } + } + } + if strings.HasPrefix(o, "workdir") { + splitOpt := strings.SplitN(o, "=", 2) + if len(splitOpt) > 1 { + workDir = splitOpt[1] + if workDir == "" { + return "", "", errors.New("cannot accept empty value for workdir") + } + } + } + } + if (upperDir != "" && workDir == "") || (upperDir == "" && workDir != "") { + return "", "", errors.New("must specify both upperdir and workdir") + } + return upperDir, workDir, nil +} + +// Generate spec for a container +// Accepts a map of the container's dependencies +func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { + overrides := c.getUserOverrides() + execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, c.config.User, overrides) + if err != nil { + if cutil.StringInSlice(c.config.User, c.config.HostUsers) { + execUser, err = lookupHostUser(c.config.User) + } + if err != nil { + return nil, err + } + } + + // NewFromSpec() is deprecated according to its comment + // however the recommended replace just causes a nil map panic + //nolint:staticcheck + g := generate.NewFromSpec(c.config.Spec) + + // If the flag to mount all devices is set for a privileged container, add + // all the devices from the host's machine into the container + if c.config.MountAllDevices { + if err := util.AddPrivilegedDevices(&g); err != nil { + return nil, err + } + } + + // If network namespace was requested, add it now + if err := c.addNetworkNamespace(&g); err != nil { + return nil, err + } + + // Apply AppArmor checks and load the default profile if needed. + if len(c.config.Spec.Process.ApparmorProfile) > 0 { + updatedProfile, err := apparmor.CheckProfileAndLoadDefault(c.config.Spec.Process.ApparmorProfile) + if err != nil { + return nil, err + } + g.SetProcessApparmorProfile(updatedProfile) + } + + if err := c.makeBindMounts(); err != nil { + return nil, err + } + + if err := c.mountNotifySocket(g); err != nil { + return nil, err + } + + // Get host UID and GID based on the container process UID and GID. + hostUID, hostGID, err := butil.GetHostIDs(util.IDtoolsToRuntimeSpec(c.config.IDMappings.UIDMap), util.IDtoolsToRuntimeSpec(c.config.IDMappings.GIDMap), uint32(execUser.Uid), uint32(execUser.Gid)) + if err != nil { + return nil, err + } + + // Add named volumes + for _, namedVol := range c.config.NamedVolumes { + volume, err := c.runtime.GetVolume(namedVol.Name) + if err != nil { + return nil, fmt.Errorf("retrieving volume %s to add to container %s: %w", namedVol.Name, c.ID(), err) + } + mountPoint, err := volume.MountPoint() + if err != nil { + return nil, err + } + + overlayFlag := false + upperDir := "" + workDir := "" + for _, o := range namedVol.Options { + if o == "O" { + overlayFlag = true + upperDir, workDir, err = getOverlayUpperAndWorkDir(namedVol.Options) + if err != nil { + return nil, err + } + } + } + + if overlayFlag { + var overlayMount spec.Mount + var overlayOpts *overlay.Options + contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) + if err != nil { + return nil, err + } + + overlayOpts = &overlay.Options{RootUID: c.RootUID(), + RootGID: c.RootGID(), + UpperDirOptionFragment: upperDir, + WorkDirOptionFragment: workDir, + GraphOpts: c.runtime.store.GraphOptions(), + } + + overlayMount, err = overlay.MountWithOptions(contentDir, mountPoint, namedVol.Dest, overlayOpts) + if err != nil { + return nil, fmt.Errorf("mounting overlay failed %q: %w", mountPoint, err) + } + + for _, o := range namedVol.Options { + if o == "U" { + if err := c.ChangeHostPathOwnership(mountPoint, true, int(hostUID), int(hostGID)); err != nil { + return nil, err + } + + if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { + return nil, err + } + } + } + g.AddMount(overlayMount) + } else { + volMount := spec.Mount{ + Type: define.TypeBind, + Source: mountPoint, + Destination: namedVol.Dest, + Options: namedVol.Options, + } + g.AddMount(volMount) + } + } + + // Check if the spec file mounts contain the options z, Z or U. + // If they have z or Z, relabel the source directory and then remove the option. + // If they have U, chown the source directory and them remove the option. + for i := range g.Config.Mounts { + m := &g.Config.Mounts[i] + var options []string + for _, o := range m.Options { + switch o { + case "U": + if m.Type == "tmpfs" { + options = append(options, []string{fmt.Sprintf("uid=%d", execUser.Uid), fmt.Sprintf("gid=%d", execUser.Gid)}...) + } else { + // only chown on initial creation of container + if err := c.ChangeHostPathOwnership(m.Source, true, int(hostUID), int(hostGID)); err != nil { + return nil, err + } + } + case "z": + fallthrough + case "Z": + if err := c.relabel(m.Source, c.MountLabel(), label.IsShared(o)); err != nil { + return nil, err + } + + default: + options = append(options, o) + } + } + m.Options = options + } + + c.setProcessLabel(&g) + c.setMountLabel(&g) + + // Add bind mounts to container + for dstPath, srcPath := range c.state.BindMounts { + newMount := spec.Mount{ + Type: define.TypeBind, + Source: srcPath, + Destination: dstPath, + Options: bindOptions, + } + if c.IsReadOnly() && dstPath != "/dev/shm" { + newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") + } + if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { + newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") + } + if !MountExists(g.Mounts(), dstPath) { + g.AddMount(newMount) + } else { + logrus.Infof("User mount overriding libpod mount at %q", dstPath) + } + } + + // Add overlay volumes + for _, overlayVol := range c.config.OverlayVolumes { + upperDir, workDir, err := getOverlayUpperAndWorkDir(overlayVol.Options) + if err != nil { + return nil, err + } + contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) + if err != nil { + return nil, err + } + overlayOpts := &overlay.Options{RootUID: c.RootUID(), + RootGID: c.RootGID(), + UpperDirOptionFragment: upperDir, + WorkDirOptionFragment: workDir, + GraphOpts: c.runtime.store.GraphOptions(), + } + + overlayMount, err := overlay.MountWithOptions(contentDir, overlayVol.Source, overlayVol.Dest, overlayOpts) + if err != nil { + return nil, fmt.Errorf("mounting overlay failed %q: %w", overlayVol.Source, err) + } + + // Check overlay volume options + for _, o := range overlayVol.Options { + if o == "U" { + if err := c.ChangeHostPathOwnership(overlayVol.Source, true, int(hostUID), int(hostGID)); err != nil { + return nil, err + } + + if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { + return nil, err + } + } + } + + g.AddMount(overlayMount) + } + + // Add image volumes as overlay mounts + for _, volume := range c.config.ImageVolumes { + // Mount the specified image. + img, _, err := c.runtime.LibimageRuntime().LookupImage(volume.Source, nil) + if err != nil { + return nil, fmt.Errorf("creating image volume %q:%q: %w", volume.Source, volume.Dest, err) + } + mountPoint, err := img.Mount(ctx, nil, "") + if err != nil { + return nil, fmt.Errorf("mounting image volume %q:%q: %w", volume.Source, volume.Dest, err) + } + + contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) + if err != nil { + return nil, fmt.Errorf("failed to create TempDir in the %s directory: %w", c.config.StaticDir, err) + } + + var overlayMount spec.Mount + if volume.ReadWrite { + overlayMount, err = overlay.Mount(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) + } else { + overlayMount, err = overlay.MountReadOnly(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) + } + if err != nil { + return nil, fmt.Errorf("creating overlay mount for image %q failed: %w", volume.Source, err) + } + g.AddMount(overlayMount) + } + + hasHomeSet := false + for _, s := range c.config.Spec.Process.Env { + if strings.HasPrefix(s, "HOME=") { + hasHomeSet = true + break + } + } + if !hasHomeSet && execUser.Home != "" { + c.config.Spec.Process.Env = append(c.config.Spec.Process.Env, fmt.Sprintf("HOME=%s", execUser.Home)) + } + + if c.config.User != "" { + // User and Group must go together + g.SetProcessUID(uint32(execUser.Uid)) + g.SetProcessGID(uint32(execUser.Gid)) + g.AddProcessAdditionalGid(uint32(execUser.Gid)) + } + + if c.config.Umask != "" { + decVal, err := strconv.ParseUint(c.config.Umask, 8, 32) + if err != nil { + return nil, fmt.Errorf("invalid Umask Value: %w", err) + } + umask := uint32(decVal) + g.Config.Process.User.Umask = &umask + } + + // Add addition groups if c.config.GroupAdd is not empty + if len(c.config.Groups) > 0 { + gids, err := lookup.GetContainerGroups(c.config.Groups, c.state.Mountpoint, overrides) + if err != nil { + return nil, fmt.Errorf("looking up supplemental groups for container %s: %w", c.ID(), err) + } + for _, gid := range gids { + g.AddProcessAdditionalGid(gid) + } + } + + if err := c.addSystemdMounts(&g); err != nil { + return nil, err + } + + // Look up and add groups the user belongs to, if a group wasn't directly specified + if !strings.Contains(c.config.User, ":") { + // the gidMappings that are present inside the container user namespace + var gidMappings []idtools.IDMap + + switch { + case len(c.config.IDMappings.GIDMap) > 0: + gidMappings = c.config.IDMappings.GIDMap + case rootless.IsRootless(): + // Check whether the current user namespace has enough gids available. + availableGids, err := rootless.GetAvailableGids() + if err != nil { + return nil, fmt.Errorf("cannot read number of available GIDs: %w", err) + } + gidMappings = []idtools.IDMap{{ + ContainerID: 0, + HostID: 0, + Size: int(availableGids), + }} + default: + gidMappings = []idtools.IDMap{{ + ContainerID: 0, + HostID: 0, + Size: math.MaxInt32, + }} + } + for _, gid := range execUser.Sgids { + isGIDAvailable := false + for _, m := range gidMappings { + if gid >= m.ContainerID && gid < m.ContainerID+m.Size { + isGIDAvailable = true + break + } + } + if isGIDAvailable { + g.AddProcessAdditionalGid(uint32(gid)) + } else { + logrus.Warnf("Additional gid=%d is not present in the user namespace, skip setting it", gid) + } + } + } + + // Add shared namespaces from other containers + if err := c.addSharedNamespaces(&g); err != nil { + return nil, err + } + + g.SetRootPath(c.state.Mountpoint) + g.AddAnnotation(annotations.Created, c.config.CreatedTime.Format(time.RFC3339Nano)) + g.AddAnnotation("org.opencontainers.image.stopSignal", fmt.Sprintf("%d", c.config.StopSignal)) + + if _, exists := g.Config.Annotations[annotations.ContainerManager]; !exists { + g.AddAnnotation(annotations.ContainerManager, annotations.ContainerManagerLibpod) + } + + if err := c.setCgroupsPath(&g); err != nil { + return nil, err + } + + // Warning: CDI may alter g.Config in place. + if len(c.config.CDIDevices) > 0 { + registry := cdi.GetRegistry( + cdi.WithAutoRefresh(false), + ) + if err := registry.Refresh(); err != nil { + logrus.Debugf("The following error was triggered when refreshing the CDI registry: %v", err) + } + _, err := registry.InjectDevices(g.Config, c.config.CDIDevices...) + if err != nil { + return nil, fmt.Errorf("setting up CDI devices: %w", err) + } + } + + // Mounts need to be sorted so paths will not cover other paths + mounts := sortMounts(g.Mounts()) + g.ClearMounts() + + for _, m := range mounts { + // We need to remove all symlinks from tmpfs mounts. + // Runc and other runtimes may choke on them. + // Easy solution: use securejoin to do a scoped evaluation of + // the links, then trim off the mount prefix. + if m.Type == "tmpfs" { + finalPath, err := securejoin.SecureJoin(c.state.Mountpoint, m.Destination) + if err != nil { + return nil, fmt.Errorf("resolving symlinks for mount destination %s: %w", m.Destination, err) + } + trimmedPath := strings.TrimPrefix(finalPath, strings.TrimSuffix(c.state.Mountpoint, "/")) + m.Destination = trimmedPath + } + g.AddMount(m) + } + + if err := c.addRootPropagation(&g, mounts); err != nil { + return nil, err + } + + // Warning: precreate hooks may alter g.Config in place. + if c.state.ExtensionStageHooks, err = c.setupOCIHooks(ctx, g.Config); err != nil { + return nil, fmt.Errorf("setting up OCI Hooks: %w", err) + } + if len(c.config.EnvSecrets) > 0 { + manager, err := c.runtime.SecretsManager() + if err != nil { + return nil, err + } + if err != nil { + return nil, err + } + for name, secr := range c.config.EnvSecrets { + _, data, err := manager.LookupSecretData(secr.Name) + if err != nil { + return nil, err + } + g.AddProcessEnv(name, string(data)) + } + } + + // Pass down the LISTEN_* environment (see #10443). + for _, key := range []string{"LISTEN_PID", "LISTEN_FDS", "LISTEN_FDNAMES"} { + if val, ok := os.LookupEnv(key); ok { + // Force the PID to `1` since we cannot rely on (all + // versions of) all runtimes to do it for us. + if key == "LISTEN_PID" { + val = "1" + } + g.AddProcessEnv(key, val) + } + } + + return g.Config, nil +} + +// isWorkDirSymlink returns true if resolved workdir is symlink or a chain of symlinks, +// and final resolved target is present either on volume, mount or inside of container +// otherwise it returns false. Following function is meant for internal use only and +// can change at any point of time. +func (c *Container) isWorkDirSymlink(resolvedPath string) bool { + // We cannot create workdir since explicit --workdir is + // set in config but workdir could also be a symlink. + // If it's a symlink, check if the resolved target is present in the container. + // If so, that's a valid use case: return nil. + + maxSymLinks := 0 + for { + // Linux only supports a chain of 40 links. + // Reference: https://github.com/torvalds/linux/blob/master/include/linux/namei.h#L13 + if maxSymLinks > 40 { + break + } + resolvedSymlink, err := os.Readlink(resolvedPath) + if err != nil { + // End sym-link resolution loop. + break + } + if resolvedSymlink != "" { + _, resolvedSymlinkWorkdir, err := c.resolvePath(c.state.Mountpoint, resolvedSymlink) + if isPathOnVolume(c, resolvedSymlinkWorkdir) || isPathOnMount(c, resolvedSymlinkWorkdir) { + // Resolved symlink exists on external volume or mount + return true + } + if err != nil { + // Could not resolve path so end sym-link resolution loop. + break + } + if resolvedSymlinkWorkdir != "" { + resolvedPath = resolvedSymlinkWorkdir + _, err := os.Stat(resolvedSymlinkWorkdir) + if err == nil { + // Symlink resolved successfully and resolved path exists on container, + // this is a valid use-case so return nil. + logrus.Debugf("Workdir is a symlink with target to %q and resolved symlink exists on container", resolvedSymlink) + return true + } + } + } + maxSymLinks++ + } + return false +} + +// resolveWorkDir resolves the container's workdir and, depending on the +// configuration, will create it, or error out if it does not exist. +// Note that the container must be mounted before. +func (c *Container) resolveWorkDir() error { + workdir := c.WorkingDir() + + // If the specified workdir is a subdir of a volume or mount, + // we don't need to do anything. The runtime is taking care of + // that. + if isPathOnVolume(c, workdir) || isPathOnMount(c, workdir) { + logrus.Debugf("Workdir %q resolved to a volume or mount", workdir) + return nil + } + + _, resolvedWorkdir, err := c.resolvePath(c.state.Mountpoint, workdir) + if err != nil { + return err + } + logrus.Debugf("Workdir %q resolved to host path %q", workdir, resolvedWorkdir) + + st, err := os.Stat(resolvedWorkdir) + if err == nil { + if !st.IsDir() { + return fmt.Errorf("workdir %q exists on container %s, but is not a directory", workdir, c.ID()) + } + return nil + } + if !c.config.CreateWorkingDir { + // No need to create it (e.g., `--workdir=/foo`), so let's make sure + // the path exists on the container. + if err != nil { + if os.IsNotExist(err) { + // If resolved Workdir path gets marked as a valid symlink, + // return nil cause this is valid use-case. + if c.isWorkDirSymlink(resolvedWorkdir) { + return nil + } + return fmt.Errorf("workdir %q does not exist on container %s", workdir, c.ID()) + } + // This might be a serious error (e.g., permission), so + // we need to return the full error. + return fmt.Errorf("detecting workdir %q on container %s: %w", workdir, c.ID(), err) + } + return nil + } + if err := os.MkdirAll(resolvedWorkdir, 0755); err != nil { + if os.IsExist(err) { + return nil + } + return fmt.Errorf("creating container %s workdir: %w", c.ID(), err) + } + + // Ensure container entrypoint is created (if required). + uid, gid, _, err := chrootuser.GetUser(c.state.Mountpoint, c.User()) + if err != nil { + return fmt.Errorf("looking up %s inside of the container %s: %w", c.User(), c.ID(), err) + } + if err := os.Chown(resolvedWorkdir, int(uid), int(gid)); err != nil { + return fmt.Errorf("chowning container %s workdir to container root: %w", c.ID(), err) + } + + return nil +} + +func (c *Container) getUserOverrides() *lookup.Overrides { + var hasPasswdFile, hasGroupFile bool + overrides := lookup.Overrides{} + for _, m := range c.config.Spec.Mounts { + if m.Destination == "/etc/passwd" { + overrides.ContainerEtcPasswdPath = m.Source + hasPasswdFile = true + } + if m.Destination == "/etc/group" { + overrides.ContainerEtcGroupPath = m.Source + hasGroupFile = true + } + if m.Destination == "/etc" { + if !hasPasswdFile { + overrides.ContainerEtcPasswdPath = filepath.Join(m.Source, "passwd") + } + if !hasGroupFile { + overrides.ContainerEtcGroupPath = filepath.Join(m.Source, "group") + } + } + } + if path, ok := c.state.BindMounts["/etc/passwd"]; ok { + overrides.ContainerEtcPasswdPath = path + } + return &overrides +} + +func lookupHostUser(name string) (*runcuser.ExecUser, error) { + var execUser runcuser.ExecUser + // Look up User on host + u, err := util.LookupUser(name) + if err != nil { + return &execUser, err + } + uid, err := strconv.ParseUint(u.Uid, 8, 32) + if err != nil { + return &execUser, err + } + + gid, err := strconv.ParseUint(u.Gid, 8, 32) + if err != nil { + return &execUser, err + } + execUser.Uid = int(uid) + execUser.Gid = int(gid) + execUser.Home = u.HomeDir + return &execUser, nil +} + +// mountNotifySocket mounts the NOTIFY_SOCKET into the container if it's set +// and if the sdnotify mode is set to container. It also sets c.notifySocket +// to avoid redundantly looking up the env variable. +func (c *Container) mountNotifySocket(g generate.Generator) error { + if c.config.SdNotifySocket == "" { + return nil + } + if c.config.SdNotifyMode != define.SdNotifyModeContainer { + return nil + } + + notifyDir := filepath.Join(c.bundlePath(), "notify") + logrus.Debugf("Checking notify %q dir", notifyDir) + if err := os.MkdirAll(notifyDir, 0755); err != nil { + if !os.IsExist(err) { + return fmt.Errorf("unable to create notify %q dir: %w", notifyDir, err) + } + } + if err := label.Relabel(notifyDir, c.MountLabel(), true); err != nil { + return fmt.Errorf("relabel failed %q: %w", notifyDir, err) + } + logrus.Debugf("Add bindmount notify %q dir", notifyDir) + if _, ok := c.state.BindMounts["/run/notify"]; !ok { + c.state.BindMounts["/run/notify"] = notifyDir + } + + // Set the container's notify socket to the proxy socket created by conmon + g.AddProcessEnv("NOTIFY_SOCKET", "/run/notify/notify.sock") + + return nil +} + +func (c *Container) addCheckpointImageMetadata(importBuilder *buildah.Builder) error { + // Get information about host environment + hostInfo, err := c.Runtime().hostInfo() + if err != nil { + return fmt.Errorf("getting host info: %v", err) + } + + criuVersion, err := criu.GetCriuVersion() + if err != nil { + return fmt.Errorf("getting criu version: %v", err) + } + + rootfsImageID, rootfsImageName := c.Image() + + // Add image annotations with information about the container and the host. + // This information is useful to check compatibility before restoring the checkpoint + + checkpointImageAnnotations := map[string]string{ + define.CheckpointAnnotationName: c.config.Name, + define.CheckpointAnnotationRawImageName: c.config.RawImageName, + define.CheckpointAnnotationRootfsImageID: rootfsImageID, + define.CheckpointAnnotationRootfsImageName: rootfsImageName, + define.CheckpointAnnotationPodmanVersion: version.Version.String(), + define.CheckpointAnnotationCriuVersion: strconv.Itoa(criuVersion), + define.CheckpointAnnotationRuntimeName: hostInfo.OCIRuntime.Name, + define.CheckpointAnnotationRuntimeVersion: hostInfo.OCIRuntime.Version, + define.CheckpointAnnotationConmonVersion: hostInfo.Conmon.Version, + define.CheckpointAnnotationHostArch: hostInfo.Arch, + define.CheckpointAnnotationHostKernel: hostInfo.Kernel, + define.CheckpointAnnotationCgroupVersion: hostInfo.CgroupsVersion, + define.CheckpointAnnotationDistributionVersion: hostInfo.Distribution.Version, + define.CheckpointAnnotationDistributionName: hostInfo.Distribution.Distribution, + } + + for key, value := range checkpointImageAnnotations { + importBuilder.SetAnnotation(key, value) + } + + return nil +} + +func (c *Container) resolveCheckpointImageName(options *ContainerCheckpointOptions) error { + if options.CreateImage == "" { + return nil + } + + // Resolve image name + resolvedImageName, err := c.runtime.LibimageRuntime().ResolveName(options.CreateImage) + if err != nil { + return err + } + + options.CreateImage = resolvedImageName + return nil +} + +func (c *Container) createCheckpointImage(ctx context.Context, options ContainerCheckpointOptions) error { + if options.CreateImage == "" { + return nil + } + logrus.Debugf("Create checkpoint image %s", options.CreateImage) + + // Create storage reference + imageRef, err := is.Transport.ParseStoreReference(c.runtime.store, options.CreateImage) + if err != nil { + return errors.New("failed to parse image name") + } + + // Build an image scratch + builderOptions := buildah.BuilderOptions{ + FromImage: "scratch", + } + importBuilder, err := buildah.NewBuilder(ctx, c.runtime.store, builderOptions) + if err != nil { + return err + } + // Clean up buildah working container + defer func() { + if err := importBuilder.Delete(); err != nil { + logrus.Errorf("Image builder delete failed: %v", err) + } + }() + + if err := c.prepareCheckpointExport(); err != nil { + return err + } + + // Export checkpoint into temporary tar file + tmpDir, err := ioutil.TempDir("", "checkpoint_image_") + if err != nil { + return err + } + defer os.RemoveAll(tmpDir) + + options.TargetFile = path.Join(tmpDir, "checkpoint.tar") + + if err := c.exportCheckpoint(options); err != nil { + return err + } + + // Copy checkpoint from temporary tar file in the image + addAndCopyOptions := buildah.AddAndCopyOptions{} + if err := importBuilder.Add("", true, addAndCopyOptions, options.TargetFile); err != nil { + return err + } + + if err := c.addCheckpointImageMetadata(importBuilder); err != nil { + return err + } + + commitOptions := buildah.CommitOptions{ + Squash: true, + SystemContext: c.runtime.imageContext, + } + + // Create checkpoint image + id, _, _, err := importBuilder.Commit(ctx, imageRef, commitOptions) + if err != nil { + return err + } + logrus.Debugf("Created checkpoint image: %s", id) + return nil +} + +func (c *Container) exportCheckpoint(options ContainerCheckpointOptions) error { + if len(c.Dependencies()) == 1 { + // Check if the dependency is an infra container. If it is we can checkpoint + // the container out of the Pod. + if c.config.Pod == "" { + return errors.New("cannot export checkpoints of containers with dependencies") + } + + pod, err := c.runtime.state.Pod(c.config.Pod) + if err != nil { + return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), c.config.Pod, err) + } + infraID, err := pod.InfraContainerID() + if err != nil { + return fmt.Errorf("cannot retrieve infra container ID for pod %s: %w", c.config.Pod, err) + } + if c.Dependencies()[0] != infraID { + return errors.New("cannot export checkpoints of containers with dependencies") + } + } + if len(c.Dependencies()) > 1 { + return errors.New("cannot export checkpoints of containers with dependencies") + } + logrus.Debugf("Exporting checkpoint image of container %q to %q", c.ID(), options.TargetFile) + + includeFiles := []string{ + "artifacts", + metadata.DevShmCheckpointTar, + metadata.ConfigDumpFile, + metadata.SpecDumpFile, + metadata.NetworkStatusFile, + stats.StatsDump, + } + + if c.LogDriver() == define.KubernetesLogging || + c.LogDriver() == define.JSONLogging { + includeFiles = append(includeFiles, "ctr.log") + } + if options.PreCheckPoint { + includeFiles = append(includeFiles, preCheckpointDir) + } else { + includeFiles = append(includeFiles, metadata.CheckpointDirectory) + } + // Get root file-system changes included in the checkpoint archive + var addToTarFiles []string + if !options.IgnoreRootfs { + // To correctly track deleted files, let's go through the output of 'podman diff' + rootFsChanges, err := c.runtime.GetDiff("", c.ID(), define.DiffContainer) + if err != nil { + return fmt.Errorf("exporting root file-system diff for %q: %w", c.ID(), err) + } + + addToTarFiles, err := crutils.CRCreateRootFsDiffTar(&rootFsChanges, c.state.Mountpoint, c.bundlePath()) + if err != nil { + return err + } + + includeFiles = append(includeFiles, addToTarFiles...) + } + + // Folder containing archived volumes that will be included in the export + expVolDir := filepath.Join(c.bundlePath(), metadata.CheckpointVolumesDirectory) + + // Create an archive for each volume associated with the container + if !options.IgnoreVolumes { + if err := os.MkdirAll(expVolDir, 0700); err != nil { + return fmt.Errorf("creating volumes export directory %q: %w", expVolDir, err) + } + + for _, v := range c.config.NamedVolumes { + volumeTarFilePath := filepath.Join(metadata.CheckpointVolumesDirectory, v.Name+".tar") + volumeTarFileFullPath := filepath.Join(c.bundlePath(), volumeTarFilePath) + + volumeTarFile, err := os.Create(volumeTarFileFullPath) + if err != nil { + return fmt.Errorf("creating %q: %w", volumeTarFileFullPath, err) + } + + volume, err := c.runtime.GetVolume(v.Name) + if err != nil { + return err + } + + mp, err := volume.MountPoint() + if err != nil { + return err + } + if mp == "" { + return fmt.Errorf("volume %s is not mounted, cannot export: %w", volume.Name(), define.ErrInternal) + } + + input, err := archive.TarWithOptions(mp, &archive.TarOptions{ + Compression: archive.Uncompressed, + IncludeSourceDir: true, + }) + if err != nil { + return fmt.Errorf("reading volume directory %q: %w", v.Dest, err) + } + + _, err = io.Copy(volumeTarFile, input) + if err != nil { + return err + } + volumeTarFile.Close() + + includeFiles = append(includeFiles, volumeTarFilePath) + } + } + + input, err := archive.TarWithOptions(c.bundlePath(), &archive.TarOptions{ + Compression: options.Compression, + IncludeSourceDir: true, + IncludeFiles: includeFiles, + }) + + if err != nil { + return fmt.Errorf("reading checkpoint directory %q: %w", c.ID(), err) + } + + outFile, err := os.Create(options.TargetFile) + if err != nil { + return fmt.Errorf("creating checkpoint export file %q: %w", options.TargetFile, err) + } + defer outFile.Close() + + if err := os.Chmod(options.TargetFile, 0600); err != nil { + return err + } + + _, err = io.Copy(outFile, input) + if err != nil { + return err + } + + for _, file := range addToTarFiles { + os.Remove(filepath.Join(c.bundlePath(), file)) + } + + if !options.IgnoreVolumes { + os.RemoveAll(expVolDir) + } + + return nil +} + +func (c *Container) checkpointRestoreSupported(version int) error { + if !criu.CheckForCriu(version) { + return fmt.Errorf("checkpoint/restore requires at least CRIU %d", version) + } + if !c.ociRuntime.SupportsCheckpoint() { + return errors.New("configured runtime does not support checkpoint/restore") + } + return nil +} + +func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointOptions) (*define.CRIUCheckpointRestoreStatistics, int64, error) { + if err := c.checkpointRestoreSupported(criu.MinCriuVersion); err != nil { + return nil, 0, err + } + + if c.state.State != define.ContainerStateRunning { + return nil, 0, fmt.Errorf("%q is not running, cannot checkpoint: %w", c.state.State, define.ErrCtrStateInvalid) + } + + if c.AutoRemove() && options.TargetFile == "" { + return nil, 0, errors.New("cannot checkpoint containers that have been started with '--rm' unless '--export' is used") + } + + if err := c.resolveCheckpointImageName(&options); err != nil { + return nil, 0, err + } + + if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "dump.log", c.MountLabel()); err != nil { + return nil, 0, err + } + + // Setting CheckpointLog early in case there is a failure. + c.state.CheckpointLog = path.Join(c.bundlePath(), "dump.log") + c.state.CheckpointPath = c.CheckpointPath() + + runtimeCheckpointDuration, err := c.ociRuntime.CheckpointContainer(c, options) + if err != nil { + return nil, 0, err + } + + // Keep the content of /dev/shm directory + if c.config.ShmDir != "" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { + shmDirTarFileFullPath := filepath.Join(c.bundlePath(), metadata.DevShmCheckpointTar) + + shmDirTarFile, err := os.Create(shmDirTarFileFullPath) + if err != nil { + return nil, 0, err + } + defer shmDirTarFile.Close() + + input, err := archive.TarWithOptions(c.config.ShmDir, &archive.TarOptions{ + Compression: archive.Uncompressed, + IncludeSourceDir: true, + }) + if err != nil { + return nil, 0, err + } + + if _, err = io.Copy(shmDirTarFile, input); err != nil { + return nil, 0, err + } + } + + // Save network.status. This is needed to restore the container with + // the same IP. Currently limited to one IP address in a container + // with one interface. + // FIXME: will this break something? + if _, err := metadata.WriteJSONFile(c.getNetworkStatus(), c.bundlePath(), metadata.NetworkStatusFile); err != nil { + return nil, 0, err + } + + defer c.newContainerEvent(events.Checkpoint) + + // There is a bug from criu: https://github.com/checkpoint-restore/criu/issues/116 + // We have to change the symbolic link from absolute path to relative path + if options.WithPrevious { + os.Remove(path.Join(c.CheckpointPath(), "parent")) + if err := os.Symlink("../pre-checkpoint", path.Join(c.CheckpointPath(), "parent")); err != nil { + return nil, 0, err + } + } + + if options.TargetFile != "" { + if err := c.exportCheckpoint(options); err != nil { + return nil, 0, err + } + } else { + if err := c.createCheckpointImage(ctx, options); err != nil { + return nil, 0, err + } + } + + logrus.Debugf("Checkpointed container %s", c.ID()) + + if !options.KeepRunning && !options.PreCheckPoint { + c.state.State = define.ContainerStateStopped + c.state.Checkpointed = true + c.state.CheckpointedTime = time.Now() + c.state.Restored = false + c.state.RestoredTime = time.Time{} + + // Clean up Storage and Network + if err := c.cleanup(ctx); err != nil { + return nil, 0, err + } + } + + criuStatistics, err := func() (*define.CRIUCheckpointRestoreStatistics, error) { + if !options.PrintStats { + return nil, nil + } + statsDirectory, err := os.Open(c.bundlePath()) + if err != nil { + return nil, fmt.Errorf("not able to open %q: %w", c.bundlePath(), err) + } + + dumpStatistics, err := stats.CriuGetDumpStats(statsDirectory) + if err != nil { + return nil, fmt.Errorf("displaying checkpointing statistics not possible: %w", err) + } + + return &define.CRIUCheckpointRestoreStatistics{ + FreezingTime: dumpStatistics.GetFreezingTime(), + FrozenTime: dumpStatistics.GetFrozenTime(), + MemdumpTime: dumpStatistics.GetMemdumpTime(), + MemwriteTime: dumpStatistics.GetMemwriteTime(), + PagesScanned: dumpStatistics.GetPagesScanned(), + PagesWritten: dumpStatistics.GetPagesWritten(), + }, nil + }() + if err != nil { + return nil, 0, err + } + + if !options.Keep && !options.PreCheckPoint { + cleanup := []string{ + "dump.log", + stats.StatsDump, + metadata.ConfigDumpFile, + metadata.SpecDumpFile, + } + for _, del := range cleanup { + file := filepath.Join(c.bundlePath(), del) + if err := os.Remove(file); err != nil { + logrus.Debugf("Unable to remove file %s", file) + } + } + // The file has been deleted. Do not mention it. + c.state.CheckpointLog = "" + } + + c.state.FinishedTime = time.Now() + return criuStatistics, runtimeCheckpointDuration, c.save() +} + +func (c *Container) generateContainerSpec() error { + // Make sure the newly created config.json exists on disk + + // NewFromSpec() is deprecated according to its comment + // however the recommended replace just causes a nil map panic + //nolint:staticcheck + g := generate.NewFromSpec(c.config.Spec) + + if err := c.saveSpec(g.Config); err != nil { + return fmt.Errorf("saving imported container specification for restore failed: %w", err) + } + + return nil +} + +func (c *Container) importCheckpointImage(ctx context.Context, imageID string) error { + img, _, err := c.Runtime().LibimageRuntime().LookupImage(imageID, nil) + if err != nil { + return err + } + + mountPoint, err := img.Mount(ctx, nil, "") + defer func() { + if err := c.unmount(true); err != nil { + logrus.Errorf("Failed to unmount container: %v", err) + } + }() + if err != nil { + return err + } + + // Import all checkpoint files except ConfigDumpFile and SpecDumpFile. We + // generate new container config files to enable to specifying a new + // container name. + checkpoint := []string{ + "artifacts", + metadata.CheckpointDirectory, + metadata.CheckpointVolumesDirectory, + metadata.DevShmCheckpointTar, + metadata.RootFsDiffTar, + metadata.DeletedFilesFile, + metadata.PodOptionsFile, + metadata.PodDumpFile, + } + + for _, name := range checkpoint { + src := filepath.Join(mountPoint, name) + dst := filepath.Join(c.bundlePath(), name) + if err := archive.NewDefaultArchiver().CopyWithTar(src, dst); err != nil { + logrus.Debugf("Can't import '%s' from checkpoint image", name) + } + } + + return c.generateContainerSpec() +} + +func (c *Container) importCheckpointTar(input string) error { + if err := crutils.CRImportCheckpointWithoutConfig(c.bundlePath(), input); err != nil { + return err + } + + return c.generateContainerSpec() +} + +func (c *Container) importPreCheckpoint(input string) error { + archiveFile, err := os.Open(input) + if err != nil { + return fmt.Errorf("failed to open pre-checkpoint archive for import: %w", err) + } + + defer archiveFile.Close() + + err = archive.Untar(archiveFile, c.bundlePath(), nil) + if err != nil { + return fmt.Errorf("unpacking of pre-checkpoint archive %s failed: %w", input, err) + } + return nil +} + +func (c *Container) restore(ctx context.Context, options ContainerCheckpointOptions) (criuStatistics *define.CRIUCheckpointRestoreStatistics, runtimeRestoreDuration int64, retErr error) { + minCriuVersion := func() int { + if options.Pod == "" { + return criu.MinCriuVersion + } + return criu.PodCriuVersion + }() + if err := c.checkpointRestoreSupported(minCriuVersion); err != nil { + return nil, 0, err + } + + if options.Pod != "" && !crutils.CRRuntimeSupportsPodCheckpointRestore(c.ociRuntime.Path()) { + return nil, 0, fmt.Errorf("runtime %s does not support pod restore", c.ociRuntime.Path()) + } + + if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateExited) { + return nil, 0, fmt.Errorf("container %s is running or paused, cannot restore: %w", c.ID(), define.ErrCtrStateInvalid) + } + + if options.ImportPrevious != "" { + if err := c.importPreCheckpoint(options.ImportPrevious); err != nil { + return nil, 0, err + } + } + + if options.TargetFile != "" { + if err := c.importCheckpointTar(options.TargetFile); err != nil { + return nil, 0, err + } + } else if options.CheckpointImageID != "" { + if err := c.importCheckpointImage(ctx, options.CheckpointImageID); err != nil { + return nil, 0, err + } + } + + // Let's try to stat() CRIU's inventory file. If it does not exist, it makes + // no sense to try a restore. This is a minimal check if a checkpoint exist. + if _, err := os.Stat(filepath.Join(c.CheckpointPath(), "inventory.img")); os.IsNotExist(err) { + return nil, 0, fmt.Errorf("a complete checkpoint for this container cannot be found, cannot restore: %w", err) + } + + if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "restore.log", c.MountLabel()); err != nil { + return nil, 0, err + } + + // Setting RestoreLog early in case there is a failure. + c.state.RestoreLog = path.Join(c.bundlePath(), "restore.log") + c.state.CheckpointPath = c.CheckpointPath() + + // Read network configuration from checkpoint + var netStatus map[string]types.StatusBlock + _, err := metadata.ReadJSONFile(&netStatus, c.bundlePath(), metadata.NetworkStatusFile) + if err != nil { + logrus.Infof("Failed to unmarshal network status, cannot restore the same ip/mac: %v", err) + } + // If the restored container should get a new name, the IP address of + // the container will not be restored. This assumes that if a new name is + // specified, the container is restored multiple times. + // TODO: This implicit restoring with or without IP depending on an + // unrelated restore parameter (--name) does not seem like the + // best solution. + if err == nil && options.Name == "" && (!options.IgnoreStaticIP || !options.IgnoreStaticMAC) { + // The file with the network.status does exist. Let's restore the + // container with the same networks settings as during checkpointing. + networkOpts, err := c.networks() + if err != nil { + return nil, 0, err + } + + netOpts := make(map[string]types.PerNetworkOptions, len(netStatus)) + for network, perNetOpts := range networkOpts { + // unset mac and ips before we start adding the ones from the status + perNetOpts.StaticMAC = nil + perNetOpts.StaticIPs = nil + for name, netInt := range netStatus[network].Interfaces { + perNetOpts.InterfaceName = name + if !options.IgnoreStaticIP { + perNetOpts.StaticMAC = netInt.MacAddress + } + if !options.IgnoreStaticIP { + for _, netAddress := range netInt.Subnets { + perNetOpts.StaticIPs = append(perNetOpts.StaticIPs, netAddress.IPNet.IP) + } + } + // Normally interfaces have a length of 1, only for some special cni configs we could get more. + // For now just use the first interface to get the ips this should be good enough for most cases. + break + } + netOpts[network] = perNetOpts + } + c.perNetworkOpts = netOpts + } + + defer func() { + if retErr != nil { + if err := c.cleanup(ctx); err != nil { + logrus.Errorf("Cleaning up container %s: %v", c.ID(), err) + } + } + }() + + if err := c.prepare(); err != nil { + return nil, 0, err + } + + // Read config + jsonPath := filepath.Join(c.bundlePath(), "config.json") + logrus.Debugf("generate.NewFromFile at %v", jsonPath) + g, err := generate.NewFromFile(jsonPath) + if err != nil { + logrus.Debugf("generate.NewFromFile failed with %v", err) + return nil, 0, err + } + + // Restoring from an import means that we are doing migration + if options.TargetFile != "" || options.CheckpointImageID != "" { + g.SetRootPath(c.state.Mountpoint) + } + + // We want to have the same network namespace as before. + if err := c.addNetworkNamespace(&g); err != nil { + return nil, 0, err + } + + if options.Pod != "" { + // Running in a Pod means that we have to change all namespace settings to + // the ones from the infrastructure container. + pod, err := c.runtime.LookupPod(options.Pod) + if err != nil { + return nil, 0, fmt.Errorf("pod %q cannot be retrieved: %w", options.Pod, err) + } + + infraContainer, err := pod.InfraContainer() + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieved infra container from pod %q: %w", options.Pod, err) + } + + infraContainer.lock.Lock() + if err := infraContainer.syncContainer(); err != nil { + infraContainer.lock.Unlock() + return nil, 0, fmt.Errorf("syncing infrastructure container %s status: %w", infraContainer.ID(), err) + } + if infraContainer.state.State != define.ContainerStateRunning { + if err := infraContainer.initAndStart(ctx); err != nil { + infraContainer.lock.Unlock() + return nil, 0, fmt.Errorf("starting infrastructure container %s status: %w", infraContainer.ID(), err) + } + } + infraContainer.lock.Unlock() + + if c.config.IPCNsCtr != "" { + nsPath, err := infraContainer.namespacePath(IPCNS) + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieve IPC namespace path for Pod %q: %w", options.Pod, err) + } + if err := g.AddOrReplaceLinuxNamespace(string(spec.IPCNamespace), nsPath); err != nil { + return nil, 0, err + } + } + + if c.config.NetNsCtr != "" { + nsPath, err := infraContainer.namespacePath(NetNS) + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieve network namespace path for Pod %q: %w", options.Pod, err) + } + if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), nsPath); err != nil { + return nil, 0, err + } + } + + if c.config.PIDNsCtr != "" { + nsPath, err := infraContainer.namespacePath(PIDNS) + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieve PID namespace path for Pod %q: %w", options.Pod, err) + } + if err := g.AddOrReplaceLinuxNamespace(string(spec.PIDNamespace), nsPath); err != nil { + return nil, 0, err + } + } + + if c.config.UTSNsCtr != "" { + nsPath, err := infraContainer.namespacePath(UTSNS) + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieve UTS namespace path for Pod %q: %w", options.Pod, err) + } + if err := g.AddOrReplaceLinuxNamespace(string(spec.UTSNamespace), nsPath); err != nil { + return nil, 0, err + } + } + + if c.config.CgroupNsCtr != "" { + nsPath, err := infraContainer.namespacePath(CgroupNS) + if err != nil { + return nil, 0, fmt.Errorf("cannot retrieve Cgroup namespace path for Pod %q: %w", options.Pod, err) + } + if err := g.AddOrReplaceLinuxNamespace(string(spec.CgroupNamespace), nsPath); err != nil { + return nil, 0, err + } + } + } + + if err := c.makeBindMounts(); err != nil { + return nil, 0, err + } + + if options.TargetFile != "" || options.CheckpointImageID != "" { + for dstPath, srcPath := range c.state.BindMounts { + newMount := spec.Mount{ + Type: "bind", + Source: srcPath, + Destination: dstPath, + Options: []string{"bind", "private"}, + } + if c.IsReadOnly() && dstPath != "/dev/shm" { + newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") + } + if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { + newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") + } + if !MountExists(g.Mounts(), dstPath) { + g.AddMount(newMount) + } + } + } + + // Restore /dev/shm content + if c.config.ShmDir != "" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { + shmDirTarFileFullPath := filepath.Join(c.bundlePath(), metadata.DevShmCheckpointTar) + if _, err := os.Stat(shmDirTarFileFullPath); err != nil { + logrus.Debug("Container checkpoint doesn't contain dev/shm: ", err.Error()) + } else { + shmDirTarFile, err := os.Open(shmDirTarFileFullPath) + if err != nil { + return nil, 0, err + } + defer shmDirTarFile.Close() + + if err := archive.UntarUncompressed(shmDirTarFile, c.config.ShmDir, nil); err != nil { + return nil, 0, err + } + } + } + + // Cleanup for a working restore. + if err := c.removeConmonFiles(); err != nil { + return nil, 0, err + } + + // Save the OCI spec to disk + if err := c.saveSpec(g.Config); err != nil { + return nil, 0, err + } + + // When restoring from an imported archive, allow restoring the content of volumes. + // Volumes are created in setupContainer() + if !options.IgnoreVolumes && (options.TargetFile != "" || options.CheckpointImageID != "") { + for _, v := range c.config.NamedVolumes { + volumeFilePath := filepath.Join(c.bundlePath(), metadata.CheckpointVolumesDirectory, v.Name+".tar") + + volumeFile, err := os.Open(volumeFilePath) + if err != nil { + return nil, 0, fmt.Errorf("failed to open volume file %s: %w", volumeFilePath, err) + } + defer volumeFile.Close() + + volume, err := c.runtime.GetVolume(v.Name) + if err != nil { + return nil, 0, fmt.Errorf("failed to retrieve volume %s: %w", v.Name, err) + } + + mountPoint, err := volume.MountPoint() + if err != nil { + return nil, 0, err + } + if mountPoint == "" { + return nil, 0, fmt.Errorf("unable to import volume %s as it is not mounted: %w", volume.Name(), err) + } + if err := archive.UntarUncompressed(volumeFile, mountPoint, nil); err != nil { + return nil, 0, fmt.Errorf("failed to extract volume %s to %s: %w", volumeFilePath, mountPoint, err) + } + } + } + + // Before actually restarting the container, apply the root file-system changes + if !options.IgnoreRootfs { + if err := crutils.CRApplyRootFsDiffTar(c.bundlePath(), c.state.Mountpoint); err != nil { + return nil, 0, err + } + + if err := crutils.CRRemoveDeletedFiles(c.ID(), c.bundlePath(), c.state.Mountpoint); err != nil { + return nil, 0, err + } + } + + runtimeRestoreDuration, err = c.ociRuntime.CreateContainer(c, &options) + if err != nil { + return nil, 0, err + } + + criuStatistics, err = func() (*define.CRIUCheckpointRestoreStatistics, error) { + if !options.PrintStats { + return nil, nil + } + statsDirectory, err := os.Open(c.bundlePath()) + if err != nil { + return nil, fmt.Errorf("not able to open %q: %w", c.bundlePath(), err) + } + + restoreStatistics, err := stats.CriuGetRestoreStats(statsDirectory) + if err != nil { + return nil, fmt.Errorf("displaying restore statistics not possible: %w", err) + } + + return &define.CRIUCheckpointRestoreStatistics{ + PagesCompared: restoreStatistics.GetPagesCompared(), + PagesSkippedCow: restoreStatistics.GetPagesSkippedCow(), + ForkingTime: restoreStatistics.GetForkingTime(), + RestoreTime: restoreStatistics.GetRestoreTime(), + PagesRestored: restoreStatistics.GetPagesRestored(), + }, nil + }() + if err != nil { + return nil, 0, err + } + + logrus.Debugf("Restored container %s", c.ID()) + + c.state.State = define.ContainerStateRunning + c.state.Checkpointed = false + c.state.Restored = true + c.state.CheckpointedTime = time.Time{} + c.state.RestoredTime = time.Now() + + if !options.Keep { + // Delete all checkpoint related files. At this point, in theory, all files + // should exist. Still ignoring errors for now as the container should be + // restored and running. Not erroring out just because some cleanup operation + // failed. Starting with the checkpoint directory + err = os.RemoveAll(c.CheckpointPath()) + if err != nil { + logrus.Debugf("Non-fatal: removal of checkpoint directory (%s) failed: %v", c.CheckpointPath(), err) + } + c.state.CheckpointPath = "" + err = os.RemoveAll(c.PreCheckPointPath()) + if err != nil { + logrus.Debugf("Non-fatal: removal of pre-checkpoint directory (%s) failed: %v", c.PreCheckPointPath(), err) + } + err = os.RemoveAll(c.CheckpointVolumesPath()) + if err != nil { + logrus.Debugf("Non-fatal: removal of checkpoint volumes directory (%s) failed: %v", c.CheckpointVolumesPath(), err) + } + cleanup := [...]string{ + "restore.log", + "dump.log", + stats.StatsDump, + stats.StatsRestore, + metadata.DevShmCheckpointTar, + metadata.NetworkStatusFile, + metadata.RootFsDiffTar, + metadata.DeletedFilesFile, + } + for _, del := range cleanup { + file := filepath.Join(c.bundlePath(), del) + err = os.Remove(file) + if err != nil { + logrus.Debugf("Non-fatal: removal of checkpoint file (%s) failed: %v", file, err) + } + } + c.state.CheckpointLog = "" + c.state.RestoreLog = "" + } + + return criuStatistics, runtimeRestoreDuration, c.save() +} + +// Retrieves a container's "root" net namespace container dependency. +func (c *Container) getRootNetNsDepCtr() (depCtr *Container, err error) { + containersVisited := map[string]int{c.config.ID: 1} + nextCtr := c.config.NetNsCtr + for nextCtr != "" { + // Make sure we aren't in a loop + if _, visited := containersVisited[nextCtr]; visited { + return nil, errors.New("loop encountered while determining net namespace container") + } + containersVisited[nextCtr] = 1 + + depCtr, err = c.runtime.state.Container(nextCtr) + if err != nil { + return nil, fmt.Errorf("fetching dependency %s of container %s: %w", c.config.NetNsCtr, c.ID(), err) + } + // This should never happen without an error + if depCtr == nil { + break + } + nextCtr = depCtr.config.NetNsCtr + } + + if depCtr == nil { + return nil, errors.New("unexpected error depCtr is nil without reported error from runtime state") + } + return depCtr, nil +} + +// Ensure standard bind mounts are mounted into all root directories (including chroot directories) +func (c *Container) mountIntoRootDirs(mountName string, mountPath string) error { + c.state.BindMounts[mountName] = mountPath + + for _, chrootDir := range c.config.ChrootDirs { + c.state.BindMounts[filepath.Join(chrootDir, mountName)] = mountPath + } + + return nil +} + +// Make standard bind mounts to include in the container +func (c *Container) makeBindMounts() error { + if err := os.Chown(c.state.RunDir, c.RootUID(), c.RootGID()); err != nil { + return fmt.Errorf("cannot chown run directory: %w", err) + } + + if c.state.BindMounts == nil { + c.state.BindMounts = make(map[string]string) + } + netDisabled, err := c.NetworkDisabled() + if err != nil { + return err + } + + if !netDisabled { + // If /etc/resolv.conf and /etc/hosts exist, delete them so we + // will recreate. Only do this if we aren't sharing them with + // another container. + if c.config.NetNsCtr == "" { + if resolvePath, ok := c.state.BindMounts["/etc/resolv.conf"]; ok { + if err := os.Remove(resolvePath); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("container %s: %w", c.ID(), err) + } + delete(c.state.BindMounts, "/etc/resolv.conf") + } + if hostsPath, ok := c.state.BindMounts["/etc/hosts"]; ok { + if err := os.Remove(hostsPath); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("container %s: %w", c.ID(), err) + } + delete(c.state.BindMounts, "/etc/hosts") + } + } + + if c.config.NetNsCtr != "" && (!c.config.UseImageResolvConf || !c.config.UseImageHosts) { + // We share a net namespace. + // We want /etc/resolv.conf and /etc/hosts from the + // other container. Unless we're not creating both of + // them. + depCtr, err := c.getRootNetNsDepCtr() + if err != nil { + return fmt.Errorf("fetching network namespace dependency container for container %s: %w", c.ID(), err) + } + + // We need that container's bind mounts + bindMounts, err := depCtr.BindMounts() + if err != nil { + return fmt.Errorf("fetching bind mounts from dependency %s of container %s: %w", depCtr.ID(), c.ID(), err) + } + + // The other container may not have a resolv.conf or /etc/hosts + // If it doesn't, don't copy them + resolvPath, exists := bindMounts["/etc/resolv.conf"] + if !c.config.UseImageResolvConf && exists { + err := c.mountIntoRootDirs("/etc/resolv.conf", resolvPath) + + if err != nil { + return fmt.Errorf("assigning mounts to container %s: %w", c.ID(), err) + } + } + + // check if dependency container has an /etc/hosts file. + // It may not have one, so only use it if it does. + hostsPath, exists := bindMounts[config.DefaultHostsFile] + if !c.config.UseImageHosts && exists { + // we cannot use the dependency container lock due ABBA deadlocks in cleanup() + lock, err := lockfile.GetLockfile(hostsPath) + if err != nil { + return fmt.Errorf("failed to lock hosts file: %w", err) + } + lock.Lock() + + // add the newly added container to the hosts file + // we always use 127.0.0.1 as ip since they have the same netns + err = etchosts.Add(hostsPath, getLocalhostHostEntry(c)) + lock.Unlock() + if err != nil { + return fmt.Errorf("creating hosts file for container %s which depends on container %s: %w", c.ID(), depCtr.ID(), err) + } + + // finally, save it in the new container + err = c.mountIntoRootDirs(config.DefaultHostsFile, hostsPath) + if err != nil { + return fmt.Errorf("assigning mounts to container %s: %w", c.ID(), err) + } + } + + if !hasCurrentUserMapped(c) { + if err := makeAccessible(resolvPath, c.RootUID(), c.RootGID()); err != nil { + return err + } + if err := makeAccessible(hostsPath, c.RootUID(), c.RootGID()); err != nil { + return err + } + } + } else { + if !c.config.UseImageResolvConf { + if err := c.generateResolvConf(); err != nil { + return fmt.Errorf("creating resolv.conf for container %s: %w", c.ID(), err) + } + } + + if !c.config.UseImageHosts { + if err := c.createHosts(); err != nil { + return fmt.Errorf("creating hosts file for container %s: %w", c.ID(), err) + } + } + } + + if c.state.BindMounts["/etc/hosts"] != "" { + if err := c.relabel(c.state.BindMounts["/etc/hosts"], c.config.MountLabel, true); err != nil { + return err + } + } + + if c.state.BindMounts["/etc/resolv.conf"] != "" { + if err := c.relabel(c.state.BindMounts["/etc/resolv.conf"], c.config.MountLabel, true); err != nil { + return err + } + } + } else if !c.config.UseImageHosts && c.state.BindMounts["/etc/hosts"] == "" { + if err := c.createHosts(); err != nil { + return fmt.Errorf("creating hosts file for container %s: %w", c.ID(), err) + } + } + + if c.config.ShmDir != "" { + // If ShmDir has a value SHM is always added when we mount the container + c.state.BindMounts["/dev/shm"] = c.config.ShmDir + } + + if c.config.Passwd == nil || *c.config.Passwd { + newPasswd, newGroup, err := c.generatePasswdAndGroup() + if err != nil { + return fmt.Errorf("creating temporary passwd file for container %s: %w", c.ID(), err) + } + if newPasswd != "" { + // Make /etc/passwd + // If it already exists, delete so we can recreate + delete(c.state.BindMounts, "/etc/passwd") + c.state.BindMounts["/etc/passwd"] = newPasswd + } + if newGroup != "" { + // Make /etc/group + // If it already exists, delete so we can recreate + delete(c.state.BindMounts, "/etc/group") + c.state.BindMounts["/etc/group"] = newGroup + } + } + + // Make /etc/localtime + ctrTimezone := c.Timezone() + if ctrTimezone != "" { + // validate the format of the timezone specified if it's not "local" + if ctrTimezone != "local" { + _, err = time.LoadLocation(ctrTimezone) + if err != nil { + return fmt.Errorf("finding timezone for container %s: %w", c.ID(), err) + } + } + if _, ok := c.state.BindMounts["/etc/localtime"]; !ok { + var zonePath string + if ctrTimezone == "local" { + zonePath, err = filepath.EvalSymlinks("/etc/localtime") + if err != nil { + return fmt.Errorf("finding local timezone for container %s: %w", c.ID(), err) + } + } else { + zone := filepath.Join("/usr/share/zoneinfo", ctrTimezone) + zonePath, err = filepath.EvalSymlinks(zone) + if err != nil { + return fmt.Errorf("setting timezone for container %s: %w", c.ID(), err) + } + } + localtimePath, err := c.copyTimezoneFile(zonePath) + if err != nil { + return fmt.Errorf("setting timezone for container %s: %w", c.ID(), err) + } + c.state.BindMounts["/etc/localtime"] = localtimePath + } + } + + _, hasRunContainerenv := c.state.BindMounts["/run/.containerenv"] + if !hasRunContainerenv { + // check in the spec mounts + for _, m := range c.config.Spec.Mounts { + if m.Destination == "/run/.containerenv" || m.Destination == "/run" { + hasRunContainerenv = true + break + } + } + } + + // Make .containerenv if it does not exist + if !hasRunContainerenv { + containerenv := c.runtime.graphRootMountedFlag(c.config.Spec.Mounts) + isRootless := 0 + if rootless.IsRootless() { + isRootless = 1 + } + imageID, imageName := c.Image() + + if c.Privileged() { + // Populate the .containerenv with container information + containerenv = fmt.Sprintf(`engine="podman-%s" +name=%q +id=%q +image=%q +imageid=%q +rootless=%d +%s`, version.Version.String(), c.Name(), c.ID(), imageName, imageID, isRootless, containerenv) + } + containerenvPath, err := c.writeStringToRundir(".containerenv", containerenv) + if err != nil { + return fmt.Errorf("creating containerenv file for container %s: %w", c.ID(), err) + } + c.state.BindMounts["/run/.containerenv"] = containerenvPath + } + + // Add Subscription Mounts + subscriptionMounts := subscriptions.MountsWithUIDGID(c.config.MountLabel, c.state.RunDir, c.runtime.config.Containers.DefaultMountsFile, c.state.Mountpoint, c.RootUID(), c.RootGID(), rootless.IsRootless(), false) + for _, mount := range subscriptionMounts { + if _, ok := c.state.BindMounts[mount.Destination]; !ok { + c.state.BindMounts[mount.Destination] = mount.Source + } + } + + // Secrets are mounted by getting the secret data from the secrets manager, + // copying the data into the container's static dir, + // then mounting the copied dir into /run/secrets. + // The secrets mounting must come after subscription mounts, since subscription mounts + // creates the /run/secrets dir in the container where we mount as well. + if len(c.Secrets()) > 0 { + // create /run/secrets if subscriptions did not create + if err := c.createSecretMountDir(); err != nil { + return fmt.Errorf("creating secrets mount: %w", err) + } + for _, secret := range c.Secrets() { + secretFileName := secret.Name + base := "/run/secrets" + if secret.Target != "" { + secretFileName = secret.Target + // If absolute path for target given remove base. + if filepath.IsAbs(secretFileName) { + base = "" + } + } + src := filepath.Join(c.config.SecretsPath, secret.Name) + dest := filepath.Join(base, secretFileName) + c.state.BindMounts[dest] = src + } + } + + return c.makePlatformBindMounts() +} + +// generateResolvConf generates a containers resolv.conf +func (c *Container) generateResolvConf() error { + var ( + networkNameServers []string + networkSearchDomains []string + ) + + netStatus := c.getNetworkStatus() + for _, status := range netStatus { + if status.DNSServerIPs != nil { + for _, nsIP := range status.DNSServerIPs { + networkNameServers = append(networkNameServers, nsIP.String()) + } + logrus.Debugf("Adding nameserver(s) from network status of '%q'", status.DNSServerIPs) + } + if status.DNSSearchDomains != nil { + networkSearchDomains = append(networkSearchDomains, status.DNSSearchDomains...) + logrus.Debugf("Adding search domain(s) from network status of '%q'", status.DNSSearchDomains) + } + } + + ipv6, err := c.checkForIPv6(netStatus) + if err != nil { + return err + } + + nameservers := make([]string, 0, len(c.runtime.config.Containers.DNSServers)+len(c.config.DNSServer)) + nameservers = append(nameservers, c.runtime.config.Containers.DNSServers...) + for _, ip := range c.config.DNSServer { + nameservers = append(nameservers, ip.String()) + } + // If the user provided dns, it trumps all; then dns masq; then resolv.conf + var search []string + keepHostServers := false + if len(nameservers) == 0 { + keepHostServers = true + // first add the nameservers from the networks status + nameservers = networkNameServers + // when we add network dns server we also have to add the search domains + search = networkSearchDomains + // slirp4netns has a built in DNS forwarder. + nameservers = c.addSlirp4netnsDNS(nameservers) + } + + if len(c.config.DNSSearch) > 0 || len(c.runtime.config.Containers.DNSSearches) > 0 { + customSearch := make([]string, 0, len(c.config.DNSSearch)+len(c.runtime.config.Containers.DNSSearches)) + customSearch = append(customSearch, c.runtime.config.Containers.DNSSearches...) + customSearch = append(customSearch, c.config.DNSSearch...) + search = customSearch + } + + options := make([]string, 0, len(c.config.DNSOption)+len(c.runtime.config.Containers.DNSOptions)) + options = append(options, c.runtime.config.Containers.DNSOptions...) + options = append(options, c.config.DNSOption...) + + destPath := filepath.Join(c.state.RunDir, "resolv.conf") + + var namespaces []spec.LinuxNamespace + if c.config.Spec.Linux != nil { + namespaces = c.config.Spec.Linux.Namespaces + } + + if err := resolvconf.New(&resolvconf.Params{ + IPv6Enabled: ipv6, + KeepHostServers: keepHostServers, + Nameservers: nameservers, + Namespaces: namespaces, + Options: options, + Path: destPath, + Searches: search, + }); err != nil { + return fmt.Errorf("building resolv.conf for container %s: %w", c.ID(), err) + } + + return c.bindMountRootFile(destPath, resolvconf.DefaultResolvConf) +} + +// Check if a container uses IPv6. +func (c *Container) checkForIPv6(netStatus map[string]types.StatusBlock) (bool, error) { + for _, status := range netStatus { + for _, netInt := range status.Interfaces { + for _, netAddress := range netInt.Subnets { + // Note: only using To16() does not work since it also returns a valid ip for ipv4 + if netAddress.IPNet.IP.To4() == nil && netAddress.IPNet.IP.To16() != nil { + return true, nil + } + } + } + } + + return c.isSlirp4netnsIPv6() +} + +// Add a new nameserver to the container's resolv.conf, ensuring that it is the +// first nameserver present. +// Usable only with running containers. +func (c *Container) addNameserver(ips []string) error { + // Take no action if container is not running. + if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { + return nil + } + + // Do we have a resolv.conf at all? + path, ok := c.state.BindMounts[resolvconf.DefaultResolvConf] + if !ok { + return nil + } + + if err := resolvconf.Add(path, ips); err != nil { + return fmt.Errorf("adding new nameserver to container %s resolv.conf: %w", c.ID(), err) + } + + return nil +} + +// Remove an entry from the existing resolv.conf of the container. +// Usable only with running containers. +func (c *Container) removeNameserver(ips []string) error { + // Take no action if container is not running. + if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { + return nil + } + + // Do we have a resolv.conf at all? + path, ok := c.state.BindMounts[resolvconf.DefaultResolvConf] + if !ok { + return nil + } + + if err := resolvconf.Remove(path, ips); err != nil { + return fmt.Errorf("removing nameservers from container %s resolv.conf: %w", c.ID(), err) + } + + return nil +} + +func getLocalhostHostEntry(c *Container) etchosts.HostEntries { + return etchosts.HostEntries{{IP: "127.0.0.1", Names: []string{c.Hostname(), c.config.Name}}} +} + +// getHostsEntries returns the container ip host entries for the correct netmode +func (c *Container) getHostsEntries() (etchosts.HostEntries, error) { + var entries etchosts.HostEntries + names := []string{c.Hostname(), c.config.Name} + switch { + case c.config.NetMode.IsBridge(): + entries = etchosts.GetNetworkHostEntries(c.state.NetworkStatus, names...) + case c.config.NetMode.IsSlirp4netns(): + ip, err := GetSlirp4netnsIP(c.slirp4netnsSubnet) + if err != nil { + return nil, err + } + entries = etchosts.HostEntries{{IP: ip.String(), Names: names}} + default: + if c.hasNetNone() { + entries = etchosts.HostEntries{{IP: "127.0.0.1", Names: names}} + } + } + return entries, nil +} + +func (c *Container) createHosts() error { + var containerIPsEntries etchosts.HostEntries + var err error + // if we configure the netns after the container create we should not add + // the hosts here since we have no information about the actual ips + // instead we will add them in c.completeNetworkSetup() + if !c.config.PostConfigureNetNS { + containerIPsEntries, err = c.getHostsEntries() + if err != nil { + return fmt.Errorf("failed to get container ip host entries: %w", err) + } + } + baseHostFile, err := etchosts.GetBaseHostFile(c.runtime.config.Containers.BaseHostsFile, c.state.Mountpoint) + if err != nil { + return err + } + + targetFile := filepath.Join(c.state.RunDir, "hosts") + err = etchosts.New(&etchosts.Params{ + BaseFile: baseHostFile, + ExtraHosts: c.config.HostAdd, + ContainerIPs: containerIPsEntries, + HostContainersInternalIP: etchosts.GetHostContainersInternalIP(c.runtime.config, c.state.NetworkStatus, c.runtime.network), + TargetFile: targetFile, + }) + if err != nil { + return err + } + + return c.bindMountRootFile(targetFile, config.DefaultHostsFile) +} + +// bindMountRootFile will chown and relabel the source file to make it usable in the container. +// It will also add the path to the container bind mount map. +// source is the path on the host, dest is the path in the container. +func (c *Container) bindMountRootFile(source, dest string) error { + if err := os.Chown(source, c.RootUID(), c.RootGID()); err != nil { + return err + } + if err := label.Relabel(source, c.MountLabel(), false); err != nil { + return err + } + + return c.mountIntoRootDirs(dest, source) +} + +// generateGroupEntry generates an entry or entries into /etc/group as +// required by container configuration. +// Generally speaking, we will make an entry under two circumstances: +// 1. The container is started as a specific user:group, and that group is both +// numeric, and does not already exist in /etc/group. +// 2. It is requested that Libpod add the group that launched Podman to +// /etc/group via AddCurrentUserPasswdEntry (though this does not trigger if +// the group in question already exists in /etc/passwd). +// +// Returns group entry (as a string that can be appended to /etc/group) and any +// error that occurred. +func (c *Container) generateGroupEntry() (string, error) { + groupString := "" + + // Things we *can't* handle: adding the user we added in + // generatePasswdEntry to any *existing* groups. + addedGID := 0 + if c.config.AddCurrentUserPasswdEntry { + entry, gid, err := c.generateCurrentUserGroupEntry() + if err != nil { + return "", err + } + groupString += entry + addedGID = gid + } + if c.config.User != "" { + entry, err := c.generateUserGroupEntry(addedGID) + if err != nil { + return "", err + } + groupString += entry + } + + return groupString, nil +} + +// Make an entry in /etc/group for the group of the user running podman iff we +// are rootless. +func (c *Container) generateCurrentUserGroupEntry() (string, int, error) { + gid := rootless.GetRootlessGID() + if gid == 0 { + return "", 0, nil + } + + g, err := user.LookupGroupId(strconv.Itoa(gid)) + if err != nil { + return "", 0, fmt.Errorf("failed to get current group: %w", err) + } + + // Look up group name to see if it exists in the image. + _, err = lookup.GetGroup(c.state.Mountpoint, g.Name) + if err != runcuser.ErrNoGroupEntries { + return "", 0, err + } + + // Look up GID to see if it exists in the image. + _, err = lookup.GetGroup(c.state.Mountpoint, g.Gid) + if err != runcuser.ErrNoGroupEntries { + return "", 0, err + } + + // We need to get the username of the rootless user so we can add it to + // the group. + username := "" + uid := rootless.GetRootlessUID() + if uid != 0 { + u, err := user.LookupId(strconv.Itoa(uid)) + if err != nil { + return "", 0, fmt.Errorf("failed to get current user to make group entry: %w", err) + } + username = u.Username + } + + // Make the entry. + return fmt.Sprintf("%s:x:%s:%s\n", g.Name, g.Gid, username), gid, nil +} + +// Make an entry in /etc/group for the group the container was specified to run +// as. +func (c *Container) generateUserGroupEntry(addedGID int) (string, error) { + if c.config.User == "" { + return "", nil + } + + splitUser := strings.SplitN(c.config.User, ":", 2) + group := splitUser[0] + if len(splitUser) > 1 { + group = splitUser[1] + } + + gid, err := strconv.ParseUint(group, 10, 32) + if err != nil { + return "", nil //nolint: nilerr + } + + if addedGID != 0 && addedGID == int(gid) { + return "", nil + } + + // Check if the group already exists + _, err = lookup.GetGroup(c.state.Mountpoint, group) + if err != runcuser.ErrNoGroupEntries { + return "", err + } + + return fmt.Sprintf("%d:x:%d:%s\n", gid, gid, splitUser[0]), nil +} + +// generatePasswdEntry generates an entry or entries into /etc/passwd as +// required by container configuration. +// Generally speaking, we will make an entry under two circumstances: +// 1. The container is started as a specific user who is not in /etc/passwd. +// This only triggers if the user is given as a *numeric* ID. +// 2. It is requested that Libpod add the user that launched Podman to +// /etc/passwd via AddCurrentUserPasswdEntry (though this does not trigger if +// the user in question already exists in /etc/passwd) or the UID to be added +// is 0). +// 3. The user specified additional host user accounts to add the the /etc/passwd file +// +// Returns password entry (as a string that can be appended to /etc/passwd) and +// any error that occurred. +func (c *Container) generatePasswdEntry() (string, error) { + passwdString := "" + + addedUID := 0 + for _, userid := range c.config.HostUsers { + // Look up User on host + u, err := util.LookupUser(userid) + if err != nil { + return "", err + } + entry, err := c.userPasswdEntry(u) + if err != nil { + return "", err + } + passwdString += entry + } + if c.config.AddCurrentUserPasswdEntry { + entry, uid, _, err := c.generateCurrentUserPasswdEntry() + if err != nil { + return "", err + } + passwdString += entry + addedUID = uid + } + if c.config.User != "" { + entry, err := c.generateUserPasswdEntry(addedUID) + if err != nil { + return "", err + } + passwdString += entry + } + + return passwdString, nil +} + +// generateCurrentUserPasswdEntry generates an /etc/passwd entry for the user +// running the container engine. +// Returns a passwd entry for the user, and the UID and GID of the added entry. +func (c *Container) generateCurrentUserPasswdEntry() (string, int, int, error) { + uid := rootless.GetRootlessUID() + if uid == 0 { + return "", 0, 0, nil + } + + u, err := user.LookupId(strconv.Itoa(uid)) + if err != nil { + return "", 0, 0, fmt.Errorf("failed to get current user: %w", err) + } + pwd, err := c.userPasswdEntry(u) + if err != nil { + return "", 0, 0, err + } + + return pwd, uid, rootless.GetRootlessGID(), nil +} + +func (c *Container) userPasswdEntry(u *user.User) (string, error) { + // Look up the user to see if it exists in the container image. + _, err := lookup.GetUser(c.state.Mountpoint, u.Username) + if err != runcuser.ErrNoPasswdEntries { + return "", err + } + + // Look up the UID to see if it exists in the container image. + _, err = lookup.GetUser(c.state.Mountpoint, u.Uid) + if err != runcuser.ErrNoPasswdEntries { + return "", err + } + + // If the user's actual home directory exists, or was mounted in - use + // that. + homeDir := c.WorkingDir() + hDir := u.HomeDir + for hDir != "/" { + if MountExists(c.config.Spec.Mounts, hDir) { + homeDir = u.HomeDir + break + } + hDir = filepath.Dir(hDir) + } + if homeDir != u.HomeDir { + for _, hDir := range c.UserVolumes() { + if hDir == u.HomeDir { + homeDir = u.HomeDir + break + } + } + } + // Set HOME environment if not already set + hasHomeSet := false + for _, s := range c.config.Spec.Process.Env { + if strings.HasPrefix(s, "HOME=") { + hasHomeSet = true + break + } + } + if !hasHomeSet { + c.config.Spec.Process.Env = append(c.config.Spec.Process.Env, fmt.Sprintf("HOME=%s", homeDir)) + } + if c.config.PasswdEntry != "" { + return c.passwdEntry(u.Username, u.Uid, u.Gid, u.Name, homeDir), nil + } + + return fmt.Sprintf("%s:*:%s:%s:%s:%s:/bin/sh\n", u.Username, u.Uid, u.Gid, u.Name, homeDir), nil +} + +// generateUserPasswdEntry generates an /etc/passwd entry for the container user +// to run in the container. +// The UID and GID of the added entry will also be returned. +// Accepts one argument, that being any UID that has already been added to the +// passwd file by other functions; if it matches the UID we were given, we don't +// need to do anything. +func (c *Container) generateUserPasswdEntry(addedUID int) (string, error) { + var ( + groupspec string + gid int + ) + if c.config.User == "" { + return "", nil + } + splitSpec := strings.SplitN(c.config.User, ":", 2) + userspec := splitSpec[0] + if len(splitSpec) > 1 { + groupspec = splitSpec[1] + } + // If a non numeric User, then don't generate passwd + uid, err := strconv.ParseUint(userspec, 10, 32) + if err != nil { + return "", nil //nolint: nilerr + } + + if addedUID != 0 && int(uid) == addedUID { + return "", nil + } + + // Look up the user to see if it exists in the container image + _, err = lookup.GetUser(c.state.Mountpoint, userspec) + if err != runcuser.ErrNoPasswdEntries { + return "", err + } + + if groupspec != "" { + ugid, err := strconv.ParseUint(groupspec, 10, 32) + if err == nil { + gid = int(ugid) + } else { + group, err := lookup.GetGroup(c.state.Mountpoint, groupspec) + if err != nil { + return "", fmt.Errorf("unable to get gid %s from group file: %w", groupspec, err) + } + gid = group.Gid + } + } + + if c.config.PasswdEntry != "" { + entry := c.passwdEntry(fmt.Sprintf("%d", uid), fmt.Sprintf("%d", uid), fmt.Sprintf("%d", gid), "container user", c.WorkingDir()) + return entry, nil + } + + return fmt.Sprintf("%d:*:%d:%d:container user:%s:/bin/sh\n", uid, uid, gid, c.WorkingDir()), nil +} + +func (c *Container) passwdEntry(username string, uid, gid, name, homeDir string) string { + s := c.config.PasswdEntry + s = strings.ReplaceAll(s, "$USERNAME", username) + s = strings.ReplaceAll(s, "$UID", uid) + s = strings.ReplaceAll(s, "$GID", gid) + s = strings.ReplaceAll(s, "$NAME", name) + s = strings.ReplaceAll(s, "$HOME", homeDir) + return s + "\n" +} + +// generatePasswdAndGroup generates container-specific passwd and group files +// iff g.config.User is a number or we are configured to make a passwd entry for +// the current user or the user specified HostsUsers +// Returns path to file to mount at /etc/passwd, path to file to mount at +// /etc/group, and any error that occurred. If no passwd/group file were +// required, the empty string will be returned for those path (this may occur +// even if no error happened). +// This may modify the mounted container's /etc/passwd and /etc/group instead of +// making copies to bind-mount in, so we don't break useradd (it wants to make a +// copy of /etc/passwd and rename the copy to /etc/passwd, which is impossible +// with a bind mount). This is done in cases where the container is *not* +// read-only. In this case, the function will return nothing ("", "", nil). +func (c *Container) generatePasswdAndGroup() (string, string, error) { + if !c.config.AddCurrentUserPasswdEntry && c.config.User == "" && + len(c.config.HostUsers) == 0 { + return "", "", nil + } + + needPasswd := true + needGroup := true + + // First, check if there's a mount at /etc/passwd or group, we don't + // want to interfere with user mounts. + if MountExists(c.config.Spec.Mounts, "/etc/passwd") { + needPasswd = false + } + if MountExists(c.config.Spec.Mounts, "/etc/group") { + needGroup = false + } + + // Next, check if we already made the files. If we didn't, don't need to + // do anything more. + if needPasswd { + passwdPath := filepath.Join(c.config.StaticDir, "passwd") + if _, err := os.Stat(passwdPath); err == nil { + needPasswd = false + } + } + if needGroup { + groupPath := filepath.Join(c.config.StaticDir, "group") + if _, err := os.Stat(groupPath); err == nil { + needGroup = false + } + } + + // If we don't need a /etc/passwd or /etc/group at this point we can + // just return. + if !needPasswd && !needGroup { + return "", "", nil + } + + passwdPath := "" + groupPath := "" + + ro := c.IsReadOnly() + + if needPasswd { + passwdEntry, err := c.generatePasswdEntry() + if err != nil { + return "", "", err + } + + needsWrite := passwdEntry != "" + switch { + case ro && needsWrite: + logrus.Debugf("Making /etc/passwd for container %s", c.ID()) + originPasswdFile, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/passwd") + if err != nil { + return "", "", fmt.Errorf("creating path to container %s /etc/passwd: %w", c.ID(), err) + } + orig, err := ioutil.ReadFile(originPasswdFile) + if err != nil && !os.IsNotExist(err) { + return "", "", err + } + passwdFile, err := c.writeStringToStaticDir("passwd", string(orig)+passwdEntry) + if err != nil { + return "", "", fmt.Errorf("failed to create temporary passwd file: %w", err) + } + if err := os.Chmod(passwdFile, 0644); err != nil { + return "", "", err + } + passwdPath = passwdFile + case !ro && needsWrite: + logrus.Debugf("Modifying container %s /etc/passwd", c.ID()) + containerPasswd, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/passwd") + if err != nil { + return "", "", fmt.Errorf("looking up location of container %s /etc/passwd: %w", c.ID(), err) + } + + f, err := os.OpenFile(containerPasswd, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) + if err != nil { + return "", "", fmt.Errorf("container %s: %w", c.ID(), err) + } + defer f.Close() + + if _, err := f.WriteString(passwdEntry); err != nil { + return "", "", fmt.Errorf("unable to append to container %s /etc/passwd: %w", c.ID(), err) + } + default: + logrus.Debugf("Not modifying container %s /etc/passwd", c.ID()) + } + } + if needGroup { + groupEntry, err := c.generateGroupEntry() + if err != nil { + return "", "", err + } + + needsWrite := groupEntry != "" + switch { + case ro && needsWrite: + logrus.Debugf("Making /etc/group for container %s", c.ID()) + originGroupFile, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/group") + if err != nil { + return "", "", fmt.Errorf("creating path to container %s /etc/group: %w", c.ID(), err) + } + orig, err := ioutil.ReadFile(originGroupFile) + if err != nil && !os.IsNotExist(err) { + return "", "", err + } + groupFile, err := c.writeStringToStaticDir("group", string(orig)+groupEntry) + if err != nil { + return "", "", fmt.Errorf("failed to create temporary group file: %w", err) + } + if err := os.Chmod(groupFile, 0644); err != nil { + return "", "", err + } + groupPath = groupFile + case !ro && needsWrite: + logrus.Debugf("Modifying container %s /etc/group", c.ID()) + containerGroup, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/group") + if err != nil { + return "", "", fmt.Errorf("looking up location of container %s /etc/group: %w", c.ID(), err) + } + + f, err := os.OpenFile(containerGroup, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) + if err != nil { + return "", "", fmt.Errorf("container %s: %w", c.ID(), err) + } + defer f.Close() + + if _, err := f.WriteString(groupEntry); err != nil { + return "", "", fmt.Errorf("unable to append to container %s /etc/group: %w", c.ID(), err) + } + default: + logrus.Debugf("Not modifying container %s /etc/group", c.ID()) + } + } + + return passwdPath, groupPath, nil +} + +func (c *Container) copyTimezoneFile(zonePath string) (string, error) { + localtimeCopy := filepath.Join(c.state.RunDir, "localtime") + file, err := os.Stat(zonePath) + if err != nil { + return "", err + } + if file.IsDir() { + return "", errors.New("invalid timezone: is a directory") + } + src, err := os.Open(zonePath) + if err != nil { + return "", err + } + defer src.Close() + dest, err := os.Create(localtimeCopy) + if err != nil { + return "", err + } + defer dest.Close() + _, err = io.Copy(dest, src) + if err != nil { + return "", err + } + if err := c.relabel(localtimeCopy, c.config.MountLabel, false); err != nil { + return "", err + } + if err := dest.Chown(c.RootUID(), c.RootGID()); err != nil { + return "", err + } + return localtimeCopy, err +} + +func (c *Container) cleanupOverlayMounts() error { + return overlay.CleanupContent(c.config.StaticDir) +} + +// Creates and mounts an empty dir to mount secrets into, if it does not already exist +func (c *Container) createSecretMountDir() error { + src := filepath.Join(c.state.RunDir, "/run/secrets") + _, err := os.Stat(src) + if os.IsNotExist(err) { + oldUmask := umask.Set(0) + defer umask.Set(oldUmask) + + if err := os.MkdirAll(src, 0755); err != nil { + return err + } + if err := label.Relabel(src, c.config.MountLabel, false); err != nil { + return err + } + if err := os.Chown(src, c.RootUID(), c.RootGID()); err != nil { + return err + } + c.state.BindMounts["/run/secrets"] = src + return nil + } + + return err +} + +// Fix ownership and permissions of the specified volume if necessary. +func (c *Container) fixVolumePermissions(v *ContainerNamedVolume) error { + vol, err := c.runtime.state.Volume(v.Name) + if err != nil { + return fmt.Errorf("retrieving named volume %s for container %s: %w", v.Name, c.ID(), err) + } + + vol.lock.Lock() + defer vol.lock.Unlock() + + // The volume may need a copy-up. Check the state. + if err := vol.update(); err != nil { + return err + } + + // Volumes owned by a volume driver are not chowned - we don't want to + // mess with a mount not managed by us. + if vol.state.NeedsChown && !vol.UsesVolumeDriver() { + vol.state.NeedsChown = false + + uid := int(c.config.Spec.Process.User.UID) + gid := int(c.config.Spec.Process.User.GID) + + if c.config.IDMappings.UIDMap != nil { + p := idtools.IDPair{ + UID: uid, + GID: gid, + } + mappings := idtools.NewIDMappingsFromMaps(c.config.IDMappings.UIDMap, c.config.IDMappings.GIDMap) + newPair, err := mappings.ToHost(p) + if err != nil { + return fmt.Errorf("mapping user %d:%d: %w", uid, gid, err) + } + uid = newPair.UID + gid = newPair.GID + } + + vol.state.UIDChowned = uid + vol.state.GIDChowned = gid + + if err := vol.save(); err != nil { + return err + } + + mountPoint, err := vol.MountPoint() + if err != nil { + return err + } + + if err := os.Lchown(mountPoint, uid, gid); err != nil { + return err + } + + // Make sure the new volume matches the permissions of the target directory. + // https://github.com/containers/podman/issues/10188 + st, err := os.Lstat(filepath.Join(c.state.Mountpoint, v.Dest)) + if err == nil { + if stat, ok := st.Sys().(*syscall.Stat_t); ok { + if err := os.Lchown(mountPoint, int(stat.Uid), int(stat.Gid)); err != nil { + return err + } + } + if err := os.Chmod(mountPoint, st.Mode()); err != nil { + return err + } + if err := setVolumeAtime(mountPoint, st); err != nil { + return err + } + } else if !os.IsNotExist(err) { + return err + } + } + return nil +} + +func (c *Container) relabel(src, mountLabel string, recurse bool) error { + if !selinux.GetEnabled() || mountLabel == "" { + return nil + } + // only relabel on initial creation of container + if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateUnknown) { + label, err := label.FileLabel(src) + if err != nil { + return err + } + // If labels are different, might be on a tmpfs + if label == mountLabel { + return nil + } + } + return label.Relabel(src, mountLabel, recurse) +} + +func (c *Container) ChangeHostPathOwnership(src string, recurse bool, uid, gid int) error { + // only chown on initial creation of container + if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateUnknown) { + st, err := os.Stat(src) + if err != nil { + return err + } + + // If labels are different, might be on a tmpfs + if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { + return nil + } + } + return chown.ChangeHostPathOwnership(src, recurse, uid, gid) +} diff --git a/libpod/container_internal_freebsd.go b/libpod/container_internal_freebsd.go new file mode 100644 index 000000000..67f87a98d --- /dev/null +++ b/libpod/container_internal_freebsd.go @@ -0,0 +1,274 @@ +//go:build freebsd +// +build freebsd + +package libpod + +import ( + "fmt" + "os" + "strings" + "sync" + "syscall" + "time" + + "github.com/containers/common/libnetwork/types" + "github.com/containers/podman/v4/pkg/rootless" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/runtime-tools/generate" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +var ( + bindOptions = []string{} +) + +func (c *Container) mountSHM(shmOptions string) error { + return nil +} + +func (c *Container) unmountSHM(path string) error { + return nil +} + +// prepare mounts the container and sets up other required resources like net +// namespaces +func (c *Container) prepare() error { + var ( + wg sync.WaitGroup + ctrNS *jailNetNS + networkStatus map[string]types.StatusBlock + createNetNSErr, mountStorageErr error + mountPoint string + tmpStateLock sync.Mutex + ) + + wg.Add(2) + + go func() { + defer wg.Done() + // Set up network namespace if not already set up + noNetNS := c.state.NetNS == nil + if c.config.CreateNetNS && noNetNS && !c.config.PostConfigureNetNS { + ctrNS, networkStatus, createNetNSErr = c.runtime.createNetNS(c) + if createNetNSErr != nil { + return + } + + tmpStateLock.Lock() + defer tmpStateLock.Unlock() + + // Assign NetNS attributes to container + c.state.NetNS = ctrNS + c.state.NetworkStatus = networkStatus + } + }() + // Mount storage if not mounted + go func() { + defer wg.Done() + mountPoint, mountStorageErr = c.mountStorage() + + if mountStorageErr != nil { + return + } + + tmpStateLock.Lock() + defer tmpStateLock.Unlock() + + // Finish up mountStorage + c.state.Mounted = true + c.state.Mountpoint = mountPoint + + logrus.Debugf("Created root filesystem for container %s at %s", c.ID(), c.state.Mountpoint) + }() + + wg.Wait() + + var createErr error + if mountStorageErr != nil { + if createErr != nil { + logrus.Errorf("Preparing container %s: %v", c.ID(), createErr) + } + createErr = mountStorageErr + } + + if createErr != nil { + return createErr + } + + // Save changes to container state + if err := c.save(); err != nil { + return err + } + + return nil +} + +// cleanupNetwork unmounts and cleans up the container's network +func (c *Container) cleanupNetwork() error { + if c.config.NetNsCtr != "" { + return nil + } + netDisabled, err := c.NetworkDisabled() + if err != nil { + return err + } + if netDisabled { + return nil + } + + // Stop the container's network namespace (if it has one) + if err := c.runtime.teardownNetNS(c); err != nil { + logrus.Errorf("Unable to cleanup network for container %s: %q", c.ID(), err) + } + + if c.valid { + return c.save() + } + + return nil +} + +// reloadNetwork reloads the network for the given container, recreating +// firewall rules. +func (c *Container) reloadNetwork() error { + result, err := c.runtime.reloadContainerNetwork(c) + if err != nil { + return err + } + + c.state.NetworkStatus = result + + return c.save() +} + +// Add an existing container's network jail +func (c *Container) addNetworkContainer(g *generate.Generator, ctr string) error { + nsCtr, err := c.runtime.state.Container(ctr) + c.runtime.state.UpdateContainer(nsCtr) + if err != nil { + return fmt.Errorf("retrieving dependency %s of container %s from state: %w", ctr, c.ID(), err) + } + g.AddAnnotation("org.freebsd.parentJail", nsCtr.state.NetNS.Name) + return nil +} + +func isRootlessCgroupSet(cgroup string) bool { + return false +} + +func (c *Container) expectPodCgroup() (bool, error) { + return false, nil +} + +func (c *Container) getOCICgroupPath() (string, error) { + return "", nil +} + +func openDirectory(path string) (fd int, err error) { + const O_PATH = 0x00400000 + return unix.Open(path, unix.O_RDONLY|O_PATH, 0) +} + +func (c *Container) addNetworkNamespace(g *generate.Generator) error { + if c.config.CreateNetNS { + g.AddAnnotation("org.freebsd.parentJail", c.state.NetNS.Name) + } + return nil +} + +func (c *Container) addSystemdMounts(g *generate.Generator) error { + return nil +} + +func (c *Container) addSharedNamespaces(g *generate.Generator) error { + if c.config.NetNsCtr != "" { + if err := c.addNetworkContainer(g, c.config.NetNsCtr); err != nil { + return err + } + } + + availableUIDs, availableGIDs, err := rootless.GetAvailableIDMaps() + if err != nil { + if os.IsNotExist(err) { + // The kernel-provided files only exist if user namespaces are supported + logrus.Debugf("User or group ID mappings not available: %s", err) + } else { + return err + } + } else { + g.Config.Linux.UIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.UIDMappings, availableUIDs) + g.Config.Linux.GIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.GIDMappings, availableGIDs) + } + + // Hostname handling: + // If we have a UTS namespace, set Hostname in the OCI spec. + // Set the HOSTNAME environment variable unless explicitly overridden by + // the user (already present in OCI spec). If we don't have a UTS ns, + // set it to the host's hostname instead. + hostname := c.Hostname() + foundUTS := false + + // TODO: make this optional, needs progress on adding FreeBSD section to the spec + foundUTS = true + g.SetHostname(hostname) + + if !foundUTS { + tmpHostname, err := os.Hostname() + if err != nil { + return err + } + hostname = tmpHostname + } + needEnv := true + for _, checkEnv := range g.Config.Process.Env { + if strings.SplitN(checkEnv, "=", 2)[0] == "HOSTNAME" { + needEnv = false + break + } + } + if needEnv { + g.AddProcessEnv("HOSTNAME", hostname) + } + return nil +} + +func (c *Container) addRootPropagation(g *generate.Generator, mounts []spec.Mount) error { + return nil +} + +func (c *Container) setProcessLabel(g *generate.Generator) { +} + +func (c *Container) setMountLabel(g *generate.Generator) { +} + +func (c *Container) setCgroupsPath(g *generate.Generator) error { + return nil +} + +func (c *Container) addSlirp4netnsDNS(nameservers []string) []string { + return nameservers +} + +func (c *Container) isSlirp4netnsIPv6() (bool, error) { + return false, nil +} + +// check for net=none +func (c *Container) hasNetNone() bool { + return c.state.NetNS == nil +} + +func setVolumeAtime(mountPoint string, st os.FileInfo) error { + stat := st.Sys().(*syscall.Stat_t) + atime := time.Unix(int64(stat.Atimespec.Sec), int64(stat.Atimespec.Nsec)) //nolint: unconvert + if err := os.Chtimes(mountPoint, atime, st.ModTime()); err != nil { + return err + } + return nil +} + +func (c *Container) makePlatformBindMounts() error { + return nil +} diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index a131ab367..ef8649776 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -4,64 +4,34 @@ package libpod import ( - "context" "errors" "fmt" - "io" - "io/ioutil" - "math" "os" - "os/user" "path" "path/filepath" - "strconv" "strings" "sync" "syscall" "time" - metadata "github.com/checkpoint-restore/checkpointctl/lib" - "github.com/checkpoint-restore/go-criu/v5/stats" - cdi "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/containernetworking/plugins/pkg/ns" - "github.com/containers/buildah" - "github.com/containers/buildah/pkg/chrootuser" - "github.com/containers/buildah/pkg/overlay" - butil "github.com/containers/buildah/util" - "github.com/containers/common/libnetwork/etchosts" - "github.com/containers/common/libnetwork/resolvconf" "github.com/containers/common/libnetwork/types" - "github.com/containers/common/pkg/apparmor" "github.com/containers/common/pkg/cgroups" - "github.com/containers/common/pkg/chown" "github.com/containers/common/pkg/config" - "github.com/containers/common/pkg/subscriptions" - "github.com/containers/common/pkg/umask" - cutil "github.com/containers/common/pkg/util" - is "github.com/containers/image/v5/storage" "github.com/containers/podman/v4/libpod/define" - "github.com/containers/podman/v4/libpod/events" - "github.com/containers/podman/v4/pkg/annotations" - "github.com/containers/podman/v4/pkg/checkpoint/crutils" - "github.com/containers/podman/v4/pkg/criu" - "github.com/containers/podman/v4/pkg/lookup" "github.com/containers/podman/v4/pkg/rootless" - "github.com/containers/podman/v4/pkg/util" "github.com/containers/podman/v4/utils" - "github.com/containers/podman/v4/version" - "github.com/containers/storage/pkg/archive" - "github.com/containers/storage/pkg/idtools" - "github.com/containers/storage/pkg/lockfile" - securejoin "github.com/cyphar/filepath-securejoin" - runcuser "github.com/opencontainers/runc/libcontainer/user" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-tools/generate" - "github.com/opencontainers/selinux/go-selinux" "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) +var ( + bindOptions = []string{"bind", "rprivate"} +) + func (c *Container) mountSHM(shmOptions string) error { if err := unix.Mount("shm", c.config.ShmDir, "tmpfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, label.FormatMountLabel(shmOptions, c.config.MountLabel)); err != nil { @@ -73,7 +43,7 @@ func (c *Container) mountSHM(shmOptions string) error { func (c *Container) unmountSHM(mount string) error { if err := unix.Unmount(mount, 0); err != nil { if err != syscall.EINVAL && err != syscall.ENOENT { - return fmt.Errorf("error unmounting container %s SHM mount %s: %w", c.ID(), mount, err) + return fmt.Errorf("unmounting container %s SHM mount %s: %w", c.ID(), mount, err) } // If it's just an EINVAL or ENOENT, debug logs only logrus.Debugf("Container %s failed to unmount %s : %v", c.ID(), mount, err) @@ -152,7 +122,7 @@ func (c *Container) prepare() error { // createErr is guaranteed non-nil, so print // unconditionally logrus.Errorf("Preparing container %s: %v", c.ID(), createErr) - createErr = fmt.Errorf("error unmounting storage for container %s after network create failure: %w", c.ID(), err) + createErr = fmt.Errorf("unmounting storage for container %s after network create failure: %w", c.ID(), err) } } @@ -161,7 +131,7 @@ func (c *Container) prepare() error { if createErr != nil { if err := c.cleanupNetwork(); err != nil { logrus.Errorf("Preparing container %s: %v", c.ID(), createErr) - createErr = fmt.Errorf("error cleaning up container %s network after setup failure: %w", c.ID(), err) + createErr = fmt.Errorf("cleaning up container %s network after setup failure: %w", c.ID(), err) } } @@ -177,118 +147,6 @@ func (c *Container) prepare() error { return nil } -// isWorkDirSymlink returns true if resolved workdir is symlink or a chain of symlinks, -// and final resolved target is present either on volume, mount or inside of container -// otherwise it returns false. Following function is meant for internal use only and -// can change at any point of time. -func (c *Container) isWorkDirSymlink(resolvedPath string) bool { - // We cannot create workdir since explicit --workdir is - // set in config but workdir could also be a symlink. - // If it's a symlink, check if the resolved target is present in the container. - // If so, that's a valid use case: return nil. - - maxSymLinks := 0 - for { - // Linux only supports a chain of 40 links. - // Reference: https://github.com/torvalds/linux/blob/master/include/linux/namei.h#L13 - if maxSymLinks > 40 { - break - } - resolvedSymlink, err := os.Readlink(resolvedPath) - if err != nil { - // End sym-link resolution loop. - break - } - if resolvedSymlink != "" { - _, resolvedSymlinkWorkdir, err := c.resolvePath(c.state.Mountpoint, resolvedSymlink) - if isPathOnVolume(c, resolvedSymlinkWorkdir) || isPathOnBindMount(c, resolvedSymlinkWorkdir) { - // Resolved symlink exists on external volume or mount - return true - } - if err != nil { - // Could not resolve path so end sym-link resolution loop. - break - } - if resolvedSymlinkWorkdir != "" { - resolvedPath = resolvedSymlinkWorkdir - _, err := os.Stat(resolvedSymlinkWorkdir) - if err == nil { - // Symlink resolved successfully and resolved path exists on container, - // this is a valid use-case so return nil. - logrus.Debugf("Workdir is a symlink with target to %q and resolved symlink exists on container", resolvedSymlink) - return true - } - } - } - maxSymLinks++ - } - return false -} - -// resolveWorkDir resolves the container's workdir and, depending on the -// configuration, will create it, or error out if it does not exist. -// Note that the container must be mounted before. -func (c *Container) resolveWorkDir() error { - workdir := c.WorkingDir() - - // If the specified workdir is a subdir of a volume or mount, - // we don't need to do anything. The runtime is taking care of - // that. - if isPathOnVolume(c, workdir) || isPathOnBindMount(c, workdir) { - logrus.Debugf("Workdir %q resolved to a volume or mount", workdir) - return nil - } - - _, resolvedWorkdir, err := c.resolvePath(c.state.Mountpoint, workdir) - if err != nil { - return err - } - logrus.Debugf("Workdir %q resolved to host path %q", workdir, resolvedWorkdir) - - st, err := os.Stat(resolvedWorkdir) - if err == nil { - if !st.IsDir() { - return fmt.Errorf("workdir %q exists on container %s, but is not a directory", workdir, c.ID()) - } - return nil - } - if !c.config.CreateWorkingDir { - // No need to create it (e.g., `--workdir=/foo`), so let's make sure - // the path exists on the container. - if err != nil { - if os.IsNotExist(err) { - // If resolved Workdir path gets marked as a valid symlink, - // return nil cause this is valid use-case. - if c.isWorkDirSymlink(resolvedWorkdir) { - return nil - } - return fmt.Errorf("workdir %q does not exist on container %s", workdir, c.ID()) - } - // This might be a serious error (e.g., permission), so - // we need to return the full error. - return fmt.Errorf("error detecting workdir %q on container %s: %w", workdir, c.ID(), err) - } - return nil - } - if err := os.MkdirAll(resolvedWorkdir, 0755); err != nil { - if os.IsExist(err) { - return nil - } - return fmt.Errorf("error creating container %s workdir: %w", c.ID(), err) - } - - // Ensure container entrypoint is created (if required). - uid, gid, _, err := chrootuser.GetUser(c.state.Mountpoint, c.User()) - if err != nil { - return fmt.Errorf("error looking up %s inside of the container %s: %w", c.User(), c.ID(), err) - } - if err := os.Chown(resolvedWorkdir, int(uid), int(gid)); err != nil { - return fmt.Errorf("error chowning container %s workdir to container root: %w", c.ID(), err) - } - - return nil -} - // cleanupNetwork unmounts and cleans up the container's network func (c *Container) cleanupNetwork() error { if c.config.NetNsCtr != "" { @@ -335,671 +193,6 @@ func (c *Container) reloadNetwork() error { return c.save() } -func (c *Container) getUserOverrides() *lookup.Overrides { - var hasPasswdFile, hasGroupFile bool - overrides := lookup.Overrides{} - for _, m := range c.config.Spec.Mounts { - if m.Destination == "/etc/passwd" { - overrides.ContainerEtcPasswdPath = m.Source - hasPasswdFile = true - } - if m.Destination == "/etc/group" { - overrides.ContainerEtcGroupPath = m.Source - hasGroupFile = true - } - if m.Destination == "/etc" { - if !hasPasswdFile { - overrides.ContainerEtcPasswdPath = filepath.Join(m.Source, "passwd") - } - if !hasGroupFile { - overrides.ContainerEtcGroupPath = filepath.Join(m.Source, "group") - } - } - } - if path, ok := c.state.BindMounts["/etc/passwd"]; ok { - overrides.ContainerEtcPasswdPath = path - } - return &overrides -} - -func lookupHostUser(name string) (*runcuser.ExecUser, error) { - var execUser runcuser.ExecUser - // Look up User on host - u, err := util.LookupUser(name) - if err != nil { - return &execUser, err - } - uid, err := strconv.ParseUint(u.Uid, 8, 32) - if err != nil { - return &execUser, err - } - - gid, err := strconv.ParseUint(u.Gid, 8, 32) - if err != nil { - return &execUser, err - } - execUser.Uid = int(uid) - execUser.Gid = int(gid) - execUser.Home = u.HomeDir - return &execUser, nil -} - -// Internal only function which returns upper and work dir from -// overlay options. -func getOverlayUpperAndWorkDir(options []string) (string, string, error) { - upperDir := "" - workDir := "" - for _, o := range options { - if strings.HasPrefix(o, "upperdir") { - splitOpt := strings.SplitN(o, "=", 2) - if len(splitOpt) > 1 { - upperDir = splitOpt[1] - if upperDir == "" { - return "", "", errors.New("cannot accept empty value for upperdir") - } - } - } - if strings.HasPrefix(o, "workdir") { - splitOpt := strings.SplitN(o, "=", 2) - if len(splitOpt) > 1 { - workDir = splitOpt[1] - if workDir == "" { - return "", "", errors.New("cannot accept empty value for workdir") - } - } - } - } - if (upperDir != "" && workDir == "") || (upperDir == "" && workDir != "") { - return "", "", errors.New("must specify both upperdir and workdir") - } - return upperDir, workDir, nil -} - -// Generate spec for a container -// Accepts a map of the container's dependencies -func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { - overrides := c.getUserOverrides() - execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, c.config.User, overrides) - if err != nil { - if cutil.StringInSlice(c.config.User, c.config.HostUsers) { - execUser, err = lookupHostUser(c.config.User) - } - if err != nil { - return nil, err - } - } - - // NewFromSpec() is deprecated according to its comment - // however the recommended replace just causes a nil map panic - //nolint:staticcheck - g := generate.NewFromSpec(c.config.Spec) - - // If the flag to mount all devices is set for a privileged container, add - // all the devices from the host's machine into the container - if c.config.MountAllDevices { - if err := util.AddPrivilegedDevices(&g); err != nil { - return nil, err - } - } - - // If network namespace was requested, add it now - if c.config.CreateNetNS { - if c.config.PostConfigureNetNS { - if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), ""); err != nil { - return nil, err - } - } else { - if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), c.state.NetNS.Path()); err != nil { - return nil, err - } - } - } - - // Apply AppArmor checks and load the default profile if needed. - if len(c.config.Spec.Process.ApparmorProfile) > 0 { - updatedProfile, err := apparmor.CheckProfileAndLoadDefault(c.config.Spec.Process.ApparmorProfile) - if err != nil { - return nil, err - } - g.SetProcessApparmorProfile(updatedProfile) - } - - if err := c.makeBindMounts(); err != nil { - return nil, err - } - - if err := c.mountNotifySocket(g); err != nil { - return nil, err - } - - // Get host UID and GID based on the container process UID and GID. - hostUID, hostGID, err := butil.GetHostIDs(util.IDtoolsToRuntimeSpec(c.config.IDMappings.UIDMap), util.IDtoolsToRuntimeSpec(c.config.IDMappings.GIDMap), uint32(execUser.Uid), uint32(execUser.Gid)) - if err != nil { - return nil, err - } - - // Add named volumes - for _, namedVol := range c.config.NamedVolumes { - volume, err := c.runtime.GetVolume(namedVol.Name) - if err != nil { - return nil, fmt.Errorf("error retrieving volume %s to add to container %s: %w", namedVol.Name, c.ID(), err) - } - mountPoint, err := volume.MountPoint() - if err != nil { - return nil, err - } - - overlayFlag := false - upperDir := "" - workDir := "" - for _, o := range namedVol.Options { - if o == "O" { - overlayFlag = true - upperDir, workDir, err = getOverlayUpperAndWorkDir(namedVol.Options) - if err != nil { - return nil, err - } - } - } - - if overlayFlag { - var overlayMount spec.Mount - var overlayOpts *overlay.Options - contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) - if err != nil { - return nil, err - } - - overlayOpts = &overlay.Options{RootUID: c.RootUID(), - RootGID: c.RootGID(), - UpperDirOptionFragment: upperDir, - WorkDirOptionFragment: workDir, - GraphOpts: c.runtime.store.GraphOptions(), - } - - overlayMount, err = overlay.MountWithOptions(contentDir, mountPoint, namedVol.Dest, overlayOpts) - if err != nil { - return nil, fmt.Errorf("mounting overlay failed %q: %w", mountPoint, err) - } - - for _, o := range namedVol.Options { - if o == "U" { - if err := c.ChangeHostPathOwnership(mountPoint, true, int(hostUID), int(hostGID)); err != nil { - return nil, err - } - - if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { - return nil, err - } - } - } - g.AddMount(overlayMount) - } else { - volMount := spec.Mount{ - Type: "bind", - Source: mountPoint, - Destination: namedVol.Dest, - Options: namedVol.Options, - } - g.AddMount(volMount) - } - } - - // Check if the spec file mounts contain the options z, Z or U. - // If they have z or Z, relabel the source directory and then remove the option. - // If they have U, chown the source directory and them remove the option. - for i := range g.Config.Mounts { - m := &g.Config.Mounts[i] - var options []string - for _, o := range m.Options { - switch o { - case "U": - if m.Type == "tmpfs" { - options = append(options, []string{fmt.Sprintf("uid=%d", execUser.Uid), fmt.Sprintf("gid=%d", execUser.Gid)}...) - } else { - // only chown on initial creation of container - if err := c.ChangeHostPathOwnership(m.Source, true, int(hostUID), int(hostGID)); err != nil { - return nil, err - } - } - case "z": - fallthrough - case "Z": - if err := c.relabel(m.Source, c.MountLabel(), label.IsShared(o)); err != nil { - return nil, err - } - - default: - options = append(options, o) - } - } - m.Options = options - } - - g.SetProcessSelinuxLabel(c.ProcessLabel()) - g.SetLinuxMountLabel(c.MountLabel()) - - // Add bind mounts to container - for dstPath, srcPath := range c.state.BindMounts { - newMount := spec.Mount{ - Type: "bind", - Source: srcPath, - Destination: dstPath, - Options: []string{"bind", "rprivate"}, - } - if c.IsReadOnly() && dstPath != "/dev/shm" { - newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") - } - if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { - newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") - } - if !MountExists(g.Mounts(), dstPath) { - g.AddMount(newMount) - } else { - logrus.Infof("User mount overriding libpod mount at %q", dstPath) - } - } - - // Add overlay volumes - for _, overlayVol := range c.config.OverlayVolumes { - upperDir, workDir, err := getOverlayUpperAndWorkDir(overlayVol.Options) - if err != nil { - return nil, err - } - contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) - if err != nil { - return nil, err - } - overlayOpts := &overlay.Options{RootUID: c.RootUID(), - RootGID: c.RootGID(), - UpperDirOptionFragment: upperDir, - WorkDirOptionFragment: workDir, - GraphOpts: c.runtime.store.GraphOptions(), - } - - overlayMount, err := overlay.MountWithOptions(contentDir, overlayVol.Source, overlayVol.Dest, overlayOpts) - if err != nil { - return nil, fmt.Errorf("mounting overlay failed %q: %w", overlayVol.Source, err) - } - - // Check overlay volume options - for _, o := range overlayVol.Options { - if o == "U" { - if err := c.ChangeHostPathOwnership(overlayVol.Source, true, int(hostUID), int(hostGID)); err != nil { - return nil, err - } - - if err := c.ChangeHostPathOwnership(contentDir, true, int(hostUID), int(hostGID)); err != nil { - return nil, err - } - } - } - - g.AddMount(overlayMount) - } - - // Add image volumes as overlay mounts - for _, volume := range c.config.ImageVolumes { - // Mount the specified image. - img, _, err := c.runtime.LibimageRuntime().LookupImage(volume.Source, nil) - if err != nil { - return nil, fmt.Errorf("error creating image volume %q:%q: %w", volume.Source, volume.Dest, err) - } - mountPoint, err := img.Mount(ctx, nil, "") - if err != nil { - return nil, fmt.Errorf("error mounting image volume %q:%q: %w", volume.Source, volume.Dest, err) - } - - contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) - if err != nil { - return nil, fmt.Errorf("failed to create TempDir in the %s directory: %w", c.config.StaticDir, err) - } - - var overlayMount spec.Mount - if volume.ReadWrite { - overlayMount, err = overlay.Mount(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) - } else { - overlayMount, err = overlay.MountReadOnly(contentDir, mountPoint, volume.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) - } - if err != nil { - return nil, fmt.Errorf("creating overlay mount for image %q failed: %w", volume.Source, err) - } - g.AddMount(overlayMount) - } - - hasHomeSet := false - for _, s := range c.config.Spec.Process.Env { - if strings.HasPrefix(s, "HOME=") { - hasHomeSet = true - break - } - } - if !hasHomeSet && execUser.Home != "" { - c.config.Spec.Process.Env = append(c.config.Spec.Process.Env, fmt.Sprintf("HOME=%s", execUser.Home)) - } - - if c.config.User != "" { - // User and Group must go together - g.SetProcessUID(uint32(execUser.Uid)) - g.SetProcessGID(uint32(execUser.Gid)) - } - - if c.config.Umask != "" { - decVal, err := strconv.ParseUint(c.config.Umask, 8, 32) - if err != nil { - return nil, fmt.Errorf("invalid Umask Value: %w", err) - } - umask := uint32(decVal) - g.Config.Process.User.Umask = &umask - } - - // Add addition groups if c.config.GroupAdd is not empty - if len(c.config.Groups) > 0 { - gids, err := lookup.GetContainerGroups(c.config.Groups, c.state.Mountpoint, overrides) - if err != nil { - return nil, fmt.Errorf("error looking up supplemental groups for container %s: %w", c.ID(), err) - } - for _, gid := range gids { - g.AddProcessAdditionalGid(gid) - } - } - - if c.Systemd() { - if err := c.setupSystemd(g.Mounts(), g); err != nil { - return nil, fmt.Errorf("error adding systemd-specific mounts: %w", err) - } - } - - // Look up and add groups the user belongs to, if a group wasn't directly specified - if !strings.Contains(c.config.User, ":") { - // the gidMappings that are present inside the container user namespace - var gidMappings []idtools.IDMap - - switch { - case len(c.config.IDMappings.GIDMap) > 0: - gidMappings = c.config.IDMappings.GIDMap - case rootless.IsRootless(): - // Check whether the current user namespace has enough gids available. - availableGids, err := rootless.GetAvailableGids() - if err != nil { - return nil, fmt.Errorf("cannot read number of available GIDs: %w", err) - } - gidMappings = []idtools.IDMap{{ - ContainerID: 0, - HostID: 0, - Size: int(availableGids), - }} - default: - gidMappings = []idtools.IDMap{{ - ContainerID: 0, - HostID: 0, - Size: math.MaxInt32, - }} - } - for _, gid := range execUser.Sgids { - isGIDAvailable := false - for _, m := range gidMappings { - if gid >= m.ContainerID && gid < m.ContainerID+m.Size { - isGIDAvailable = true - break - } - } - if isGIDAvailable { - g.AddProcessAdditionalGid(uint32(gid)) - } else { - logrus.Warnf("Additional gid=%d is not present in the user namespace, skip setting it", gid) - } - } - } - - // Add shared namespaces from other containers - if c.config.IPCNsCtr != "" { - if err := c.addNamespaceContainer(&g, IPCNS, c.config.IPCNsCtr, spec.IPCNamespace); err != nil { - return nil, err - } - } - if c.config.MountNsCtr != "" { - if err := c.addNamespaceContainer(&g, MountNS, c.config.MountNsCtr, spec.MountNamespace); err != nil { - return nil, err - } - } - if c.config.NetNsCtr != "" { - if err := c.addNamespaceContainer(&g, NetNS, c.config.NetNsCtr, spec.NetworkNamespace); err != nil { - return nil, err - } - } - if c.config.PIDNsCtr != "" { - if err := c.addNamespaceContainer(&g, PIDNS, c.config.PIDNsCtr, spec.PIDNamespace); err != nil { - return nil, err - } - } - if c.config.UserNsCtr != "" { - if err := c.addNamespaceContainer(&g, UserNS, c.config.UserNsCtr, spec.UserNamespace); err != nil { - return nil, err - } - if len(g.Config.Linux.UIDMappings) == 0 { - // runc complains if no mapping is specified, even if we join another ns. So provide a dummy mapping - g.AddLinuxUIDMapping(uint32(0), uint32(0), uint32(1)) - g.AddLinuxGIDMapping(uint32(0), uint32(0), uint32(1)) - } - } - - availableUIDs, availableGIDs, err := rootless.GetAvailableIDMaps() - if err != nil { - if os.IsNotExist(err) { - // The kernel-provided files only exist if user namespaces are supported - logrus.Debugf("User or group ID mappings not available: %s", err) - } else { - return nil, err - } - } else { - g.Config.Linux.UIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.UIDMappings, availableUIDs) - g.Config.Linux.GIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.GIDMappings, availableGIDs) - } - - // Hostname handling: - // If we have a UTS namespace, set Hostname in the OCI spec. - // Set the HOSTNAME environment variable unless explicitly overridden by - // the user (already present in OCI spec). If we don't have a UTS ns, - // set it to the host's hostname instead. - hostname := c.Hostname() - foundUTS := false - - for _, i := range c.config.Spec.Linux.Namespaces { - if i.Type == spec.UTSNamespace && i.Path == "" { - foundUTS = true - g.SetHostname(hostname) - break - } - } - if !foundUTS { - tmpHostname, err := os.Hostname() - if err != nil { - return nil, err - } - hostname = tmpHostname - } - needEnv := true - for _, checkEnv := range g.Config.Process.Env { - if strings.SplitN(checkEnv, "=", 2)[0] == "HOSTNAME" { - needEnv = false - break - } - } - if needEnv { - g.AddProcessEnv("HOSTNAME", hostname) - } - - if c.config.UTSNsCtr != "" { - if err := c.addNamespaceContainer(&g, UTSNS, c.config.UTSNsCtr, spec.UTSNamespace); err != nil { - return nil, err - } - } - if c.config.CgroupNsCtr != "" { - if err := c.addNamespaceContainer(&g, CgroupNS, c.config.CgroupNsCtr, spec.CgroupNamespace); err != nil { - return nil, err - } - } - - if c.config.UserNsCtr == "" && c.config.IDMappings.AutoUserNs { - if err := g.AddOrReplaceLinuxNamespace(string(spec.UserNamespace), ""); err != nil { - return nil, err - } - g.ClearLinuxUIDMappings() - for _, uidmap := range c.config.IDMappings.UIDMap { - g.AddLinuxUIDMapping(uint32(uidmap.HostID), uint32(uidmap.ContainerID), uint32(uidmap.Size)) - } - g.ClearLinuxGIDMappings() - for _, gidmap := range c.config.IDMappings.GIDMap { - g.AddLinuxGIDMapping(uint32(gidmap.HostID), uint32(gidmap.ContainerID), uint32(gidmap.Size)) - } - } - - g.SetRootPath(c.state.Mountpoint) - g.AddAnnotation(annotations.Created, c.config.CreatedTime.Format(time.RFC3339Nano)) - g.AddAnnotation("org.opencontainers.image.stopSignal", fmt.Sprintf("%d", c.config.StopSignal)) - - if _, exists := g.Config.Annotations[annotations.ContainerManager]; !exists { - g.AddAnnotation(annotations.ContainerManager, annotations.ContainerManagerLibpod) - } - - cgroupPath, err := c.getOCICgroupPath() - if err != nil { - return nil, err - } - - g.SetLinuxCgroupsPath(cgroupPath) - - // Warning: CDI may alter g.Config in place. - if len(c.config.CDIDevices) > 0 { - registry := cdi.GetRegistry() - if errs := registry.GetErrors(); len(errs) > 0 { - logrus.Debugf("The following errors were triggered when creating the CDI registry: %v", errs) - } - _, err := registry.InjectDevices(g.Config, c.config.CDIDevices...) - if err != nil { - return nil, fmt.Errorf("error setting up CDI devices: %w", err) - } - } - - // Mounts need to be sorted so paths will not cover other paths - mounts := sortMounts(g.Mounts()) - g.ClearMounts() - - // Determine property of RootPropagation based on volume properties. If - // a volume is shared, then keep root propagation shared. This should - // work for slave and private volumes too. - // - // For slave volumes, it can be either [r]shared/[r]slave. - // - // For private volumes any root propagation value should work. - rootPropagation := "" - for _, m := range mounts { - // We need to remove all symlinks from tmpfs mounts. - // Runc and other runtimes may choke on them. - // Easy solution: use securejoin to do a scoped evaluation of - // the links, then trim off the mount prefix. - if m.Type == "tmpfs" { - finalPath, err := securejoin.SecureJoin(c.state.Mountpoint, m.Destination) - if err != nil { - return nil, fmt.Errorf("error resolving symlinks for mount destination %s: %w", m.Destination, err) - } - trimmedPath := strings.TrimPrefix(finalPath, strings.TrimSuffix(c.state.Mountpoint, "/")) - m.Destination = trimmedPath - } - g.AddMount(m) - for _, opt := range m.Options { - switch opt { - case MountShared, MountRShared: - if rootPropagation != MountShared && rootPropagation != MountRShared { - rootPropagation = MountShared - } - case MountSlave, MountRSlave: - if rootPropagation != MountShared && rootPropagation != MountRShared && rootPropagation != MountSlave && rootPropagation != MountRSlave { - rootPropagation = MountRSlave - } - } - } - } - - if rootPropagation != "" { - logrus.Debugf("Set root propagation to %q", rootPropagation) - if err := g.SetLinuxRootPropagation(rootPropagation); err != nil { - return nil, err - } - } - - // Warning: precreate hooks may alter g.Config in place. - if c.state.ExtensionStageHooks, err = c.setupOCIHooks(ctx, g.Config); err != nil { - return nil, fmt.Errorf("error setting up OCI Hooks: %w", err) - } - if len(c.config.EnvSecrets) > 0 { - manager, err := c.runtime.SecretsManager() - if err != nil { - return nil, err - } - if err != nil { - return nil, err - } - for name, secr := range c.config.EnvSecrets { - _, data, err := manager.LookupSecretData(secr.Name) - if err != nil { - return nil, err - } - g.AddProcessEnv(name, string(data)) - } - } - - // Pass down the LISTEN_* environment (see #10443). - for _, key := range []string{"LISTEN_PID", "LISTEN_FDS", "LISTEN_FDNAMES"} { - if val, ok := os.LookupEnv(key); ok { - // Force the PID to `1` since we cannot rely on (all - // versions of) all runtimes to do it for us. - if key == "LISTEN_PID" { - val = "1" - } - g.AddProcessEnv(key, val) - } - } - - return g.Config, nil -} - -// mountNotifySocket mounts the NOTIFY_SOCKET into the container if it's set -// and if the sdnotify mode is set to container. It also sets c.notifySocket -// to avoid redundantly looking up the env variable. -func (c *Container) mountNotifySocket(g generate.Generator) error { - notify, ok := os.LookupEnv("NOTIFY_SOCKET") - if !ok { - return nil - } - c.notifySocket = notify - - if c.config.SdNotifyMode != define.SdNotifyModeContainer { - return nil - } - - notifyDir := filepath.Join(c.bundlePath(), "notify") - logrus.Debugf("Checking notify %q dir", notifyDir) - if err := os.MkdirAll(notifyDir, 0755); err != nil { - if !os.IsExist(err) { - return fmt.Errorf("unable to create notify %q dir: %w", notifyDir, err) - } - } - if err := label.Relabel(notifyDir, c.MountLabel(), true); err != nil { - return fmt.Errorf("relabel failed %q: %w", notifyDir, err) - } - logrus.Debugf("Add bindmount notify %q dir", notifyDir) - if _, ok := c.state.BindMounts["/run/notify"]; !ok { - c.state.BindMounts["/run/notify"] = notifyDir - } - - // Set the container's notify socket to the proxy socket created by conmon - g.AddProcessEnv("NOTIFY_SOCKET", "/run/notify/notify.sock") - - return nil -} - // systemd expects to have /run, /run/lock and /tmp on tmpfs // It also expects to be able to write to /sys/fs/cgroup/systemd and /var/log/journal func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) error { @@ -1074,9 +267,15 @@ func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) erro g.AddMount(systemdMnt) } else { mountOptions := []string{"bind", "rprivate"} + skipMount := false var statfs unix.Statfs_t if err := unix.Statfs("/sys/fs/cgroup/systemd", &statfs); err != nil { + if errors.Is(err, os.ErrNotExist) { + // If the mount is missing on the host, we cannot bind mount it so + // just skip it. + skipMount = true + } mountOptions = append(mountOptions, "nodev", "noexec", "nosuid") } else { if statfs.Flags&unix.MS_NODEV == unix.MS_NODEV { @@ -1092,15 +291,16 @@ func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) erro mountOptions = append(mountOptions, "ro") } } - - systemdMnt := spec.Mount{ - Destination: "/sys/fs/cgroup/systemd", - Type: "bind", - Source: "/sys/fs/cgroup/systemd", - Options: mountOptions, + if !skipMount { + systemdMnt := spec.Mount{ + Destination: "/sys/fs/cgroup/systemd", + Type: "bind", + Source: "/sys/fs/cgroup/systemd", + Options: mountOptions, + } + g.AddMount(systemdMnt) + g.AddLinuxMaskedPaths("/sys/fs/cgroup/systemd/release_agent") } - g.AddMount(systemdMnt) - g.AddLinuxMaskedPaths("/sys/fs/cgroup/systemd/release_agent") } return nil @@ -1110,7 +310,7 @@ func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) erro func (c *Container) addNamespaceContainer(g *generate.Generator, ns LinuxNS, ctr string, specNS spec.LinuxNamespaceType) error { nsCtr, err := c.runtime.state.Container(ctr) if err != nil { - return fmt.Errorf("error retrieving dependency %s of container %s from state: %w", ctr, c.ID(), err) + return fmt.Errorf("retrieving dependency %s of container %s from state: %w", ctr, c.ID(), err) } if specNS == spec.UTSNamespace { @@ -1132,1287 +332,289 @@ func (c *Container) addNamespaceContainer(g *generate.Generator, ns LinuxNS, ctr return nil } -func (c *Container) addCheckpointImageMetadata(importBuilder *buildah.Builder) error { - // Get information about host environment - hostInfo, err := c.Runtime().hostInfo() - if err != nil { - return fmt.Errorf("getting host info: %v", err) - } - - criuVersion, err := criu.GetCriuVersion() - if err != nil { - return fmt.Errorf("getting criu version: %v", err) - } - - rootfsImageID, rootfsImageName := c.Image() - - // Add image annotations with information about the container and the host. - // This information is useful to check compatibility before restoring the checkpoint - - checkpointImageAnnotations := map[string]string{ - define.CheckpointAnnotationName: c.config.Name, - define.CheckpointAnnotationRawImageName: c.config.RawImageName, - define.CheckpointAnnotationRootfsImageID: rootfsImageID, - define.CheckpointAnnotationRootfsImageName: rootfsImageName, - define.CheckpointAnnotationPodmanVersion: version.Version.String(), - define.CheckpointAnnotationCriuVersion: strconv.Itoa(criuVersion), - define.CheckpointAnnotationRuntimeName: hostInfo.OCIRuntime.Name, - define.CheckpointAnnotationRuntimeVersion: hostInfo.OCIRuntime.Version, - define.CheckpointAnnotationConmonVersion: hostInfo.Conmon.Version, - define.CheckpointAnnotationHostArch: hostInfo.Arch, - define.CheckpointAnnotationHostKernel: hostInfo.Kernel, - define.CheckpointAnnotationCgroupVersion: hostInfo.CgroupsVersion, - define.CheckpointAnnotationDistributionVersion: hostInfo.Distribution.Version, - define.CheckpointAnnotationDistributionName: hostInfo.Distribution.Distribution, - } - - for key, value := range checkpointImageAnnotations { - importBuilder.SetAnnotation(key, value) - } - - return nil -} - -func (c *Container) resolveCheckpointImageName(options *ContainerCheckpointOptions) error { - if options.CreateImage == "" { - return nil - } - - // Resolve image name - resolvedImageName, err := c.runtime.LibimageRuntime().ResolveName(options.CreateImage) - if err != nil { - return err - } - - options.CreateImage = resolvedImageName - return nil +func isRootlessCgroupSet(cgroup string) bool { + // old versions of podman were setting the CgroupParent to CgroupfsDefaultCgroupParent + // by default. Avoid breaking these versions and check whether the cgroup parent is + // set to the default and in this case enable the old behavior. It should not be a real + // problem because the default CgroupParent is usually owned by root so rootless users + // cannot access it. + // This check might be lifted in a future version of Podman. + // Check both that the cgroup or its parent is set to the default value (used by pods). + return cgroup != CgroupfsDefaultCgroupParent && filepath.Dir(cgroup) != CgroupfsDefaultCgroupParent } -func (c *Container) createCheckpointImage(ctx context.Context, options ContainerCheckpointOptions) error { - if options.CreateImage == "" { - return nil - } - logrus.Debugf("Create checkpoint image %s", options.CreateImage) - - // Create storage reference - imageRef, err := is.Transport.ParseStoreReference(c.runtime.store, options.CreateImage) - if err != nil { - return errors.New("failed to parse image name") - } - - // Build an image scratch - builderOptions := buildah.BuilderOptions{ - FromImage: "scratch", - } - importBuilder, err := buildah.NewBuilder(ctx, c.runtime.store, builderOptions) - if err != nil { - return err - } - // Clean up buildah working container - defer func() { - if err := importBuilder.Delete(); err != nil { - logrus.Errorf("Image builder delete failed: %v", err) - } - }() - - if err := c.prepareCheckpointExport(); err != nil { - return err - } - - // Export checkpoint into temporary tar file - tmpDir, err := ioutil.TempDir("", "checkpoint_image_") +func (c *Container) expectPodCgroup() (bool, error) { + unified, err := cgroups.IsCgroup2UnifiedMode() if err != nil { - return err - } - defer os.RemoveAll(tmpDir) - - options.TargetFile = path.Join(tmpDir, "checkpoint.tar") - - if err := c.exportCheckpoint(options); err != nil { - return err - } - - // Copy checkpoint from temporary tar file in the image - addAndCopyOptions := buildah.AddAndCopyOptions{} - if err := importBuilder.Add("", true, addAndCopyOptions, options.TargetFile); err != nil { - return err - } - - if err := c.addCheckpointImageMetadata(importBuilder); err != nil { - return err + return false, err } - - commitOptions := buildah.CommitOptions{ - Squash: true, - SystemContext: c.runtime.imageContext, + cgroupManager := c.CgroupManager() + switch { + case c.config.NoCgroups: + return false, nil + case cgroupManager == config.SystemdCgroupsManager: + return !rootless.IsRootless() || unified, nil + case cgroupManager == config.CgroupfsCgroupsManager: + return !rootless.IsRootless(), nil + default: + return false, fmt.Errorf("invalid cgroup mode %s requested for pods: %w", cgroupManager, define.ErrInvalidArg) } +} - // Create checkpoint image - id, _, _, err := importBuilder.Commit(ctx, imageRef, commitOptions) +// Get cgroup path in a format suitable for the OCI spec +func (c *Container) getOCICgroupPath() (string, error) { + unified, err := cgroups.IsCgroup2UnifiedMode() if err != nil { - return err + return "", err } - logrus.Debugf("Created checkpoint image: %s", id) - return nil -} - -func (c *Container) exportCheckpoint(options ContainerCheckpointOptions) error { - if len(c.Dependencies()) == 1 { - // Check if the dependency is an infra container. If it is we can checkpoint - // the container out of the Pod. - if c.config.Pod == "" { - return errors.New("cannot export checkpoints of containers with dependencies") - } - - pod, err := c.runtime.state.Pod(c.config.Pod) - if err != nil { - return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), c.config.Pod, err) - } - infraID, err := pod.InfraContainerID() + cgroupManager := c.CgroupManager() + switch { + case c.config.NoCgroups: + return "", nil + case c.config.CgroupsMode == cgroupSplit: + selfCgroup, err := utils.GetOwnCgroupDisallowRoot() if err != nil { - return fmt.Errorf("cannot retrieve infra container ID for pod %s: %w", c.config.Pod, err) + return "", err } - if c.Dependencies()[0] != infraID { - return errors.New("cannot export checkpoints of containers with dependencies") + return filepath.Join(selfCgroup, fmt.Sprintf("libpod-payload-%s", c.ID())), nil + case cgroupManager == config.SystemdCgroupsManager: + // When the OCI runtime is set to use Systemd as a cgroup manager, it + // expects cgroups to be passed as follows: + // slice:prefix:name + systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID()) + logrus.Debugf("Setting Cgroups for container %s to %s", c.ID(), systemdCgroups) + return systemdCgroups, nil + case (rootless.IsRootless() && (cgroupManager == config.CgroupfsCgroupsManager || !unified)): + if c.config.CgroupParent == "" || !isRootlessCgroupSet(c.config.CgroupParent) { + return "", nil } + fallthrough + case cgroupManager == config.CgroupfsCgroupsManager: + cgroupPath := filepath.Join(c.config.CgroupParent, fmt.Sprintf("libpod-%s", c.ID())) + logrus.Debugf("Setting Cgroup path for container %s to %s", c.ID(), cgroupPath) + return cgroupPath, nil + default: + return "", fmt.Errorf("invalid cgroup manager %s requested: %w", cgroupManager, define.ErrInvalidArg) } - if len(c.Dependencies()) > 1 { - return errors.New("cannot export checkpoints of containers with dependencies") - } - logrus.Debugf("Exporting checkpoint image of container %q to %q", c.ID(), options.TargetFile) - - includeFiles := []string{ - "artifacts", - metadata.DevShmCheckpointTar, - metadata.ConfigDumpFile, - metadata.SpecDumpFile, - metadata.NetworkStatusFile, - stats.StatsDump, - } +} - if c.LogDriver() == define.KubernetesLogging || - c.LogDriver() == define.JSONLogging { - includeFiles = append(includeFiles, "ctr.log") - } - if options.PreCheckPoint { - includeFiles = append(includeFiles, preCheckpointDir) - } else { - includeFiles = append(includeFiles, metadata.CheckpointDirectory) - } - // Get root file-system changes included in the checkpoint archive - var addToTarFiles []string - if !options.IgnoreRootfs { - // To correctly track deleted files, let's go through the output of 'podman diff' - rootFsChanges, err := c.runtime.GetDiff("", c.ID(), define.DiffContainer) +// If the container is rootless, set up the slirp4netns network +func (c *Container) setupRootlessNetwork() error { + // set up slirp4netns again because slirp4netns will die when conmon exits + if c.config.NetMode.IsSlirp4netns() { + err := c.runtime.setupSlirp4netns(c, c.state.NetNS) if err != nil { - return fmt.Errorf("error exporting root file-system diff for %q: %w", c.ID(), err) + return err } + } - addToTarFiles, err := crutils.CRCreateRootFsDiffTar(&rootFsChanges, c.state.Mountpoint, c.bundlePath()) + // set up rootlesskit port forwarder again since it dies when conmon exits + // we use rootlesskit port forwarder only as rootless and when bridge network is used + if rootless.IsRootless() && c.config.NetMode.IsBridge() && len(c.config.PortMappings) > 0 { + err := c.runtime.setupRootlessPortMappingViaRLK(c, c.state.NetNS.Path(), c.state.NetworkStatus) if err != nil { return err } - - includeFiles = append(includeFiles, addToTarFiles...) } + return nil +} - // Folder containing archived volumes that will be included in the export - expVolDir := filepath.Join(c.bundlePath(), metadata.CheckpointVolumesDirectory) - - // Create an archive for each volume associated with the container - if !options.IgnoreVolumes { - if err := os.MkdirAll(expVolDir, 0700); err != nil { - return fmt.Errorf("error creating volumes export directory %q: %w", expVolDir, err) - } - - for _, v := range c.config.NamedVolumes { - volumeTarFilePath := filepath.Join(metadata.CheckpointVolumesDirectory, v.Name+".tar") - volumeTarFileFullPath := filepath.Join(c.bundlePath(), volumeTarFilePath) - - volumeTarFile, err := os.Create(volumeTarFileFullPath) - if err != nil { - return fmt.Errorf("error creating %q: %w", volumeTarFileFullPath, err) - } - - volume, err := c.runtime.GetVolume(v.Name) - if err != nil { - return err - } +func openDirectory(path string) (fd int, err error) { + return unix.Open(path, unix.O_RDONLY|unix.O_PATH, 0) +} - mp, err := volume.MountPoint() - if err != nil { +func (c *Container) addNetworkNamespace(g *generate.Generator) error { + if c.config.CreateNetNS { + if c.config.PostConfigureNetNS { + if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), ""); err != nil { return err } - if mp == "" { - return fmt.Errorf("volume %s is not mounted, cannot export: %w", volume.Name(), define.ErrInternal) - } - - input, err := archive.TarWithOptions(mp, &archive.TarOptions{ - Compression: archive.Uncompressed, - IncludeSourceDir: true, - }) - if err != nil { - return fmt.Errorf("error reading volume directory %q: %w", v.Dest, err) - } - - _, err = io.Copy(volumeTarFile, input) - if err != nil { + } else { + if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), c.state.NetNS.Path()); err != nil { return err } - volumeTarFile.Close() - - includeFiles = append(includeFiles, volumeTarFilePath) } } - - input, err := archive.TarWithOptions(c.bundlePath(), &archive.TarOptions{ - Compression: options.Compression, - IncludeSourceDir: true, - IncludeFiles: includeFiles, - }) - - if err != nil { - return fmt.Errorf("error reading checkpoint directory %q: %w", c.ID(), err) - } - - outFile, err := os.Create(options.TargetFile) - if err != nil { - return fmt.Errorf("error creating checkpoint export file %q: %w", options.TargetFile, err) - } - defer outFile.Close() - - if err := os.Chmod(options.TargetFile, 0600); err != nil { - return err - } - - _, err = io.Copy(outFile, input) - if err != nil { - return err - } - - for _, file := range addToTarFiles { - os.Remove(filepath.Join(c.bundlePath(), file)) - } - - if !options.IgnoreVolumes { - os.RemoveAll(expVolDir) - } - - return nil -} - -func (c *Container) checkpointRestoreSupported(version int) error { - if !criu.CheckForCriu(version) { - return fmt.Errorf("checkpoint/restore requires at least CRIU %d", version) - } - if !c.ociRuntime.SupportsCheckpoint() { - return errors.New("configured runtime does not support checkpoint/restore") - } return nil } -func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointOptions) (*define.CRIUCheckpointRestoreStatistics, int64, error) { - if err := c.checkpointRestoreSupported(criu.MinCriuVersion); err != nil { - return nil, 0, err - } - - if c.state.State != define.ContainerStateRunning { - return nil, 0, fmt.Errorf("%q is not running, cannot checkpoint: %w", c.state.State, define.ErrCtrStateInvalid) - } - - if c.AutoRemove() && options.TargetFile == "" { - return nil, 0, errors.New("cannot checkpoint containers that have been started with '--rm' unless '--export' is used") - } - - if err := c.resolveCheckpointImageName(&options); err != nil { - return nil, 0, err - } - - if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "dump.log", c.MountLabel()); err != nil { - return nil, 0, err - } - - // Setting CheckpointLog early in case there is a failure. - c.state.CheckpointLog = path.Join(c.bundlePath(), "dump.log") - c.state.CheckpointPath = c.CheckpointPath() - - runtimeCheckpointDuration, err := c.ociRuntime.CheckpointContainer(c, options) - if err != nil { - return nil, 0, err - } - - // Keep the content of /dev/shm directory - if c.config.ShmDir != "" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { - shmDirTarFileFullPath := filepath.Join(c.bundlePath(), metadata.DevShmCheckpointTar) - - shmDirTarFile, err := os.Create(shmDirTarFileFullPath) - if err != nil { - return nil, 0, err - } - defer shmDirTarFile.Close() - - input, err := archive.TarWithOptions(c.config.ShmDir, &archive.TarOptions{ - Compression: archive.Uncompressed, - IncludeSourceDir: true, - }) - if err != nil { - return nil, 0, err - } - - if _, err = io.Copy(shmDirTarFile, input); err != nil { - return nil, 0, err - } - } - - // Save network.status. This is needed to restore the container with - // the same IP. Currently limited to one IP address in a container - // with one interface. - // FIXME: will this break something? - if _, err := metadata.WriteJSONFile(c.getNetworkStatus(), c.bundlePath(), metadata.NetworkStatusFile); err != nil { - return nil, 0, err - } - - defer c.newContainerEvent(events.Checkpoint) - - // There is a bug from criu: https://github.com/checkpoint-restore/criu/issues/116 - // We have to change the symbolic link from absolute path to relative path - if options.WithPrevious { - os.Remove(path.Join(c.CheckpointPath(), "parent")) - if err := os.Symlink("../pre-checkpoint", path.Join(c.CheckpointPath(), "parent")); err != nil { - return nil, 0, err - } - } - - if options.TargetFile != "" { - if err := c.exportCheckpoint(options); err != nil { - return nil, 0, err - } - } else { - if err := c.createCheckpointImage(ctx, options); err != nil { - return nil, 0, err - } - } - - logrus.Debugf("Checkpointed container %s", c.ID()) - - if !options.KeepRunning && !options.PreCheckPoint { - c.state.State = define.ContainerStateStopped - c.state.Checkpointed = true - c.state.CheckpointedTime = time.Now() - c.state.Restored = false - c.state.RestoredTime = time.Time{} - - // Clean up Storage and Network - if err := c.cleanup(ctx); err != nil { - return nil, 0, err - } - } - - criuStatistics, err := func() (*define.CRIUCheckpointRestoreStatistics, error) { - if !options.PrintStats { - return nil, nil - } - statsDirectory, err := os.Open(c.bundlePath()) - if err != nil { - return nil, fmt.Errorf("not able to open %q: %w", c.bundlePath(), err) - } - - dumpStatistics, err := stats.CriuGetDumpStats(statsDirectory) - if err != nil { - return nil, fmt.Errorf("displaying checkpointing statistics not possible: %w", err) - } - - return &define.CRIUCheckpointRestoreStatistics{ - FreezingTime: dumpStatistics.GetFreezingTime(), - FrozenTime: dumpStatistics.GetFrozenTime(), - MemdumpTime: dumpStatistics.GetMemdumpTime(), - MemwriteTime: dumpStatistics.GetMemwriteTime(), - PagesScanned: dumpStatistics.GetPagesScanned(), - PagesWritten: dumpStatistics.GetPagesWritten(), - }, nil - }() - if err != nil { - return nil, 0, err - } - - if !options.Keep && !options.PreCheckPoint { - cleanup := []string{ - "dump.log", - stats.StatsDump, - metadata.ConfigDumpFile, - metadata.SpecDumpFile, - } - for _, del := range cleanup { - file := filepath.Join(c.bundlePath(), del) - if err := os.Remove(file); err != nil { - logrus.Debugf("Unable to remove file %s", file) - } +func (c *Container) addSystemdMounts(g *generate.Generator) error { + if c.Systemd() { + if err := c.setupSystemd(g.Mounts(), *g); err != nil { + return fmt.Errorf("adding systemd-specific mounts: %w", err) } - // The file has been deleted. Do not mention it. - c.state.CheckpointLog = "" - } - - c.state.FinishedTime = time.Now() - return criuStatistics, runtimeCheckpointDuration, c.save() -} - -func (c *Container) generateContainerSpec() error { - // Make sure the newly created config.json exists on disk - - // NewFromSpec() is deprecated according to its comment - // however the recommended replace just causes a nil map panic - //nolint:staticcheck - g := generate.NewFromSpec(c.config.Spec) - - if err := c.saveSpec(g.Config); err != nil { - return fmt.Errorf("saving imported container specification for restore failed: %w", err) } - return nil } -func (c *Container) importCheckpointImage(ctx context.Context, imageID string) error { - img, _, err := c.Runtime().LibimageRuntime().LookupImage(imageID, nil) - if err != nil { - return err - } - - mountPoint, err := img.Mount(ctx, nil, "") - defer func() { - if err := c.unmount(true); err != nil { - logrus.Errorf("Failed to unmount container: %v", err) +func (c *Container) addSharedNamespaces(g *generate.Generator) error { + if c.config.IPCNsCtr != "" { + if err := c.addNamespaceContainer(g, IPCNS, c.config.IPCNsCtr, spec.IPCNamespace); err != nil { + return err } - }() - if err != nil { - return err - } - - // Import all checkpoint files except ConfigDumpFile and SpecDumpFile. We - // generate new container config files to enable to specifying a new - // container name. - checkpoint := []string{ - "artifacts", - metadata.CheckpointDirectory, - metadata.CheckpointVolumesDirectory, - metadata.DevShmCheckpointTar, - metadata.RootFsDiffTar, - metadata.DeletedFilesFile, - metadata.PodOptionsFile, - metadata.PodDumpFile, } - - for _, name := range checkpoint { - src := filepath.Join(mountPoint, name) - dst := filepath.Join(c.bundlePath(), name) - if err := archive.NewDefaultArchiver().CopyWithTar(src, dst); err != nil { - logrus.Debugf("Can't import '%s' from checkpoint image", name) + if c.config.MountNsCtr != "" { + if err := c.addNamespaceContainer(g, MountNS, c.config.MountNsCtr, spec.MountNamespace); err != nil { + return err } } - - return c.generateContainerSpec() -} - -func (c *Container) importCheckpointTar(input string) error { - if err := crutils.CRImportCheckpointWithoutConfig(c.bundlePath(), input); err != nil { - return err - } - - return c.generateContainerSpec() -} - -func (c *Container) importPreCheckpoint(input string) error { - archiveFile, err := os.Open(input) - if err != nil { - return fmt.Errorf("failed to open pre-checkpoint archive for import: %w", err) - } - - defer archiveFile.Close() - - err = archive.Untar(archiveFile, c.bundlePath(), nil) - if err != nil { - return fmt.Errorf("unpacking of pre-checkpoint archive %s failed: %w", input, err) - } - return nil -} - -func (c *Container) restore(ctx context.Context, options ContainerCheckpointOptions) (criuStatistics *define.CRIUCheckpointRestoreStatistics, runtimeRestoreDuration int64, retErr error) { - minCriuVersion := func() int { - if options.Pod == "" { - return criu.MinCriuVersion + if c.config.NetNsCtr != "" { + if err := c.addNamespaceContainer(g, NetNS, c.config.NetNsCtr, spec.NetworkNamespace); err != nil { + return err } - return criu.PodCriuVersion - }() - if err := c.checkpointRestoreSupported(minCriuVersion); err != nil { - return nil, 0, err - } - - if options.Pod != "" && !crutils.CRRuntimeSupportsPodCheckpointRestore(c.ociRuntime.Path()) { - return nil, 0, fmt.Errorf("runtime %s does not support pod restore", c.ociRuntime.Path()) - } - - if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateExited) { - return nil, 0, fmt.Errorf("container %s is running or paused, cannot restore: %w", c.ID(), define.ErrCtrStateInvalid) } - - if options.ImportPrevious != "" { - if err := c.importPreCheckpoint(options.ImportPrevious); err != nil { - return nil, 0, err + if c.config.PIDNsCtr != "" { + if err := c.addNamespaceContainer(g, PIDNS, c.config.PIDNsCtr, spec.PIDNamespace); err != nil { + return err } } - - if options.TargetFile != "" { - if err := c.importCheckpointTar(options.TargetFile); err != nil { - return nil, 0, err + if c.config.UserNsCtr != "" { + if err := c.addNamespaceContainer(g, UserNS, c.config.UserNsCtr, spec.UserNamespace); err != nil { + return err } - } else if options.CheckpointImageID != "" { - if err := c.importCheckpointImage(ctx, options.CheckpointImageID); err != nil { - return nil, 0, err + if len(g.Config.Linux.UIDMappings) == 0 { + // runc complains if no mapping is specified, even if we join another ns. So provide a dummy mapping + g.AddLinuxUIDMapping(uint32(0), uint32(0), uint32(1)) + g.AddLinuxGIDMapping(uint32(0), uint32(0), uint32(1)) } } - // Let's try to stat() CRIU's inventory file. If it does not exist, it makes - // no sense to try a restore. This is a minimal check if a checkpoint exist. - if _, err := os.Stat(filepath.Join(c.CheckpointPath(), "inventory.img")); os.IsNotExist(err) { - return nil, 0, fmt.Errorf("a complete checkpoint for this container cannot be found, cannot restore: %w", err) - } - - if err := crutils.CRCreateFileWithLabel(c.bundlePath(), "restore.log", c.MountLabel()); err != nil { - return nil, 0, err - } - - // Setting RestoreLog early in case there is a failure. - c.state.RestoreLog = path.Join(c.bundlePath(), "restore.log") - c.state.CheckpointPath = c.CheckpointPath() - - // Read network configuration from checkpoint - var netStatus map[string]types.StatusBlock - _, err := metadata.ReadJSONFile(&netStatus, c.bundlePath(), metadata.NetworkStatusFile) + availableUIDs, availableGIDs, err := rootless.GetAvailableIDMaps() if err != nil { - logrus.Infof("Failed to unmarshal network status, cannot restore the same ip/mac: %v", err) - } - // If the restored container should get a new name, the IP address of - // the container will not be restored. This assumes that if a new name is - // specified, the container is restored multiple times. - // TODO: This implicit restoring with or without IP depending on an - // unrelated restore parameter (--name) does not seem like the - // best solution. - if err == nil && options.Name == "" && (!options.IgnoreStaticIP || !options.IgnoreStaticMAC) { - // The file with the network.status does exist. Let's restore the - // container with the same networks settings as during checkpointing. - networkOpts, err := c.networks() - if err != nil { - return nil, 0, err - } - - netOpts := make(map[string]types.PerNetworkOptions, len(netStatus)) - for network, perNetOpts := range networkOpts { - // unset mac and ips before we start adding the ones from the status - perNetOpts.StaticMAC = nil - perNetOpts.StaticIPs = nil - for name, netInt := range netStatus[network].Interfaces { - perNetOpts.InterfaceName = name - if !options.IgnoreStaticIP { - perNetOpts.StaticMAC = netInt.MacAddress - } - if !options.IgnoreStaticIP { - for _, netAddress := range netInt.Subnets { - perNetOpts.StaticIPs = append(perNetOpts.StaticIPs, netAddress.IPNet.IP) - } - } - // Normally interfaces have a length of 1, only for some special cni configs we could get more. - // For now just use the first interface to get the ips this should be good enough for most cases. - break - } - netOpts[network] = perNetOpts - } - c.perNetworkOpts = netOpts - } - - defer func() { - if retErr != nil { - if err := c.cleanup(ctx); err != nil { - logrus.Errorf("Cleaning up container %s: %v", c.ID(), err) - } + if os.IsNotExist(err) { + // The kernel-provided files only exist if user namespaces are supported + logrus.Debugf("User or group ID mappings not available: %s", err) + } else { + return err } - }() - - if err := c.prepare(); err != nil { - return nil, 0, err - } - - // Read config - jsonPath := filepath.Join(c.bundlePath(), "config.json") - logrus.Debugf("generate.NewFromFile at %v", jsonPath) - g, err := generate.NewFromFile(jsonPath) - if err != nil { - logrus.Debugf("generate.NewFromFile failed with %v", err) - return nil, 0, err - } - - // Restoring from an import means that we are doing migration - if options.TargetFile != "" || options.CheckpointImageID != "" { - g.SetRootPath(c.state.Mountpoint) + } else { + g.Config.Linux.UIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.UIDMappings, availableUIDs) + g.Config.Linux.GIDMappings = rootless.MaybeSplitMappings(g.Config.Linux.GIDMappings, availableGIDs) } - // We want to have the same network namespace as before. - if c.config.CreateNetNS { - netNSPath := "" - if !c.config.PostConfigureNetNS { - netNSPath = c.state.NetNS.Path() - } + // Hostname handling: + // If we have a UTS namespace, set Hostname in the OCI spec. + // Set the HOSTNAME environment variable unless explicitly overridden by + // the user (already present in OCI spec). If we don't have a UTS ns, + // set it to the host's hostname instead. + hostname := c.Hostname() + foundUTS := false - if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), netNSPath); err != nil { - return nil, 0, err + for _, i := range c.config.Spec.Linux.Namespaces { + if i.Type == spec.UTSNamespace && i.Path == "" { + foundUTS = true + g.SetHostname(hostname) + break } } - - if options.Pod != "" { - // Running in a Pod means that we have to change all namespace settings to - // the ones from the infrastructure container. - pod, err := c.runtime.LookupPod(options.Pod) - if err != nil { - return nil, 0, fmt.Errorf("pod %q cannot be retrieved: %w", options.Pod, err) - } - - infraContainer, err := pod.InfraContainer() + if !foundUTS { + tmpHostname, err := os.Hostname() if err != nil { - return nil, 0, fmt.Errorf("cannot retrieved infra container from pod %q: %w", options.Pod, err) - } - - infraContainer.lock.Lock() - if err := infraContainer.syncContainer(); err != nil { - infraContainer.lock.Unlock() - return nil, 0, fmt.Errorf("error syncing infrastructure container %s status: %w", infraContainer.ID(), err) - } - if infraContainer.state.State != define.ContainerStateRunning { - if err := infraContainer.initAndStart(ctx); err != nil { - infraContainer.lock.Unlock() - return nil, 0, fmt.Errorf("error starting infrastructure container %s status: %w", infraContainer.ID(), err) - } - } - infraContainer.lock.Unlock() - - if c.config.IPCNsCtr != "" { - nsPath, err := infraContainer.namespacePath(IPCNS) - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieve IPC namespace path for Pod %q: %w", options.Pod, err) - } - if err := g.AddOrReplaceLinuxNamespace(string(spec.IPCNamespace), nsPath); err != nil { - return nil, 0, err - } - } - - if c.config.NetNsCtr != "" { - nsPath, err := infraContainer.namespacePath(NetNS) - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieve network namespace path for Pod %q: %w", options.Pod, err) - } - if err := g.AddOrReplaceLinuxNamespace(string(spec.NetworkNamespace), nsPath); err != nil { - return nil, 0, err - } - } - - if c.config.PIDNsCtr != "" { - nsPath, err := infraContainer.namespacePath(PIDNS) - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieve PID namespace path for Pod %q: %w", options.Pod, err) - } - if err := g.AddOrReplaceLinuxNamespace(string(spec.PIDNamespace), nsPath); err != nil { - return nil, 0, err - } - } - - if c.config.UTSNsCtr != "" { - nsPath, err := infraContainer.namespacePath(UTSNS) - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieve UTS namespace path for Pod %q: %w", options.Pod, err) - } - if err := g.AddOrReplaceLinuxNamespace(string(spec.UTSNamespace), nsPath); err != nil { - return nil, 0, err - } - } - - if c.config.CgroupNsCtr != "" { - nsPath, err := infraContainer.namespacePath(CgroupNS) - if err != nil { - return nil, 0, fmt.Errorf("cannot retrieve Cgroup namespace path for Pod %q: %w", options.Pod, err) - } - if err := g.AddOrReplaceLinuxNamespace(string(spec.CgroupNamespace), nsPath); err != nil { - return nil, 0, err - } - } - } - - if err := c.makeBindMounts(); err != nil { - return nil, 0, err - } - - if options.TargetFile != "" || options.CheckpointImageID != "" { - for dstPath, srcPath := range c.state.BindMounts { - newMount := spec.Mount{ - Type: "bind", - Source: srcPath, - Destination: dstPath, - Options: []string{"bind", "private"}, - } - if c.IsReadOnly() && dstPath != "/dev/shm" { - newMount.Options = append(newMount.Options, "ro", "nosuid", "noexec", "nodev") - } - if dstPath == "/dev/shm" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { - newMount.Options = append(newMount.Options, "nosuid", "noexec", "nodev") - } - if !MountExists(g.Mounts(), dstPath) { - g.AddMount(newMount) - } - } - } - - // Restore /dev/shm content - if c.config.ShmDir != "" && c.state.BindMounts["/dev/shm"] == c.config.ShmDir { - shmDirTarFileFullPath := filepath.Join(c.bundlePath(), metadata.DevShmCheckpointTar) - if _, err := os.Stat(shmDirTarFileFullPath); err != nil { - logrus.Debug("Container checkpoint doesn't contain dev/shm: ", err.Error()) - } else { - shmDirTarFile, err := os.Open(shmDirTarFileFullPath) - if err != nil { - return nil, 0, err - } - defer shmDirTarFile.Close() - - if err := archive.UntarUncompressed(shmDirTarFile, c.config.ShmDir, nil); err != nil { - return nil, 0, err - } - } - } - - // Cleanup for a working restore. - if err := c.removeConmonFiles(); err != nil { - return nil, 0, err - } - - // Save the OCI spec to disk - if err := c.saveSpec(g.Config); err != nil { - return nil, 0, err - } - - // When restoring from an imported archive, allow restoring the content of volumes. - // Volumes are created in setupContainer() - if !options.IgnoreVolumes && (options.TargetFile != "" || options.CheckpointImageID != "") { - for _, v := range c.config.NamedVolumes { - volumeFilePath := filepath.Join(c.bundlePath(), metadata.CheckpointVolumesDirectory, v.Name+".tar") - - volumeFile, err := os.Open(volumeFilePath) - if err != nil { - return nil, 0, fmt.Errorf("failed to open volume file %s: %w", volumeFilePath, err) - } - defer volumeFile.Close() - - volume, err := c.runtime.GetVolume(v.Name) - if err != nil { - return nil, 0, fmt.Errorf("failed to retrieve volume %s: %w", v.Name, err) - } - - mountPoint, err := volume.MountPoint() - if err != nil { - return nil, 0, err - } - if mountPoint == "" { - return nil, 0, fmt.Errorf("unable to import volume %s as it is not mounted: %w", volume.Name(), err) - } - if err := archive.UntarUncompressed(volumeFile, mountPoint, nil); err != nil { - return nil, 0, fmt.Errorf("failed to extract volume %s to %s: %w", volumeFilePath, mountPoint, err) - } + return err } + hostname = tmpHostname } - - // Before actually restarting the container, apply the root file-system changes - if !options.IgnoreRootfs { - if err := crutils.CRApplyRootFsDiffTar(c.bundlePath(), c.state.Mountpoint); err != nil { - return nil, 0, err - } - - if err := crutils.CRRemoveDeletedFiles(c.ID(), c.bundlePath(), c.state.Mountpoint); err != nil { - return nil, 0, err + needEnv := true + for _, checkEnv := range g.Config.Process.Env { + if strings.SplitN(checkEnv, "=", 2)[0] == "HOSTNAME" { + needEnv = false + break } } - - runtimeRestoreDuration, err = c.ociRuntime.CreateContainer(c, &options) - if err != nil { - return nil, 0, err + if needEnv { + g.AddProcessEnv("HOSTNAME", hostname) } - criuStatistics, err = func() (*define.CRIUCheckpointRestoreStatistics, error) { - if !options.PrintStats { - return nil, nil - } - statsDirectory, err := os.Open(c.bundlePath()) - if err != nil { - return nil, fmt.Errorf("not able to open %q: %w", c.bundlePath(), err) - } - - restoreStatistics, err := stats.CriuGetRestoreStats(statsDirectory) - if err != nil { - return nil, fmt.Errorf("displaying restore statistics not possible: %w", err) + if c.config.UTSNsCtr != "" { + if err := c.addNamespaceContainer(g, UTSNS, c.config.UTSNsCtr, spec.UTSNamespace); err != nil { + return err } - - return &define.CRIUCheckpointRestoreStatistics{ - PagesCompared: restoreStatistics.GetPagesCompared(), - PagesSkippedCow: restoreStatistics.GetPagesSkippedCow(), - ForkingTime: restoreStatistics.GetForkingTime(), - RestoreTime: restoreStatistics.GetRestoreTime(), - PagesRestored: restoreStatistics.GetPagesRestored(), - }, nil - }() - if err != nil { - return nil, 0, err } - - logrus.Debugf("Restored container %s", c.ID()) - - c.state.State = define.ContainerStateRunning - c.state.Checkpointed = false - c.state.Restored = true - c.state.CheckpointedTime = time.Time{} - c.state.RestoredTime = time.Now() - - if !options.Keep { - // Delete all checkpoint related files. At this point, in theory, all files - // should exist. Still ignoring errors for now as the container should be - // restored and running. Not erroring out just because some cleanup operation - // failed. Starting with the checkpoint directory - err = os.RemoveAll(c.CheckpointPath()) - if err != nil { - logrus.Debugf("Non-fatal: removal of checkpoint directory (%s) failed: %v", c.CheckpointPath(), err) - } - c.state.CheckpointPath = "" - err = os.RemoveAll(c.PreCheckPointPath()) - if err != nil { - logrus.Debugf("Non-fatal: removal of pre-checkpoint directory (%s) failed: %v", c.PreCheckPointPath(), err) - } - err = os.RemoveAll(c.CheckpointVolumesPath()) - if err != nil { - logrus.Debugf("Non-fatal: removal of checkpoint volumes directory (%s) failed: %v", c.CheckpointVolumesPath(), err) - } - cleanup := [...]string{ - "restore.log", - "dump.log", - stats.StatsDump, - stats.StatsRestore, - metadata.DevShmCheckpointTar, - metadata.NetworkStatusFile, - metadata.RootFsDiffTar, - metadata.DeletedFilesFile, - } - for _, del := range cleanup { - file := filepath.Join(c.bundlePath(), del) - err = os.Remove(file) - if err != nil { - logrus.Debugf("Non-fatal: removal of checkpoint file (%s) failed: %v", file, err) - } + if c.config.CgroupNsCtr != "" { + if err := c.addNamespaceContainer(g, CgroupNS, c.config.CgroupNsCtr, spec.CgroupNamespace); err != nil { + return err } - c.state.CheckpointLog = "" - c.state.RestoreLog = "" } - return criuStatistics, runtimeRestoreDuration, c.save() -} - -// Retrieves a container's "root" net namespace container dependency. -func (c *Container) getRootNetNsDepCtr() (depCtr *Container, err error) { - containersVisited := map[string]int{c.config.ID: 1} - nextCtr := c.config.NetNsCtr - for nextCtr != "" { - // Make sure we aren't in a loop - if _, visited := containersVisited[nextCtr]; visited { - return nil, errors.New("loop encountered while determining net namespace container") + if c.config.UserNsCtr == "" && c.config.IDMappings.AutoUserNs { + if err := g.AddOrReplaceLinuxNamespace(string(spec.UserNamespace), ""); err != nil { + return err } - containersVisited[nextCtr] = 1 - - depCtr, err = c.runtime.state.Container(nextCtr) - if err != nil { - return nil, fmt.Errorf("error fetching dependency %s of container %s: %w", c.config.NetNsCtr, c.ID(), err) + g.ClearLinuxUIDMappings() + for _, uidmap := range c.config.IDMappings.UIDMap { + g.AddLinuxUIDMapping(uint32(uidmap.HostID), uint32(uidmap.ContainerID), uint32(uidmap.Size)) } - // This should never happen without an error - if depCtr == nil { - break + g.ClearLinuxGIDMappings() + for _, gidmap := range c.config.IDMappings.GIDMap { + g.AddLinuxGIDMapping(uint32(gidmap.HostID), uint32(gidmap.ContainerID), uint32(gidmap.Size)) } - nextCtr = depCtr.config.NetNsCtr - } - - if depCtr == nil { - return nil, errors.New("unexpected error depCtr is nil without reported error from runtime state") - } - return depCtr, nil -} - -// Ensure standard bind mounts are mounted into all root directories (including chroot directories) -func (c *Container) mountIntoRootDirs(mountName string, mountPath string) error { - c.state.BindMounts[mountName] = mountPath - - for _, chrootDir := range c.config.ChrootDirs { - c.state.BindMounts[filepath.Join(chrootDir, mountName)] = mountPath } - return nil } -// Make standard bind mounts to include in the container -func (c *Container) makeBindMounts() error { - if err := os.Chown(c.state.RunDir, c.RootUID(), c.RootGID()); err != nil { - return fmt.Errorf("cannot chown run directory: %w", err) - } - - if c.state.BindMounts == nil { - c.state.BindMounts = make(map[string]string) - } - netDisabled, err := c.NetworkDisabled() - if err != nil { - return err - } - - if !netDisabled { - // If /etc/resolv.conf and /etc/hosts exist, delete them so we - // will recreate. Only do this if we aren't sharing them with - // another container. - if c.config.NetNsCtr == "" { - if resolvePath, ok := c.state.BindMounts["/etc/resolv.conf"]; ok { - if err := os.Remove(resolvePath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("container %s: %w", c.ID(), err) - } - delete(c.state.BindMounts, "/etc/resolv.conf") - } - if hostsPath, ok := c.state.BindMounts["/etc/hosts"]; ok { - if err := os.Remove(hostsPath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("container %s: %w", c.ID(), err) - } - delete(c.state.BindMounts, "/etc/hosts") - } - } - - if c.config.NetNsCtr != "" && (!c.config.UseImageResolvConf || !c.config.UseImageHosts) { - // We share a net namespace. - // We want /etc/resolv.conf and /etc/hosts from the - // other container. Unless we're not creating both of - // them. - depCtr, err := c.getRootNetNsDepCtr() - if err != nil { - return fmt.Errorf("error fetching network namespace dependency container for container %s: %w", c.ID(), err) - } - - // We need that container's bind mounts - bindMounts, err := depCtr.BindMounts() - if err != nil { - return fmt.Errorf("error fetching bind mounts from dependency %s of container %s: %w", depCtr.ID(), c.ID(), err) - } - - // The other container may not have a resolv.conf or /etc/hosts - // If it doesn't, don't copy them - resolvPath, exists := bindMounts["/etc/resolv.conf"] - if !c.config.UseImageResolvConf && exists { - err := c.mountIntoRootDirs("/etc/resolv.conf", resolvPath) - - if err != nil { - return fmt.Errorf("error assigning mounts to container %s: %w", c.ID(), err) - } - } - - // check if dependency container has an /etc/hosts file. - // It may not have one, so only use it if it does. - hostsPath, exists := bindMounts[config.DefaultHostsFile] - if !c.config.UseImageHosts && exists { - // we cannot use the dependency container lock due ABBA deadlocks in cleanup() - lock, err := lockfile.GetLockfile(hostsPath) - if err != nil { - return fmt.Errorf("failed to lock hosts file: %w", err) - } - lock.Lock() - - // add the newly added container to the hosts file - // we always use 127.0.0.1 as ip since they have the same netns - err = etchosts.Add(hostsPath, getLocalhostHostEntry(c)) - lock.Unlock() - if err != nil { - return fmt.Errorf("error creating hosts file for container %s which depends on container %s: %w", c.ID(), depCtr.ID(), err) - } - - // finally, save it in the new container - err = c.mountIntoRootDirs(config.DefaultHostsFile, hostsPath) - if err != nil { - return fmt.Errorf("error assigning mounts to container %s: %w", c.ID(), err) - } - } - - if !hasCurrentUserMapped(c) { - if err := makeAccessible(resolvPath, c.RootUID(), c.RootGID()); err != nil { - return err - } - if err := makeAccessible(hostsPath, c.RootUID(), c.RootGID()); err != nil { - return err - } - } - } else { - if !c.config.UseImageResolvConf { - if err := c.generateResolvConf(); err != nil { - return fmt.Errorf("error creating resolv.conf for container %s: %w", c.ID(), err) - } - } - - if !c.config.UseImageHosts { - if err := c.createHosts(); err != nil { - return fmt.Errorf("error creating hosts file for container %s: %w", c.ID(), err) - } - } - } - - if c.state.BindMounts["/etc/hosts"] != "" { - if err := c.relabel(c.state.BindMounts["/etc/hosts"], c.config.MountLabel, true); err != nil { - return err - } - } - - if c.state.BindMounts["/etc/resolv.conf"] != "" { - if err := c.relabel(c.state.BindMounts["/etc/resolv.conf"], c.config.MountLabel, true); err != nil { - return err - } - } - } else if !c.config.UseImageHosts && c.state.BindMounts["/etc/hosts"] == "" { - if err := c.createHosts(); err != nil { - return fmt.Errorf("error creating hosts file for container %s: %w", c.ID(), err) - } - } - - if c.config.ShmDir != "" { - // If ShmDir has a value SHM is always added when we mount the container - c.state.BindMounts["/dev/shm"] = c.config.ShmDir - } - - if c.config.Passwd == nil || *c.config.Passwd { - newPasswd, newGroup, err := c.generatePasswdAndGroup() - if err != nil { - return fmt.Errorf("error creating temporary passwd file for container %s: %w", c.ID(), err) - } - if newPasswd != "" { - // Make /etc/passwd - // If it already exists, delete so we can recreate - delete(c.state.BindMounts, "/etc/passwd") - c.state.BindMounts["/etc/passwd"] = newPasswd - } - if newGroup != "" { - // Make /etc/group - // If it already exists, delete so we can recreate - delete(c.state.BindMounts, "/etc/group") - c.state.BindMounts["/etc/group"] = newGroup - } - } - - // Make /etc/hostname - // This should never change, so no need to recreate if it exists - if _, ok := c.state.BindMounts["/etc/hostname"]; !ok { - hostnamePath, err := c.writeStringToRundir("hostname", c.Hostname()) - if err != nil { - return fmt.Errorf("error creating hostname file for container %s: %w", c.ID(), err) - } - c.state.BindMounts["/etc/hostname"] = hostnamePath - } - - // Make /etc/localtime - ctrTimezone := c.Timezone() - if ctrTimezone != "" { - // validate the format of the timezone specified if it's not "local" - if ctrTimezone != "local" { - _, err = time.LoadLocation(ctrTimezone) - if err != nil { - return fmt.Errorf("error finding timezone for container %s: %w", c.ID(), err) - } - } - if _, ok := c.state.BindMounts["/etc/localtime"]; !ok { - var zonePath string - if ctrTimezone == "local" { - zonePath, err = filepath.EvalSymlinks("/etc/localtime") - if err != nil { - return fmt.Errorf("error finding local timezone for container %s: %w", c.ID(), err) +func (c *Container) addRootPropagation(g *generate.Generator, mounts []spec.Mount) error { + // Determine property of RootPropagation based on volume properties. If + // a volume is shared, then keep root propagation shared. This should + // work for slave and private volumes too. + // + // For slave volumes, it can be either [r]shared/[r]slave. + // + // For private volumes any root propagation value should work. + rootPropagation := "" + for _, m := range mounts { + for _, opt := range m.Options { + switch opt { + case MountShared, MountRShared: + if rootPropagation != MountShared && rootPropagation != MountRShared { + rootPropagation = MountShared } - } else { - zone := filepath.Join("/usr/share/zoneinfo", ctrTimezone) - zonePath, err = filepath.EvalSymlinks(zone) - if err != nil { - return fmt.Errorf("error setting timezone for container %s: %w", c.ID(), err) + case MountSlave, MountRSlave: + if rootPropagation != MountShared && rootPropagation != MountRShared && rootPropagation != MountSlave && rootPropagation != MountRSlave { + rootPropagation = MountRSlave } } - localtimePath, err := c.copyTimezoneFile(zonePath) - if err != nil { - return fmt.Errorf("error setting timezone for container %s: %w", c.ID(), err) - } - c.state.BindMounts["/etc/localtime"] = localtimePath - } - } - - _, hasRunContainerenv := c.state.BindMounts["/run/.containerenv"] - if !hasRunContainerenv { - // check in the spec mounts - for _, m := range c.config.Spec.Mounts { - if m.Destination == "/run/.containerenv" || m.Destination == "/run" { - hasRunContainerenv = true - break - } } } - - // Make .containerenv if it does not exist - if !hasRunContainerenv { - containerenv := c.runtime.graphRootMountedFlag(c.config.Spec.Mounts) - isRootless := 0 - if rootless.IsRootless() { - isRootless = 1 - } - imageID, imageName := c.Image() - - if c.Privileged() { - // Populate the .containerenv with container information - containerenv = fmt.Sprintf(`engine="podman-%s" -name=%q -id=%q -image=%q -imageid=%q -rootless=%d -%s`, version.Version.String(), c.Name(), c.ID(), imageName, imageID, isRootless, containerenv) - } - containerenvPath, err := c.writeStringToRundir(".containerenv", containerenv) - if err != nil { - return fmt.Errorf("error creating containerenv file for container %s: %w", c.ID(), err) - } - c.state.BindMounts["/run/.containerenv"] = containerenvPath - } - - // Add Subscription Mounts - subscriptionMounts := subscriptions.MountsWithUIDGID(c.config.MountLabel, c.state.RunDir, c.runtime.config.Containers.DefaultMountsFile, c.state.Mountpoint, c.RootUID(), c.RootGID(), rootless.IsRootless(), false) - for _, mount := range subscriptionMounts { - if _, ok := c.state.BindMounts[mount.Destination]; !ok { - c.state.BindMounts[mount.Destination] = mount.Source - } - } - - // Secrets are mounted by getting the secret data from the secrets manager, - // copying the data into the container's static dir, - // then mounting the copied dir into /run/secrets. - // The secrets mounting must come after subscription mounts, since subscription mounts - // creates the /run/secrets dir in the container where we mount as well. - if len(c.Secrets()) > 0 { - // create /run/secrets if subscriptions did not create - if err := c.createSecretMountDir(); err != nil { - return fmt.Errorf("error creating secrets mount: %w", err) - } - for _, secret := range c.Secrets() { - secretFileName := secret.Name - base := "/run/secrets" - if secret.Target != "" { - secretFileName = secret.Target - // If absolute path for target given remove base. - if filepath.IsAbs(secretFileName) { - base = "" - } - } - src := filepath.Join(c.config.SecretsPath, secret.Name) - dest := filepath.Join(base, secretFileName) - c.state.BindMounts[dest] = src + if rootPropagation != "" { + logrus.Debugf("Set root propagation to %q", rootPropagation) + if err := g.SetLinuxRootPropagation(rootPropagation); err != nil { + return err } } - return nil } -// generateResolvConf generates a containers resolv.conf -func (c *Container) generateResolvConf() error { - var ( - networkNameServers []string - networkSearchDomains []string - ) +func (c *Container) setProcessLabel(g *generate.Generator) { + g.SetProcessSelinuxLabel(c.ProcessLabel()) +} - netStatus := c.getNetworkStatus() - for _, status := range netStatus { - if status.DNSServerIPs != nil { - for _, nsIP := range status.DNSServerIPs { - networkNameServers = append(networkNameServers, nsIP.String()) - } - logrus.Debugf("Adding nameserver(s) from network status of '%q'", status.DNSServerIPs) - } - if status.DNSSearchDomains != nil { - networkSearchDomains = append(networkSearchDomains, status.DNSSearchDomains...) - logrus.Debugf("Adding search domain(s) from network status of '%q'", status.DNSSearchDomains) - } - } +func (c *Container) setMountLabel(g *generate.Generator) { + g.SetLinuxMountLabel(c.MountLabel()) +} - ipv6, err := c.checkForIPv6(netStatus) +func (c *Container) setCgroupsPath(g *generate.Generator) error { + cgroupPath, err := c.getOCICgroupPath() if err != nil { return err } - - nameservers := make([]string, 0, len(c.runtime.config.Containers.DNSServers)+len(c.config.DNSServer)) - nameservers = append(nameservers, c.runtime.config.Containers.DNSServers...) - for _, ip := range c.config.DNSServer { - nameservers = append(nameservers, ip.String()) - } - // If the user provided dns, it trumps all; then dns masq; then resolv.conf - var search []string - keepHostServers := false - if len(nameservers) == 0 { - keepHostServers = true - // first add the nameservers from the networks status - nameservers = networkNameServers - // when we add network dns server we also have to add the search domains - search = networkSearchDomains - // slirp4netns has a built in DNS forwarder. - if c.config.NetMode.IsSlirp4netns() { - slirp4netnsDNS, err := GetSlirp4netnsDNS(c.slirp4netnsSubnet) - if err != nil { - logrus.Warn("Failed to determine Slirp4netns DNS: ", err.Error()) - } else { - nameservers = append(nameservers, slirp4netnsDNS.String()) - } - } - } - - if len(c.config.DNSSearch) > 0 || len(c.runtime.config.Containers.DNSSearches) > 0 { - customSearch := make([]string, 0, len(c.config.DNSSearch)+len(c.runtime.config.Containers.DNSSearches)) - customSearch = append(customSearch, c.runtime.config.Containers.DNSSearches...) - customSearch = append(customSearch, c.config.DNSSearch...) - search = customSearch - } - - options := make([]string, 0, len(c.config.DNSOption)+len(c.runtime.config.Containers.DNSOptions)) - options = append(options, c.runtime.config.Containers.DNSOptions...) - options = append(options, c.config.DNSOption...) - - destPath := filepath.Join(c.state.RunDir, "resolv.conf") - - if err := resolvconf.New(&resolvconf.Params{ - IPv6Enabled: ipv6, - KeepHostServers: keepHostServers, - Nameservers: nameservers, - Namespaces: c.config.Spec.Linux.Namespaces, - Options: options, - Path: destPath, - Searches: search, - }); err != nil { - return fmt.Errorf("error building resolv.conf for container %s: %w", c.ID(), err) - } - - return c.bindMountRootFile(destPath, resolvconf.DefaultResolvConf) + g.SetLinuxCgroupsPath(cgroupPath) + return nil } -// Check if a container uses IPv6. -func (c *Container) checkForIPv6(netStatus map[string]types.StatusBlock) (bool, error) { - for _, status := range netStatus { - for _, netInt := range status.Interfaces { - for _, netAddress := range netInt.Subnets { - // Note: only using To16() does not work since it also returns a valid ip for ipv4 - if netAddress.IPNet.IP.To4() == nil && netAddress.IPNet.IP.To16() != nil { - return true, nil - } - } +func (c *Container) addSlirp4netnsDNS(nameservers []string) []string { + // slirp4netns has a built in DNS forwarder. + if c.config.NetMode.IsSlirp4netns() { + slirp4netnsDNS, err := GetSlirp4netnsDNS(c.slirp4netnsSubnet) + if err != nil { + logrus.Warn("Failed to determine Slirp4netns DNS: ", err.Error()) + } else { + nameservers = append(nameservers, slirp4netnsDNS.String()) } } + return nameservers +} +func (c *Container) isSlirp4netnsIPv6() (bool, error) { if c.config.NetMode.IsSlirp4netns() { ctrNetworkSlipOpts := []string{} if c.config.NetworkOptions != nil { @@ -2428,804 +630,38 @@ func (c *Container) checkForIPv6(netStatus map[string]types.StatusBlock) (bool, return false, nil } -// Add a new nameserver to the container's resolv.conf, ensuring that it is the -// first nameserver present. -// Usable only with running containers. -func (c *Container) addNameserver(ips []string) error { - // Take no action if container is not running. - if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { - return nil - } - - // Do we have a resolv.conf at all? - path, ok := c.state.BindMounts[resolvconf.DefaultResolvConf] - if !ok { - return nil - } - - if err := resolvconf.Add(path, ips); err != nil { - return fmt.Errorf("adding new nameserver to container %s resolv.conf: %w", c.ID(), err) - } - - return nil -} - -// Remove an entry from the existing resolv.conf of the container. -// Usable only with running containers. -func (c *Container) removeNameserver(ips []string) error { - // Take no action if container is not running. - if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { - return nil - } - - // Do we have a resolv.conf at all? - path, ok := c.state.BindMounts[resolvconf.DefaultResolvConf] - if !ok { - return nil - } - - if err := resolvconf.Remove(path, ips); err != nil { - return fmt.Errorf("removing nameservers from container %s resolv.conf: %w", c.ID(), err) - } - - return nil -} - -func getLocalhostHostEntry(c *Container) etchosts.HostEntries { - return etchosts.HostEntries{{IP: "127.0.0.1", Names: []string{c.Hostname(), c.config.Name}}} -} - -// getHostsEntries returns the container ip host entries for the correct netmode -func (c *Container) getHostsEntries() (etchosts.HostEntries, error) { - var entries etchosts.HostEntries - names := []string{c.Hostname(), c.config.Name} - switch { - case c.config.NetMode.IsBridge(): - entries = etchosts.GetNetworkHostEntries(c.state.NetworkStatus, names...) - case c.config.NetMode.IsSlirp4netns(): - ip, err := GetSlirp4netnsIP(c.slirp4netnsSubnet) - if err != nil { - return nil, err - } - entries = etchosts.HostEntries{{IP: ip.String(), Names: names}} - default: - // check for net=none - if !c.config.CreateNetNS { - for _, ns := range c.config.Spec.Linux.Namespaces { - if ns.Type == spec.NetworkNamespace { - if ns.Path == "" { - entries = etchosts.HostEntries{{IP: "127.0.0.1", Names: names}} - } - break +// check for net=none +func (c *Container) hasNetNone() bool { + if !c.config.CreateNetNS { + for _, ns := range c.config.Spec.Linux.Namespaces { + if ns.Type == spec.NetworkNamespace { + if ns.Path == "" { + return true } } } } - return entries, nil -} - -func (c *Container) createHosts() error { - var containerIPsEntries etchosts.HostEntries - var err error - // if we configure the netns after the container create we should not add - // the hosts here since we have no information about the actual ips - // instead we will add them in c.completeNetworkSetup() - if !c.config.PostConfigureNetNS { - containerIPsEntries, err = c.getHostsEntries() - if err != nil { - return fmt.Errorf("failed to get container ip host entries: %w", err) - } - } - baseHostFile, err := etchosts.GetBaseHostFile(c.runtime.config.Containers.BaseHostsFile, c.state.Mountpoint) - if err != nil { - return err - } - - targetFile := filepath.Join(c.state.RunDir, "hosts") - err = etchosts.New(&etchosts.Params{ - BaseFile: baseHostFile, - ExtraHosts: c.config.HostAdd, - ContainerIPs: containerIPsEntries, - HostContainersInternalIP: etchosts.GetHostContainersInternalIP(c.runtime.config, c.state.NetworkStatus, c.runtime.network), - TargetFile: targetFile, - }) - if err != nil { - return err - } - - return c.bindMountRootFile(targetFile, config.DefaultHostsFile) -} - -// bindMountRootFile will chown and relabel the source file to make it usable in the container. -// It will also add the path to the container bind mount map. -// source is the path on the host, dest is the path in the container. -func (c *Container) bindMountRootFile(source, dest string) error { - if err := os.Chown(source, c.RootUID(), c.RootGID()); err != nil { - return err - } - if err := label.Relabel(source, c.MountLabel(), false); err != nil { - return err - } - - return c.mountIntoRootDirs(dest, source) -} - -// generateGroupEntry generates an entry or entries into /etc/group as -// required by container configuration. -// Generally speaking, we will make an entry under two circumstances: -// 1. The container is started as a specific user:group, and that group is both -// numeric, and does not already exist in /etc/group. -// 2. It is requested that Libpod add the group that launched Podman to -// /etc/group via AddCurrentUserPasswdEntry (though this does not trigger if -// the group in question already exists in /etc/passwd). -// Returns group entry (as a string that can be appended to /etc/group) and any -// error that occurred. -func (c *Container) generateGroupEntry() (string, error) { - groupString := "" - - // Things we *can't* handle: adding the user we added in - // generatePasswdEntry to any *existing* groups. - addedGID := 0 - if c.config.AddCurrentUserPasswdEntry { - entry, gid, err := c.generateCurrentUserGroupEntry() - if err != nil { - return "", err - } - groupString += entry - addedGID = gid - } - if c.config.User != "" { - entry, err := c.generateUserGroupEntry(addedGID) - if err != nil { - return "", err - } - groupString += entry - } - - return groupString, nil -} - -// Make an entry in /etc/group for the group of the user running podman iff we -// are rootless. -func (c *Container) generateCurrentUserGroupEntry() (string, int, error) { - gid := rootless.GetRootlessGID() - if gid == 0 { - return "", 0, nil - } - - g, err := user.LookupGroupId(strconv.Itoa(gid)) - if err != nil { - return "", 0, fmt.Errorf("failed to get current group: %w", err) - } - - // Look up group name to see if it exists in the image. - _, err = lookup.GetGroup(c.state.Mountpoint, g.Name) - if err != runcuser.ErrNoGroupEntries { - return "", 0, err - } - - // Look up GID to see if it exists in the image. - _, err = lookup.GetGroup(c.state.Mountpoint, g.Gid) - if err != runcuser.ErrNoGroupEntries { - return "", 0, err - } - - // We need to get the username of the rootless user so we can add it to - // the group. - username := "" - uid := rootless.GetRootlessUID() - if uid != 0 { - u, err := user.LookupId(strconv.Itoa(uid)) - if err != nil { - return "", 0, fmt.Errorf("failed to get current user to make group entry: %w", err) - } - username = u.Username - } - - // Make the entry. - return fmt.Sprintf("%s:x:%s:%s\n", g.Name, g.Gid, username), gid, nil -} - -// Make an entry in /etc/group for the group the container was specified to run -// as. -func (c *Container) generateUserGroupEntry(addedGID int) (string, error) { - if c.config.User == "" { - return "", nil - } - - splitUser := strings.SplitN(c.config.User, ":", 2) - group := splitUser[0] - if len(splitUser) > 1 { - group = splitUser[1] - } - - gid, err := strconv.ParseUint(group, 10, 32) - if err != nil { - return "", nil //nolint: nilerr - } - - if addedGID != 0 && addedGID == int(gid) { - return "", nil - } - - // Check if the group already exists - _, err = lookup.GetGroup(c.state.Mountpoint, group) - if err != runcuser.ErrNoGroupEntries { - return "", err - } - - return fmt.Sprintf("%d:x:%d:%s\n", gid, gid, splitUser[0]), nil -} - -// generatePasswdEntry generates an entry or entries into /etc/passwd as -// required by container configuration. -// Generally speaking, we will make an entry under two circumstances: -// 1. The container is started as a specific user who is not in /etc/passwd. -// This only triggers if the user is given as a *numeric* ID. -// 2. It is requested that Libpod add the user that launched Podman to -// /etc/passwd via AddCurrentUserPasswdEntry (though this does not trigger if -// the user in question already exists in /etc/passwd) or the UID to be added -// is 0). -// 3. The user specified additional host user accounts to add the the /etc/passwd file -// Returns password entry (as a string that can be appended to /etc/passwd) and -// any error that occurred. -func (c *Container) generatePasswdEntry() (string, error) { - passwdString := "" - - addedUID := 0 - for _, userid := range c.config.HostUsers { - // Look up User on host - u, err := util.LookupUser(userid) - if err != nil { - return "", err - } - entry, err := c.userPasswdEntry(u) - if err != nil { - return "", err - } - passwdString += entry - } - if c.config.AddCurrentUserPasswdEntry { - entry, uid, _, err := c.generateCurrentUserPasswdEntry() - if err != nil { - return "", err - } - passwdString += entry - addedUID = uid - } - if c.config.User != "" { - entry, err := c.generateUserPasswdEntry(addedUID) - if err != nil { - return "", err - } - passwdString += entry - } - - return passwdString, nil -} - -// generateCurrentUserPasswdEntry generates an /etc/passwd entry for the user -// running the container engine. -// Returns a passwd entry for the user, and the UID and GID of the added entry. -func (c *Container) generateCurrentUserPasswdEntry() (string, int, int, error) { - uid := rootless.GetRootlessUID() - if uid == 0 { - return "", 0, 0, nil - } - - u, err := user.LookupId(strconv.Itoa(uid)) - if err != nil { - return "", 0, 0, fmt.Errorf("failed to get current user: %w", err) - } - pwd, err := c.userPasswdEntry(u) - if err != nil { - return "", 0, 0, err - } - - return pwd, uid, rootless.GetRootlessGID(), nil -} - -func (c *Container) userPasswdEntry(u *user.User) (string, error) { - // Look up the user to see if it exists in the container image. - _, err := lookup.GetUser(c.state.Mountpoint, u.Username) - if err != runcuser.ErrNoPasswdEntries { - return "", err - } - - // Look up the UID to see if it exists in the container image. - _, err = lookup.GetUser(c.state.Mountpoint, u.Uid) - if err != runcuser.ErrNoPasswdEntries { - return "", err - } - - // If the user's actual home directory exists, or was mounted in - use - // that. - homeDir := c.WorkingDir() - hDir := u.HomeDir - for hDir != "/" { - if MountExists(c.config.Spec.Mounts, hDir) { - homeDir = u.HomeDir - break - } - hDir = filepath.Dir(hDir) - } - if homeDir != u.HomeDir { - for _, hDir := range c.UserVolumes() { - if hDir == u.HomeDir { - homeDir = u.HomeDir - break - } - } - } - // Set HOME environment if not already set - hasHomeSet := false - for _, s := range c.config.Spec.Process.Env { - if strings.HasPrefix(s, "HOME=") { - hasHomeSet = true - break - } - } - if !hasHomeSet { - c.config.Spec.Process.Env = append(c.config.Spec.Process.Env, fmt.Sprintf("HOME=%s", homeDir)) - } - if c.config.PasswdEntry != "" { - return c.passwdEntry(u.Username, u.Uid, u.Gid, u.Name, homeDir), nil - } - - return fmt.Sprintf("%s:*:%s:%s:%s:%s:/bin/sh\n", u.Username, u.Uid, u.Gid, u.Name, homeDir), nil -} - -// generateUserPasswdEntry generates an /etc/passwd entry for the container user -// to run in the container. -// The UID and GID of the added entry will also be returned. -// Accepts one argument, that being any UID that has already been added to the -// passwd file by other functions; if it matches the UID we were given, we don't -// need to do anything. -func (c *Container) generateUserPasswdEntry(addedUID int) (string, error) { - var ( - groupspec string - gid int - ) - if c.config.User == "" { - return "", nil - } - splitSpec := strings.SplitN(c.config.User, ":", 2) - userspec := splitSpec[0] - if len(splitSpec) > 1 { - groupspec = splitSpec[1] - } - // If a non numeric User, then don't generate passwd - uid, err := strconv.ParseUint(userspec, 10, 32) - if err != nil { - return "", nil //nolint: nilerr - } - - if addedUID != 0 && int(uid) == addedUID { - return "", nil - } - - // Look up the user to see if it exists in the container image - _, err = lookup.GetUser(c.state.Mountpoint, userspec) - if err != runcuser.ErrNoPasswdEntries { - return "", err - } - - if groupspec != "" { - ugid, err := strconv.ParseUint(groupspec, 10, 32) - if err == nil { - gid = int(ugid) - } else { - group, err := lookup.GetGroup(c.state.Mountpoint, groupspec) - if err != nil { - return "", fmt.Errorf("unable to get gid %s from group file: %w", groupspec, err) - } - gid = group.Gid - } - } - - if c.config.PasswdEntry != "" { - entry := c.passwdEntry(fmt.Sprintf("%d", uid), fmt.Sprintf("%d", uid), fmt.Sprintf("%d", gid), "container user", c.WorkingDir()) - return entry, nil - } - - return fmt.Sprintf("%d:*:%d:%d:container user:%s:/bin/sh\n", uid, uid, gid, c.WorkingDir()), nil -} - -func (c *Container) passwdEntry(username string, uid, gid, name, homeDir string) string { - s := c.config.PasswdEntry - s = strings.ReplaceAll(s, "$USERNAME", username) - s = strings.ReplaceAll(s, "$UID", uid) - s = strings.ReplaceAll(s, "$GID", gid) - s = strings.ReplaceAll(s, "$NAME", name) - s = strings.ReplaceAll(s, "$HOME", homeDir) - return s + "\n" -} - -// generatePasswdAndGroup generates container-specific passwd and group files -// iff g.config.User is a number or we are configured to make a passwd entry for -// the current user or the user specified HostsUsers -// Returns path to file to mount at /etc/passwd, path to file to mount at -// /etc/group, and any error that occurred. If no passwd/group file were -// required, the empty string will be returned for those path (this may occur -// even if no error happened). -// This may modify the mounted container's /etc/passwd and /etc/group instead of -// making copies to bind-mount in, so we don't break useradd (it wants to make a -// copy of /etc/passwd and rename the copy to /etc/passwd, which is impossible -// with a bind mount). This is done in cases where the container is *not* -// read-only. In this case, the function will return nothing ("", "", nil). -func (c *Container) generatePasswdAndGroup() (string, string, error) { - if !c.config.AddCurrentUserPasswdEntry && c.config.User == "" && - len(c.config.HostUsers) == 0 { - return "", "", nil - } - - needPasswd := true - needGroup := true - - // First, check if there's a mount at /etc/passwd or group, we don't - // want to interfere with user mounts. - if MountExists(c.config.Spec.Mounts, "/etc/passwd") { - needPasswd = false - } - if MountExists(c.config.Spec.Mounts, "/etc/group") { - needGroup = false - } - - // Next, check if we already made the files. If we didn't, don't need to - // do anything more. - if needPasswd { - passwdPath := filepath.Join(c.config.StaticDir, "passwd") - if _, err := os.Stat(passwdPath); err == nil { - needPasswd = false - } - } - if needGroup { - groupPath := filepath.Join(c.config.StaticDir, "group") - if _, err := os.Stat(groupPath); err == nil { - needGroup = false - } - } - - // If we don't need a /etc/passwd or /etc/group at this point we can - // just return. - if !needPasswd && !needGroup { - return "", "", nil - } - - passwdPath := "" - groupPath := "" - - ro := c.IsReadOnly() - - if needPasswd { - passwdEntry, err := c.generatePasswdEntry() - if err != nil { - return "", "", err - } - - needsWrite := passwdEntry != "" - switch { - case ro && needsWrite: - logrus.Debugf("Making /etc/passwd for container %s", c.ID()) - originPasswdFile, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/passwd") - if err != nil { - return "", "", fmt.Errorf("error creating path to container %s /etc/passwd: %w", c.ID(), err) - } - orig, err := ioutil.ReadFile(originPasswdFile) - if err != nil && !os.IsNotExist(err) { - return "", "", err - } - passwdFile, err := c.writeStringToStaticDir("passwd", string(orig)+passwdEntry) - if err != nil { - return "", "", fmt.Errorf("failed to create temporary passwd file: %w", err) - } - if err := os.Chmod(passwdFile, 0644); err != nil { - return "", "", err - } - passwdPath = passwdFile - case !ro && needsWrite: - logrus.Debugf("Modifying container %s /etc/passwd", c.ID()) - containerPasswd, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/passwd") - if err != nil { - return "", "", fmt.Errorf("error looking up location of container %s /etc/passwd: %w", c.ID(), err) - } - - f, err := os.OpenFile(containerPasswd, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) - if err != nil { - return "", "", fmt.Errorf("container %s: %w", c.ID(), err) - } - defer f.Close() - - if _, err := f.WriteString(passwdEntry); err != nil { - return "", "", fmt.Errorf("unable to append to container %s /etc/passwd: %w", c.ID(), err) - } - default: - logrus.Debugf("Not modifying container %s /etc/passwd", c.ID()) - } - } - if needGroup { - groupEntry, err := c.generateGroupEntry() - if err != nil { - return "", "", err - } - - needsWrite := groupEntry != "" - switch { - case ro && needsWrite: - logrus.Debugf("Making /etc/group for container %s", c.ID()) - originGroupFile, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/group") - if err != nil { - return "", "", fmt.Errorf("error creating path to container %s /etc/group: %w", c.ID(), err) - } - orig, err := ioutil.ReadFile(originGroupFile) - if err != nil && !os.IsNotExist(err) { - return "", "", err - } - groupFile, err := c.writeStringToStaticDir("group", string(orig)+groupEntry) - if err != nil { - return "", "", fmt.Errorf("failed to create temporary group file: %w", err) - } - if err := os.Chmod(groupFile, 0644); err != nil { - return "", "", err - } - groupPath = groupFile - case !ro && needsWrite: - logrus.Debugf("Modifying container %s /etc/group", c.ID()) - containerGroup, err := securejoin.SecureJoin(c.state.Mountpoint, "/etc/group") - if err != nil { - return "", "", fmt.Errorf("error looking up location of container %s /etc/group: %w", c.ID(), err) - } - - f, err := os.OpenFile(containerGroup, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600) - if err != nil { - return "", "", fmt.Errorf("container %s: %w", c.ID(), err) - } - defer f.Close() - - if _, err := f.WriteString(groupEntry); err != nil { - return "", "", fmt.Errorf("unable to append to container %s /etc/group: %w", c.ID(), err) - } - default: - logrus.Debugf("Not modifying container %s /etc/group", c.ID()) - } - } - - return passwdPath, groupPath, nil -} - -func isRootlessCgroupSet(cgroup string) bool { - // old versions of podman were setting the CgroupParent to CgroupfsDefaultCgroupParent - // by default. Avoid breaking these versions and check whether the cgroup parent is - // set to the default and in this case enable the old behavior. It should not be a real - // problem because the default CgroupParent is usually owned by root so rootless users - // cannot access it. - // This check might be lifted in a future version of Podman. - // Check both that the cgroup or its parent is set to the default value (used by pods). - return cgroup != CgroupfsDefaultCgroupParent && filepath.Dir(cgroup) != CgroupfsDefaultCgroupParent -} - -func (c *Container) expectPodCgroup() (bool, error) { - unified, err := cgroups.IsCgroup2UnifiedMode() - if err != nil { - return false, err - } - cgroupManager := c.CgroupManager() - switch { - case c.config.NoCgroups: - return false, nil - case cgroupManager == config.SystemdCgroupsManager: - return !rootless.IsRootless() || unified, nil - case cgroupManager == config.CgroupfsCgroupsManager: - return !rootless.IsRootless(), nil - default: - return false, fmt.Errorf("invalid cgroup mode %s requested for pods: %w", cgroupManager, define.ErrInvalidArg) - } -} - -// Get cgroup path in a format suitable for the OCI spec -func (c *Container) getOCICgroupPath() (string, error) { - unified, err := cgroups.IsCgroup2UnifiedMode() - if err != nil { - return "", err - } - cgroupManager := c.CgroupManager() - switch { - case c.config.NoCgroups: - return "", nil - case c.config.CgroupsMode == cgroupSplit: - selfCgroup, err := utils.GetOwnCgroupDisallowRoot() - if err != nil { - return "", err - } - return filepath.Join(selfCgroup, fmt.Sprintf("libpod-payload-%s", c.ID())), nil - case cgroupManager == config.SystemdCgroupsManager: - // When the OCI runtime is set to use Systemd as a cgroup manager, it - // expects cgroups to be passed as follows: - // slice:prefix:name - systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID()) - logrus.Debugf("Setting Cgroups for container %s to %s", c.ID(), systemdCgroups) - return systemdCgroups, nil - case (rootless.IsRootless() && (cgroupManager == config.CgroupfsCgroupsManager || !unified)): - if c.config.CgroupParent == "" || !isRootlessCgroupSet(c.config.CgroupParent) { - return "", nil - } - fallthrough - case cgroupManager == config.CgroupfsCgroupsManager: - cgroupPath := filepath.Join(c.config.CgroupParent, fmt.Sprintf("libpod-%s", c.ID())) - logrus.Debugf("Setting Cgroup path for container %s to %s", c.ID(), cgroupPath) - return cgroupPath, nil - default: - return "", fmt.Errorf("invalid cgroup manager %s requested: %w", cgroupManager, define.ErrInvalidArg) - } -} - -func (c *Container) copyTimezoneFile(zonePath string) (string, error) { - localtimeCopy := filepath.Join(c.state.RunDir, "localtime") - file, err := os.Stat(zonePath) - if err != nil { - return "", err - } - if file.IsDir() { - return "", errors.New("invalid timezone: is a directory") - } - src, err := os.Open(zonePath) - if err != nil { - return "", err - } - defer src.Close() - dest, err := os.Create(localtimeCopy) - if err != nil { - return "", err - } - defer dest.Close() - _, err = io.Copy(dest, src) - if err != nil { - return "", err - } - if err := c.relabel(localtimeCopy, c.config.MountLabel, false); err != nil { - return "", err - } - if err := dest.Chown(c.RootUID(), c.RootGID()); err != nil { - return "", err - } - return localtimeCopy, err -} - -func (c *Container) cleanupOverlayMounts() error { - return overlay.CleanupContent(c.config.StaticDir) -} - -// Creates and mounts an empty dir to mount secrets into, if it does not already exist -func (c *Container) createSecretMountDir() error { - src := filepath.Join(c.state.RunDir, "/run/secrets") - _, err := os.Stat(src) - if os.IsNotExist(err) { - oldUmask := umask.Set(0) - defer umask.Set(oldUmask) - - if err := os.MkdirAll(src, 0755); err != nil { - return err - } - if err := label.Relabel(src, c.config.MountLabel, false); err != nil { - return err - } - if err := os.Chown(src, c.RootUID(), c.RootGID()); err != nil { - return err - } - c.state.BindMounts["/run/secrets"] = src - return nil - } - - return err + return false } -// Fix ownership and permissions of the specified volume if necessary. -func (c *Container) fixVolumePermissions(v *ContainerNamedVolume) error { - vol, err := c.runtime.state.Volume(v.Name) - if err != nil { - return fmt.Errorf("error retrieving named volume %s for container %s: %w", v.Name, c.ID(), err) - } - - vol.lock.Lock() - defer vol.lock.Unlock() - - // The volume may need a copy-up. Check the state. - if err := vol.update(); err != nil { +func setVolumeAtime(mountPoint string, st os.FileInfo) error { + stat := st.Sys().(*syscall.Stat_t) + atime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec)) //nolint: unconvert + if err := os.Chtimes(mountPoint, atime, st.ModTime()); err != nil { return err } - - // Volumes owned by a volume driver are not chowned - we don't want to - // mess with a mount not managed by us. - if vol.state.NeedsChown && !vol.UsesVolumeDriver() { - vol.state.NeedsChown = false - - uid := int(c.config.Spec.Process.User.UID) - gid := int(c.config.Spec.Process.User.GID) - - if c.config.IDMappings.UIDMap != nil { - p := idtools.IDPair{ - UID: uid, - GID: gid, - } - mappings := idtools.NewIDMappingsFromMaps(c.config.IDMappings.UIDMap, c.config.IDMappings.GIDMap) - newPair, err := mappings.ToHost(p) - if err != nil { - return fmt.Errorf("error mapping user %d:%d: %w", uid, gid, err) - } - uid = newPair.UID - gid = newPair.GID - } - - vol.state.UIDChowned = uid - vol.state.GIDChowned = gid - - if err := vol.save(); err != nil { - return err - } - - mountPoint, err := vol.MountPoint() - if err != nil { - return err - } - - if err := os.Lchown(mountPoint, uid, gid); err != nil { - return err - } - - // Make sure the new volume matches the permissions of the target directory. - // https://github.com/containers/podman/issues/10188 - st, err := os.Lstat(filepath.Join(c.state.Mountpoint, v.Dest)) - if err == nil { - if stat, ok := st.Sys().(*syscall.Stat_t); ok { - if err := os.Lchown(mountPoint, int(stat.Uid), int(stat.Gid)); err != nil { - return err - } - } - if err := os.Chmod(mountPoint, st.Mode()); err != nil { - return err - } - stat := st.Sys().(*syscall.Stat_t) - atime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec)) //nolint: unconvert - if err := os.Chtimes(mountPoint, atime, st.ModTime()); err != nil { - return err - } - } else if !os.IsNotExist(err) { - return err - } - } return nil } -func (c *Container) relabel(src, mountLabel string, recurse bool) error { - if !selinux.GetEnabled() || mountLabel == "" { - return nil - } - // only relabel on initial creation of container - if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateUnknown) { - label, err := label.FileLabel(src) - if err != nil { - return err - } - // If labels are different, might be on a tmpfs - if label == mountLabel { - return nil - } - } - return label.Relabel(src, mountLabel, recurse) -} - -func (c *Container) ChangeHostPathOwnership(src string, recurse bool, uid, gid int) error { - // only chown on initial creation of container - if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateUnknown) { - st, err := os.Stat(src) +func (c *Container) makePlatformBindMounts() error { + // Make /etc/hostname + // This should never change, so no need to recreate if it exists + if _, ok := c.state.BindMounts["/etc/hostname"]; !ok { + hostnamePath, err := c.writeStringToRundir("hostname", c.Hostname()) if err != nil { - return err - } - - // If labels are different, might be on a tmpfs - if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { - return nil + return fmt.Errorf("creating hostname file for container %s: %w", c.ID(), err) } + c.state.BindMounts["/etc/hostname"] = hostnamePath } - return chown.ChangeHostPathOwnership(src, recurse, uid, gid) + return nil } diff --git a/libpod/container_internal_unsupported.go b/libpod/container_internal_unsupported.go new file mode 100644 index 000000000..1967c577b --- /dev/null +++ b/libpod/container_internal_unsupported.go @@ -0,0 +1,99 @@ +//go:build !linux && !freebsd +// +build !linux,!freebsd + +package libpod + +import ( + "context" + "errors" + + "github.com/containers/common/libnetwork/etchosts" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/pkg/lookup" + spec "github.com/opencontainers/runtime-spec/specs-go" +) + +func (c *Container) mountSHM(shmOptions string) error { + return errors.New("not implemented (*Container) mountSHM") +} + +func (c *Container) unmountSHM(mount string) error { + return errors.New("not implemented (*Container) unmountSHM") +} + +func (c *Container) cleanupOverlayMounts() error { + return errors.New("not implemented (*Container) cleanupOverlayMounts") +} + +// prepare mounts the container and sets up other required resources like net +// namespaces +func (c *Container) prepare() error { + return errors.New("not implemented (*Container) prepare") +} + +// resolveWorkDir resolves the container's workdir and, depending on the +// configuration, will create it, or error out if it does not exist. +// Note that the container must be mounted before. +func (c *Container) resolveWorkDir() error { + return errors.New("not implemented (*Container) resolveWorkDir") +} + +// cleanupNetwork unmounts and cleans up the container's network +func (c *Container) cleanupNetwork() error { + return errors.New("not implemented (*Container) cleanupNetwork") +} + +// reloadNetwork reloads the network for the given container, recreating +// firewall rules. +func (c *Container) reloadNetwork() error { + return errors.New("not implemented (*Container) reloadNetwork") +} + +// Generate spec for a container +// Accepts a map of the container's dependencies +func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { + return nil, errors.New("not implemented (*Container) generateSpec") +} + +func (c *Container) getUserOverrides() *lookup.Overrides { + return &lookup.Overrides{} +} + +func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointOptions) (*define.CRIUCheckpointRestoreStatistics, int64, error) { + return nil, 0, errors.New("not implemented (*Container) checkpoint") +} + +func (c *Container) restore(ctx context.Context, options ContainerCheckpointOptions) (criuStatistics *define.CRIUCheckpointRestoreStatistics, runtimeRestoreDuration int64, retErr error) { + return nil, 0, errors.New("not implemented (*Container) restore") +} + +// getHostsEntries returns the container ip host entries for the correct netmode +func (c *Container) getHostsEntries() (etchosts.HostEntries, error) { + return nil, errors.New("unsupported (*Container) getHostsEntries") +} + +// Fix ownership and permissions of the specified volume if necessary. +func (c *Container) fixVolumePermissions(v *ContainerNamedVolume) error { + return errors.New("unsupported (*Container) fixVolumePermissions") +} + +func (c *Container) expectPodCgroup() (bool, error) { + return false, errors.New("unsupported (*Container) expectPodCgroup") +} + +// Get cgroup path in a format suitable for the OCI spec +func (c *Container) getOCICgroupPath() (string, error) { + return "", errors.New("unsupported (*Container) getOCICgroupPath") +} + +func getLocalhostHostEntry(c *Container) etchosts.HostEntries { + return nil +} + +func isRootlessCgroupSet(cgroup string) bool { + return false +} + +func openDirectory(path string) (fd int, err error) { + return -1, errors.New("unsupported openDirectory") +} diff --git a/libpod/container_linux.go b/libpod/container_linux.go index 8b517e69f..9c17a1966 100644 --- a/libpod/container_linux.go +++ b/libpod/container_linux.go @@ -5,6 +5,7 @@ package libpod import ( "github.com/containernetworking/plugins/pkg/ns" + spec "github.com/opencontainers/runtime-spec/specs-go" ) type containerPlatformState struct { @@ -13,3 +14,17 @@ type containerPlatformState struct { // told to join another container's network namespace NetNS ns.NetNS `json:"-"` } + +func networkDisabled(c *Container) (bool, error) { + if c.config.CreateNetNS { + return false, nil + } + if !c.config.PostConfigureNetNS { + for _, ns := range c.config.Spec.Linux.Namespaces { + if ns.Type == spec.NetworkNamespace { + return ns.Path == "", nil + } + } + } + return false, nil +} diff --git a/libpod/container_path_resolution.go b/libpod/container_path_resolution.go index 35622d623..cd86df540 100644 --- a/libpod/container_path_resolution.go +++ b/libpod/container_path_resolution.go @@ -119,15 +119,29 @@ func findVolume(c *Container, containerPath string) (*Volume, error) { return nil, nil } +// isSubDir checks whether path is a subdirectory of root. +func isSubDir(path, root string) bool { + // check if the specified container path is below a bind mount. + rel, err := filepath.Rel(root, path) + if err != nil { + return false + } + return rel != ".." && !strings.HasPrefix(rel, "../") +} + // isPathOnVolume returns true if the specified containerPath is a subdir of any // Volume's destination. func isPathOnVolume(c *Container, containerPath string) bool { cleanedContainerPath := filepath.Clean(containerPath) for _, vol := range c.config.NamedVolumes { - if cleanedContainerPath == filepath.Clean(vol.Dest) { + cleanedDestination := filepath.Clean(vol.Dest) + if cleanedContainerPath == cleanedDestination { return true } - for dest := vol.Dest; dest != "/" && dest != "."; dest = filepath.Dir(dest) { + if isSubDir(cleanedContainerPath, cleanedDestination) { + return true + } + for dest := cleanedDestination; dest != "/" && dest != "."; dest = filepath.Dir(dest) { if cleanedContainerPath == dest { return true } @@ -152,15 +166,19 @@ func findBindMount(c *Container, containerPath string) *specs.Mount { return nil } -/// isPathOnBindMount returns true if the specified containerPath is a subdir of any +/// isPathOnMount returns true if the specified containerPath is a subdir of any // Mount's destination. -func isPathOnBindMount(c *Container, containerPath string) bool { +func isPathOnMount(c *Container, containerPath string) bool { cleanedContainerPath := filepath.Clean(containerPath) for _, m := range c.config.Spec.Mounts { - if cleanedContainerPath == filepath.Clean(m.Destination) { + cleanedDestination := filepath.Clean(m.Destination) + if cleanedContainerPath == cleanedDestination { + return true + } + if isSubDir(cleanedContainerPath, cleanedDestination) { return true } - for dest := m.Destination; dest != "/" && dest != "."; dest = filepath.Dir(dest) { + for dest := cleanedDestination; dest != "/" && dest != "."; dest = filepath.Dir(dest) { if cleanedContainerPath == dest { return true } diff --git a/libpod/container_path_resolution_test.go b/libpod/container_path_resolution_test.go new file mode 100644 index 000000000..f906c752d --- /dev/null +++ b/libpod/container_path_resolution_test.go @@ -0,0 +1,28 @@ +package libpod + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIsSubDir(t *testing.T) { + assert.True(t, isSubDir("/foo", "/foo")) + assert.True(t, isSubDir("/foo/bar", "/foo")) + assert.True(t, isSubDir("/foo/bar", "/foo/")) + assert.True(t, isSubDir("/foo/bar", "/foo//")) + assert.True(t, isSubDir("/foo/bar/", "/foo")) + assert.True(t, isSubDir("/foo/bar/baz/", "/foo")) + assert.True(t, isSubDir("/foo/bar/baz/", "/foo/bar")) + assert.True(t, isSubDir("/foo/bar/baz/", "/foo/bar/")) + assert.False(t, isSubDir("/foo/bar/baz/", "/foobar/")) + assert.False(t, isSubDir("/foo/bar/baz/../../", "/foobar/")) + assert.False(t, isSubDir("/foo/bar/baz/", "../foo/bar")) + assert.False(t, isSubDir("/foo/bar/baz/", "../foo/")) + assert.False(t, isSubDir("/foo/bar/baz/", "../foo")) + assert.False(t, isSubDir("/", "..")) + assert.False(t, isSubDir("//", "..")) + assert.False(t, isSubDir("//", "../")) + assert.False(t, isSubDir("//", "..//")) + assert.True(t, isSubDir("/foo/bar/baz/../../", "/foo/")) +} diff --git a/libpod/container_stat_linux.go b/libpod/container_stat_linux.go index 72aabb516..dc3a524f5 100644 --- a/libpod/container_stat_linux.go +++ b/libpod/container_stat_linux.go @@ -168,7 +168,7 @@ func secureStat(root string, path string) (*copier.StatForItem, error) { if stat.IsSymlink { target, err := copier.Eval(root, path, copier.EvalOptions{}) if err != nil { - return nil, fmt.Errorf("error evaluating symlink in container: %w", err) + return nil, fmt.Errorf("evaluating symlink in container: %w", err) } // Need to make sure the symlink is relative to the root! target = strings.TrimPrefix(target, root) diff --git a/libpod/container_stat_unsupported.go b/libpod/container_stat_unsupported.go new file mode 100644 index 000000000..2f1acd44d --- /dev/null +++ b/libpod/container_stat_unsupported.go @@ -0,0 +1,14 @@ +//go:build !linux +// +build !linux + +package libpod + +import ( + "errors" + + "github.com/containers/podman/v4/libpod/define" +) + +func (c *Container) stat(containerMountPoint string, containerPath string) (*define.FileInfo, string, string, error) { + return nil, "", "", errors.New("Containers stat not supported on this platform") +} diff --git a/libpod/container_top_linux.go b/libpod/container_top_linux.go index 5571edf73..b9916c3a3 100644 --- a/libpod/container_top_linux.go +++ b/libpod/container_top_linux.go @@ -70,7 +70,7 @@ func (c *Container) Top(descriptors []string) ([]string, error) { output, err = c.execPS(psDescriptors) if err != nil { - return nil, fmt.Errorf("error executing ps(1) in the container: %w", err) + return nil, fmt.Errorf("executing ps(1) in the container: %w", err) } // Trick: filter the ps command from the output instead of diff --git a/libpod/container_top_unsupported.go b/libpod/container_top_unsupported.go new file mode 100644 index 000000000..a8d9b970b --- /dev/null +++ b/libpod/container_top_unsupported.go @@ -0,0 +1,14 @@ +//go:build !linux +// +build !linux + +package libpod + +import ( + "errors" +) + +// Top gathers statistics about the running processes in a container. It returns a +// []string for output +func (c *Container) Top(descriptors []string) ([]string, error) { + return nil, errors.New("not implemented (*Container) Top") +} diff --git a/libpod/container_unsupported.go b/libpod/container_unsupported.go new file mode 100644 index 000000000..16bf11622 --- /dev/null +++ b/libpod/container_unsupported.go @@ -0,0 +1,7 @@ +//go:build !linux && !freebsd +// +build !linux,!freebsd + +package libpod + +type containerPlatformState struct { +} diff --git a/libpod/container_validate.go b/libpod/container_validate.go index e280c60d2..f4611ecce 100644 --- a/libpod/container_validate.go +++ b/libpod/container_validate.go @@ -133,5 +133,13 @@ func (c *Container) validate() error { if len(c.config.InitContainerType) > 0 && len(c.config.Pod) < 1 { return fmt.Errorf("init containers must be created in a pod: %w", define.ErrInvalidArg) } + + if c.config.SdNotifyMode == define.SdNotifyModeIgnore && len(c.config.SdNotifySocket) > 0 { + return fmt.Errorf("cannot set sd-notify socket %q with sd-notify mode %q", c.config.SdNotifySocket, c.config.SdNotifyMode) + } + + if c.config.HealthCheckOnFailureAction != define.HealthCheckOnFailureActionNone && c.config.HealthCheckConfig == nil { + return fmt.Errorf("cannot set on-failure action to %s without a health check", c.config.HealthCheckOnFailureAction.String()) + } return nil } diff --git a/libpod/define/config.go b/libpod/define/config.go index 0181bd31c..1fad5cc9a 100644 --- a/libpod/define/config.go +++ b/libpod/define/config.go @@ -81,15 +81,8 @@ const NoLogging = "none" // PassthroughLogging is the string conmon expects when specifying to use the passthrough driver const PassthroughLogging = "passthrough" -// Strings used for --sdnotify option to podman -const ( - SdNotifyModeContainer = "container" - SdNotifyModeConmon = "conmon" - SdNotifyModeIgnore = "ignore" -) - // DefaultRlimitValue is the value set by default for nofile and nproc const RLimitDefaultValue = uint64(1048576) // BindMountPrefix distinguishes its annotations from others -const BindMountPrefix = "bind-mount-options:" +const BindMountPrefix = "bind-mount-options" diff --git a/libpod/define/container_inspect.go b/libpod/define/container_inspect.go index e6a34ba61..da5c58f27 100644 --- a/libpod/define/container_inspect.go +++ b/libpod/define/container_inspect.go @@ -55,6 +55,8 @@ type InspectContainerConfig struct { StopSignal uint `json:"StopSignal"` // Configured healthcheck for the container Healthcheck *manifest.Schema2HealthConfig `json:"Healthcheck,omitempty"` + // HealthcheckOnFailureAction defines an action to take once the container turns unhealthy. + HealthcheckOnFailureAction string `json:"HealthcheckOnFailureAction,omitempty"` // CreateCommand is the full command plus arguments of the process the // container has been created with. CreateCommand []string `json:"CreateCommand,omitempty"` @@ -79,6 +81,10 @@ type InspectContainerConfig struct { // treated as root directories. Standard bind mounts will be mounted // into paths relative to these directories. ChrootDirs []string `json:"ChrootDirs,omitempty"` + // SdNotifyMode is the sd-notify mode of the container. + SdNotifyMode string `json:"sdNotifyMode,omitempty"` + // SdNotifySocket is the NOTIFY_SOCKET in use by/configured for the container. + SdNotifySocket string `json:"sdNotifySocket,omitempty"` } // InspectRestartPolicy holds information about the container's restart policy. diff --git a/libpod/define/errors.go b/libpod/define/errors.go index fd27e89de..be471c27e 100644 --- a/libpod/define/errors.go +++ b/libpod/define/errors.go @@ -179,6 +179,9 @@ var ( // ErrNetworkInUse indicates the requested operation failed because the network was in use ErrNetworkInUse = errors.New("network is being used") + // ErrNetworkConnected indicates that the required operation failed because the container is already a network endpoint + ErrNetworkConnected = errors.New("network is already connected") + // ErrStoreNotInitialized indicates that the container storage was never // initialized. ErrStoreNotInitialized = errors.New("the container storage was never initialized") diff --git a/libpod/define/exec_codes.go b/libpod/define/exec_codes.go index 3f2da4910..a84730e72 100644 --- a/libpod/define/exec_codes.go +++ b/libpod/define/exec_codes.go @@ -11,8 +11,8 @@ const ( // ExecErrorCodeGeneric is the default error code to return from an exec session if libpod failed // prior to calling the runtime ExecErrorCodeGeneric = 125 - // ExecErrorCodeCannotInvoke is the error code to return when the runtime fails to invoke a command - // an example of this can be found by trying to execute a directory: + // ExecErrorCodeCannotInvoke is the error code to return when the runtime fails to invoke a command. + // An example of this can be found by trying to execute a directory: // `podman exec -l /etc` ExecErrorCodeCannotInvoke = 126 // ExecErrorCodeNotFound is the error code to return when a command cannot be found diff --git a/libpod/define/healthchecks.go b/libpod/define/healthchecks.go index f71274350..274e02561 100644 --- a/libpod/define/healthchecks.go +++ b/libpod/define/healthchecks.go @@ -1,5 +1,10 @@ package define +import ( + "fmt" + "strings" +) + const ( // HealthCheckHealthy describes a healthy container HealthCheckHealthy string = "healthy" @@ -57,3 +62,72 @@ const ( // HealthConfigTestCmdShell runs commands with the system's default shell HealthConfigTestCmdShell = "CMD-SHELL" ) + +// HealthCheckOnFailureAction defines how Podman reacts when a container's health +// status turns unhealthy. +type HealthCheckOnFailureAction int + +// Healthcheck on-failure actions. +const ( + // HealthCheckOnFailureActionNonce instructs Podman to not react on an unhealthy status. + HealthCheckOnFailureActionNone = iota // Must be first iota for backwards compatibility + // HealthCheckOnFailureActionInvalid denotes an invalid on-failure policy. + HealthCheckOnFailureActionInvalid = iota + // HealthCheckOnFailureActionNonce instructs Podman to kill the container on an unhealthy status. + HealthCheckOnFailureActionKill = iota + // HealthCheckOnFailureActionNonce instructs Podman to restart the container on an unhealthy status. + HealthCheckOnFailureActionRestart = iota + // HealthCheckOnFailureActionNonce instructs Podman to stop the container on an unhealthy status. + HealthCheckOnFailureActionStop = iota +) + +// String representations for on-failure actions. +const ( + strHealthCheckOnFailureActionNone = "none" + strHealthCheckOnFailureActionInvalid = "invalid" + strHealthCheckOnFailureActionKill = "kill" + strHealthCheckOnFailureActionRestart = "restart" + strHealthCheckOnFailureActionStop = "stop" +) + +// SupportedHealthCheckOnFailureActions lists all supported healthcheck restart policies. +var SupportedHealthCheckOnFailureActions = []string{ + strHealthCheckOnFailureActionNone, + strHealthCheckOnFailureActionKill, + strHealthCheckOnFailureActionRestart, + strHealthCheckOnFailureActionStop, +} + +// String returns the string representation of the HealthCheckOnFailureAction. +func (h HealthCheckOnFailureAction) String() string { + switch h { + case HealthCheckOnFailureActionNone: + return strHealthCheckOnFailureActionNone + case HealthCheckOnFailureActionKill: + return strHealthCheckOnFailureActionKill + case HealthCheckOnFailureActionRestart: + return strHealthCheckOnFailureActionRestart + case HealthCheckOnFailureActionStop: + return strHealthCheckOnFailureActionStop + default: + return strHealthCheckOnFailureActionInvalid + } +} + +// ParseHealthCheckOnFailureAction parses the specified string into a HealthCheckOnFailureAction. +// An error is returned for an invalid input. +func ParseHealthCheckOnFailureAction(s string) (HealthCheckOnFailureAction, error) { + switch s { + case "", strHealthCheckOnFailureActionNone: + return HealthCheckOnFailureActionNone, nil + case strHealthCheckOnFailureActionKill: + return HealthCheckOnFailureActionKill, nil + case strHealthCheckOnFailureActionRestart: + return HealthCheckOnFailureActionRestart, nil + case strHealthCheckOnFailureActionStop: + return HealthCheckOnFailureActionStop, nil + default: + err := fmt.Errorf("invalid on-failure action %q for health check: supported actions are %s", s, strings.Join(SupportedHealthCheckOnFailureActions, ",")) + return HealthCheckOnFailureActionInvalid, err + } +} diff --git a/libpod/define/mount.go b/libpod/define/mount.go index 1b0d019c8..db444fd83 100644 --- a/libpod/define/mount.go +++ b/libpod/define/mount.go @@ -1,8 +1,6 @@ package define const ( - // TypeBind is the type for mounting host dir - TypeBind = "bind" // TypeVolume is the type for named volumes TypeVolume = "volume" // TypeTmpfs is the type for mounting tmpfs diff --git a/libpod/define/mount_freebsd.go b/libpod/define/mount_freebsd.go new file mode 100644 index 000000000..e080c9ec6 --- /dev/null +++ b/libpod/define/mount_freebsd.go @@ -0,0 +1,8 @@ +//go:build freebsd + +package define + +const ( + // TypeBind is the type for mounting host dir + TypeBind = "nullfs" +) diff --git a/libpod/define/mount_linux.go b/libpod/define/mount_linux.go new file mode 100644 index 000000000..5ef848905 --- /dev/null +++ b/libpod/define/mount_linux.go @@ -0,0 +1,8 @@ +//go:build linux + +package define + +const ( + // TypeBind is the type for mounting host dir + TypeBind = "bind" +) diff --git a/libpod/define/mount_unsupported.go b/libpod/define/mount_unsupported.go new file mode 100644 index 000000000..cb8642fe2 --- /dev/null +++ b/libpod/define/mount_unsupported.go @@ -0,0 +1,8 @@ +//go:build !linux && !freebsd + +package define + +const ( + // TypeBind is the type for mounting host dir + TypeBind = "bind" +) diff --git a/libpod/define/sdnotify.go b/libpod/define/sdnotify.go new file mode 100644 index 000000000..1d548c764 --- /dev/null +++ b/libpod/define/sdnotify.go @@ -0,0 +1,20 @@ +package define + +import "fmt" + +// Strings used for --sdnotify option to podman +const ( + SdNotifyModeContainer = "container" + SdNotifyModeConmon = "conmon" + SdNotifyModeIgnore = "ignore" +) + +// ValidateSdNotifyMode validates the specified mode. +func ValidateSdNotifyMode(mode string) error { + switch mode { + case "", SdNotifyModeContainer, SdNotifyModeConmon, SdNotifyModeIgnore: + return nil + default: + return fmt.Errorf("%w: invalid sdnotify value %q: must be %s, %s or %s", ErrInvalidArg, mode, SdNotifyModeContainer, SdNotifyModeConmon, SdNotifyModeIgnore) + } +} diff --git a/libpod/define/volume_inspect.go b/libpod/define/volume_inspect.go index 9279812da..76120647c 100644 --- a/libpod/define/volume_inspect.go +++ b/libpod/define/volume_inspect.go @@ -57,7 +57,7 @@ type InspectVolumeData struct { // UID/GID. NeedsChown bool `json:"NeedsChown,omitempty"` // Timeout is the specified driver timeout if given - Timeout int `json:"Timeout,omitempty"` + Timeout uint `json:"Timeout,omitempty"` } type VolumeReload struct { diff --git a/libpod/events.go b/libpod/events.go index c9e4c9d26..2f9799114 100644 --- a/libpod/events.go +++ b/libpod/events.go @@ -3,6 +3,7 @@ package libpod import ( "context" "fmt" + "path/filepath" "sync" "github.com/containers/podman/v4/libpod/events" @@ -11,6 +12,10 @@ import ( // newEventer returns an eventer that can be used to read/write events func (r *Runtime) newEventer() (events.Eventer, error) { + if r.config.Engine.EventsLogFilePath == "" { + // default, use path under tmpdir when none was explicitly set by the user + r.config.Engine.EventsLogFilePath = filepath.Join(r.config.Engine.TmpDir, "events", "events.log") + } options := events.EventerOptions{ EventerType: r.config.Engine.EventsLogger, LogFilePath: r.config.Engine.EventsLogFilePath, @@ -55,6 +60,12 @@ func (c *Container) newContainerExitedEvent(exitCode int32) { e.Image = c.config.RootfsImageName e.Type = events.Container e.ContainerExitCode = int(exitCode) + + e.Details = events.Details{ + ID: e.ID, + Attributes: c.Labels(), + } + if err := c.runtime.eventer.Write(e); err != nil { logrus.Errorf("Unable to write container exited event: %q", err) } @@ -70,6 +81,12 @@ func (c *Container) newExecDiedEvent(sessionID string, exitCode int) { e.ContainerExitCode = exitCode e.Attributes = make(map[string]string) e.Attributes["execID"] = sessionID + + e.Details = events.Details{ + ID: e.ID, + Attributes: c.Labels(), + } + if err := c.runtime.eventer.Write(e); err != nil { logrus.Errorf("Unable to write exec died event: %q", err) } @@ -121,11 +138,7 @@ func (v *Volume) newVolumeEvent(status events.Status) { // Events is a wrapper function for everyone to begin tailing the events log // with options func (r *Runtime) Events(ctx context.Context, options events.ReadOptions) error { - eventer, err := r.newEventer() - if err != nil { - return err - } - return eventer.Read(ctx, options) + return r.eventer.Read(ctx, options) } // GetEvents reads the event log and returns events based on input filters @@ -137,10 +150,6 @@ func (r *Runtime) GetEvents(ctx context.Context, filters []string) ([]*events.Ev FromStart: true, Stream: false, } - eventer, err := r.newEventer() - if err != nil { - return nil, err - } logEvents := make([]*events.Event, 0, len(eventChannel)) readLock := sync.Mutex{} @@ -152,7 +161,7 @@ func (r *Runtime) GetEvents(ctx context.Context, filters []string) ([]*events.Ev readLock.Unlock() }() - readErr := eventer.Read(ctx, options) + readErr := r.eventer.Read(ctx, options) readLock.Lock() // Wait for the events to be consumed. return logEvents, readErr } diff --git a/libpod/events/events_freebsd.go b/libpod/events/events_freebsd.go new file mode 100644 index 000000000..90933fa2c --- /dev/null +++ b/libpod/events/events_freebsd.go @@ -0,0 +1,23 @@ +package events + +import ( + "fmt" + "strings" + + "github.com/sirupsen/logrus" +) + +// NewEventer creates an eventer based on the eventer type +func NewEventer(options EventerOptions) (Eventer, error) { + logrus.Debugf("Initializing event backend %s", options.EventerType) + switch strings.ToUpper(options.EventerType) { + case strings.ToUpper(LogFile.String()): + return EventLogFile{options}, nil + case strings.ToUpper(Null.String()): + return newNullEventer(), nil + case strings.ToUpper(Memory.String()): + return NewMemoryEventer(), nil + default: + return nil, fmt.Errorf("unknown event logger type: %s", strings.ToUpper(options.EventerType)) + } +} diff --git a/libpod/events/events_linux.go b/libpod/events/events_linux.go index e7801af5b..66b125dd5 100644 --- a/libpod/events/events_linux.go +++ b/libpod/events/events_linux.go @@ -18,9 +18,9 @@ func NewEventer(options EventerOptions) (Eventer, error) { } return eventer, nil case strings.ToUpper(LogFile.String()): - return EventLogFile{options}, nil + return newLogFileEventer(options) case strings.ToUpper(Null.String()): - return NewNullEventer(), nil + return newNullEventer(), nil case strings.ToUpper(Memory.String()): return NewMemoryEventer(), nil default: diff --git a/libpod/events/events_unsupported.go b/libpod/events/events_unsupported.go index d766402a9..01031c225 100644 --- a/libpod/events/events_unsupported.go +++ b/libpod/events/events_unsupported.go @@ -1,5 +1,5 @@ -//go:build !linux -// +build !linux +//go:build !linux && !freebsd +// +build !linux,!freebsd package events diff --git a/libpod/events/journal_linux.go b/libpod/events/journal_linux.go index 16ef6504f..4986502a2 100644 --- a/libpod/events/journal_linux.go +++ b/libpod/events/journal_linux.go @@ -112,57 +112,16 @@ func (e EventJournalD) Read(ctx context.Context, options ReadOptions) error { } } - // the api requires a next|prev before getting a cursor - if _, err := j.Next(); err != nil { - return fmt.Errorf("failed to move journal cursor to next entry: %w", err) - } - - prevCursor, err := j.GetCursor() - if err != nil { - return fmt.Errorf("failed to get journal cursor: %w", err) - } for { - select { - case <-ctx.Done(): - // the consumer has cancelled - return nil - default: - // fallthrough - } - - if _, err := j.Next(); err != nil { - return fmt.Errorf("failed to move journal cursor to next entry: %w", err) - } - newCursor, err := j.GetCursor() + entry, err := getNextEntry(ctx, j, options.Stream, untilTime) if err != nil { - return fmt.Errorf("failed to get journal cursor: %w", err) + return err } - if prevCursor == newCursor { - if !options.Stream || (len(options.Until) > 0 && time.Now().After(untilTime)) { - break - } - - // j.Wait() is blocking, this would cause the goroutine to hang forever - // if no more journal entries are generated and thus if the client - // has closed the connection in the meantime to leak memory. - // Waiting only 5 seconds makes sure we can check if the client closed in the - // meantime at least every 5 seconds. - t := 5 * time.Second - if len(options.Until) > 0 { - until := time.Until(untilTime) - if until < t { - t = until - } - } - _ = j.Wait(t) - continue + // no entry == we hit the end + if entry == nil { + return nil } - prevCursor = newCursor - entry, err := j.GetEntry() - if err != nil { - return fmt.Errorf("failed to read journal entry: %w", err) - } newEvent, err := newEventFromJournalEntry(entry) if err != nil { // We can't decode this event. @@ -177,7 +136,6 @@ func (e EventJournalD) Read(ctx context.Context, options ReadOptions) error { options.EventChannel <- newEvent } } - return nil } func newEventFromJournalEntry(entry *sdjournal.JournalEntry) (*Event, error) { @@ -238,3 +196,51 @@ func newEventFromJournalEntry(entry *sdjournal.JournalEntry) (*Event, error) { func (e EventJournalD) String() string { return Journald.String() } + +// getNextEntry returns the next entry in the journal. If the end of the +// journal is reached and stream is not set or the current time is after +// the until time this function return nil,nil. +func getNextEntry(ctx context.Context, j *sdjournal.Journal, stream bool, untilTime time.Time) (*sdjournal.JournalEntry, error) { + for { + select { + case <-ctx.Done(): + // the consumer has cancelled + return nil, nil + default: + // fallthrough + } + // the api requires a next|prev before reading the event + ret, err := j.Next() + if err != nil { + return nil, fmt.Errorf("failed to move journal cursor to next entry: %w", err) + } + // ret == 0 equals EOF, see sd_journal_next(3) + if ret == 0 { + if !stream || (!untilTime.IsZero() && time.Now().After(untilTime)) { + // we hit the end and should not keep streaming + return nil, nil + } + // keep waiting for the next entry + // j.Wait() is blocking, this would cause the goroutine to hang forever + // if no more journal entries are generated and thus if the client + // has closed the connection in the meantime to leak memory. + // Waiting only 5 seconds makes sure we can check if the client closed in the + // meantime at least every 5 seconds. + t := 5 * time.Second + if !untilTime.IsZero() { + until := time.Until(untilTime) + if until < t { + t = until + } + } + _ = j.Wait(t) + continue + } + + entry, err := j.GetEntry() + if err != nil { + return nil, fmt.Errorf("failed to read journal entry: %w", err) + } + return entry, nil + } +} diff --git a/libpod/events/logfile.go b/libpod/events/logfile.go index c7dbf4850..d749a0d4d 100644 --- a/libpod/events/logfile.go +++ b/libpod/events/logfile.go @@ -1,5 +1,5 @@ -//go:build linux -// +build linux +//go:build linux || freebsd +// +build linux freebsd package events @@ -12,6 +12,7 @@ import ( "io/ioutil" "os" "path" + "path/filepath" "time" "github.com/containers/podman/v4/pkg/util" @@ -27,6 +28,21 @@ type EventLogFile struct { options EventerOptions } +// newLogFileEventer creates a new EventLogFile eventer +func newLogFileEventer(options EventerOptions) (*EventLogFile, error) { + // Create events log dir + if err := os.MkdirAll(filepath.Dir(options.LogFilePath), 0700); err != nil { + return nil, fmt.Errorf("creating events dirs: %w", err) + } + // We have to make sure the file is created otherwise reading events will hang. + // https://github.com/containers/podman/issues/15688 + fd, err := os.OpenFile(options.LogFilePath, os.O_RDONLY|os.O_CREATE, 0700) + if err != nil { + return nil, fmt.Errorf("failed to create event log file: %w", err) + } + return &EventLogFile{options: options}, fd.Close() +} + // Writes to the log file func (e EventLogFile) Write(ee Event) error { // We need to lock events file @@ -108,6 +124,8 @@ func (e EventLogFile) Read(ctx context.Context, options ReadOptions) error { } }() } + logrus.Debugf("Reading events from file %q", e.options.LogFilePath) + var line *tail.Line var ok bool for { diff --git a/libpod/events/nullout.go b/libpod/events/nullout.go index 587a1b98b..da3820c23 100644 --- a/libpod/events/nullout.go +++ b/libpod/events/nullout.go @@ -2,10 +2,11 @@ package events import ( "context" + "errors" ) -// EventToNull is an eventer type that only performs write operations -// and only writes to /dev/null. It is meant for unittests only +// EventToNull is an eventer type that does nothing. +// It is meant for unittests only type EventToNull struct{} // Write eats the event and always returns nil @@ -13,14 +14,14 @@ func (e EventToNull) Write(ee Event) error { return nil } -// Read does nothing. Do not use it. +// Read does nothing and returns an error. func (e EventToNull) Read(ctx context.Context, options ReadOptions) error { - return nil + return errors.New("cannot read events with the \"none\" backend") } -// NewNullEventer returns a new null eventer. You should only do this for +// newNullEventer returns a new null eventer. You should only do this for // the purposes of internal libpod testing. -func NewNullEventer() Eventer { +func newNullEventer() Eventer { return EventToNull{} } diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 9b9d12b17..e835af9f0 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -2,6 +2,7 @@ package libpod import ( "bufio" + "context" "errors" "fmt" "io/ioutil" @@ -12,6 +13,7 @@ import ( "github.com/containers/podman/v4/libpod/define" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) const ( @@ -29,9 +31,14 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { if err != nil { return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err) } + hcStatus, err := checkHealthCheckCanBeRun(container) if err == nil { - return container.runHealthCheck() + hcStatus, err := container.runHealthCheck() + if err := container.processHealthCheckStatus(hcStatus); err != nil { + return hcStatus, err + } + return hcStatus, err } return hcStatus, err } @@ -127,13 +134,45 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) { hcResult = define.HealthCheckFailure hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String()) } + hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog) if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil { return hcResult, fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err) } + return hcResult, hcErr } +func (c *Container) processHealthCheckStatus(status define.HealthCheckStatus) error { + if status == define.HealthCheckSuccess { + return nil + } + + switch c.config.HealthCheckOnFailureAction { + case define.HealthCheckOnFailureActionNone: // Nothing to do + + case define.HealthCheckOnFailureActionKill: + if err := c.Kill(uint(unix.SIGKILL)); err != nil { + return fmt.Errorf("killing container health-check turned unhealthy: %w", err) + } + + case define.HealthCheckOnFailureActionRestart: + if err := c.RestartWithTimeout(context.Background(), c.config.StopTimeout); err != nil { + return fmt.Errorf("restarting container after health-check turned unhealthy: %w", err) + } + + case define.HealthCheckOnFailureActionStop: + if err := c.Stop(); err != nil { + return fmt.Errorf("stopping container after health-check turned unhealthy: %w", err) + } + + default: // Should not happen but better be safe than sorry + return fmt.Errorf("unsupported on-failure action %d", c.config.HealthCheckOnFailureAction) + } + + return nil +} + func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) { cstate, err := c.State() if err != nil { diff --git a/libpod/healthcheck_linux.go b/libpod/healthcheck_linux.go index 3fb6dfb91..6948f976a 100644 --- a/libpod/healthcheck_linux.go +++ b/libpod/healthcheck_linux.go @@ -70,7 +70,7 @@ func (c *Container) startTimer() error { startFile := fmt.Sprintf("%s.service", c.ID()) startChan := make(chan string) - if _, err := conn.StartUnitContext(context.Background(), startFile, "fail", startChan); err != nil { + if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil { return err } if err := systemdOpSuccessful(startChan); err != nil { diff --git a/libpod/healthcheck_unsupported.go b/libpod/healthcheck_unsupported.go new file mode 100644 index 000000000..92cd5d0a3 --- /dev/null +++ b/libpod/healthcheck_unsupported.go @@ -0,0 +1,25 @@ +//go:build !linux +// +build !linux + +package libpod + +import ( + "context" + "errors" +) + +// createTimer systemd timers for healthchecks of a container +func (c *Container) createTimer() error { + return errors.New("not implemented (*Container) createTimer") +} + +// startTimer starts a systemd timer for the healthchecks +func (c *Container) startTimer() error { + return errors.New("not implemented (*Container) startTimer") +} + +// removeTransientFiles removes the systemd timer and unit files +// for the container +func (c *Container) removeTransientFiles(ctx context.Context) error { + return errors.New("not implemented (*Container) removeTransientFiles") +} diff --git a/libpod/info.go b/libpod/info.go index c4193b40d..ad8f65432 100644 --- a/libpod/info.go +++ b/libpod/info.go @@ -5,27 +5,21 @@ import ( "bytes" "errors" "fmt" - "io/ioutil" "math" "os" - "os/exec" "runtime" - "strconv" "strings" "syscall" "time" "github.com/containers/buildah" - "github.com/containers/common/pkg/apparmor" - "github.com/containers/common/pkg/cgroups" - "github.com/containers/common/pkg/seccomp" + "github.com/containers/buildah/pkg/util" "github.com/containers/image/v5/pkg/sysregistriesv2" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/libpod/linkmode" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/storage" "github.com/containers/storage/pkg/system" - "github.com/opencontainers/selinux/go-selinux" "github.com/sirupsen/logrus" ) @@ -34,20 +28,20 @@ func (r *Runtime) info() (*define.Info, error) { info := define.Info{} versionInfo, err := define.GetVersion() if err != nil { - return nil, fmt.Errorf("error getting version info: %w", err) + return nil, fmt.Errorf("getting version info: %w", err) } info.Version = versionInfo // get host information hostInfo, err := r.hostInfo() if err != nil { - return nil, fmt.Errorf("error getting host info: %w", err) + return nil, fmt.Errorf("getting host info: %w", err) } info.Host = hostInfo // get store information storeInfo, err := r.storeInfo() if err != nil { - return nil, fmt.Errorf("error getting store info: %w", err) + return nil, fmt.Errorf("getting store info: %w", err) } info.Store = storeInfo registries := make(map[string]interface{}) @@ -55,14 +49,14 @@ func (r *Runtime) info() (*define.Info, error) { sys := r.SystemContext() data, err := sysregistriesv2.GetRegistries(sys) if err != nil { - return nil, fmt.Errorf("error getting registries: %w", err) + return nil, fmt.Errorf("getting registries: %w", err) } for _, reg := range data { registries[reg.Prefix] = reg } regs, err := sysregistriesv2.UnqualifiedSearchRegistries(sys) if err != nil { - return nil, fmt.Errorf("error getting registries: %w", err) + return nil, fmt.Errorf("getting registries: %w", err) } if len(regs) > 0 { registries["search"] = regs @@ -86,109 +80,45 @@ func (r *Runtime) hostInfo() (*define.HostInfo, error) { // lets say OS, arch, number of cpus, amount of memory, maybe os distribution/version, hostname, kernel version, uptime mi, err := system.ReadMemInfo() if err != nil { - return nil, fmt.Errorf("error reading memory info: %w", err) + return nil, fmt.Errorf("reading memory info: %w", err) } hostDistributionInfo := r.GetHostDistributionInfo() - kv, err := readKernelVersion() + kv, err := util.ReadKernelVersion() if err != nil { - return nil, fmt.Errorf("error reading kernel version: %w", err) + return nil, fmt.Errorf("reading kernel version: %w", err) } host, err := os.Hostname() if err != nil { - return nil, fmt.Errorf("error getting hostname: %w", err) + return nil, fmt.Errorf("getting hostname: %w", err) } - seccompProfilePath, err := DefaultSeccompPath() - if err != nil { - return nil, fmt.Errorf("error getting Seccomp profile path: %w", err) - } - - // Cgroups version - unified, err := cgroups.IsCgroup2UnifiedMode() - if err != nil { - return nil, fmt.Errorf("error reading cgroups mode: %w", err) - } - - // Get Map of all available controllers - availableControllers, err := cgroups.GetAvailableControllers(nil, unified) - if err != nil { - return nil, fmt.Errorf("error getting available cgroup controllers: %w", err) - } cpuUtil, err := getCPUUtilization() if err != nil { return nil, err } info := define.HostInfo{ - Arch: runtime.GOARCH, - BuildahVersion: buildah.Version, - CgroupManager: r.config.Engine.CgroupManager, - CgroupControllers: availableControllers, - Linkmode: linkmode.Linkmode(), - CPUs: runtime.NumCPU(), - CPUUtilization: cpuUtil, - Distribution: hostDistributionInfo, - LogDriver: r.config.Containers.LogDriver, - EventLogger: r.eventer.String(), - Hostname: host, - IDMappings: define.IDMappings{}, - Kernel: kv, - MemFree: mi.MemFree, - MemTotal: mi.MemTotal, - NetworkBackend: r.config.Network.NetworkBackend, - OS: runtime.GOOS, - Security: define.SecurityInfo{ - AppArmorEnabled: apparmor.IsEnabled(), - DefaultCapabilities: strings.Join(r.config.Containers.DefaultCapabilities, ","), - Rootless: rootless.IsRootless(), - SECCOMPEnabled: seccomp.IsEnabled(), - SECCOMPProfilePath: seccompProfilePath, - SELinuxEnabled: selinux.GetEnabled(), - }, - Slirp4NetNS: define.SlirpInfo{}, - SwapFree: mi.SwapFree, - SwapTotal: mi.SwapTotal, - } - - cgroupVersion := "v1" - if unified { - cgroupVersion = "v2" - } - info.CgroupsVersion = cgroupVersion - - slirp4netnsPath := r.config.Engine.NetworkCmdPath - if slirp4netnsPath == "" { - slirp4netnsPath, _ = exec.LookPath("slirp4netns") - } - if slirp4netnsPath != "" { - version, err := programVersion(slirp4netnsPath) - if err != nil { - logrus.Warnf("Failed to retrieve program version for %s: %v", slirp4netnsPath, err) - } - program := define.SlirpInfo{ - Executable: slirp4netnsPath, - Package: packageVersion(slirp4netnsPath), - Version: version, - } - info.Slirp4NetNS = program - } - - if rootless.IsRootless() { - uidmappings, err := rootless.ReadMappingsProc("/proc/self/uid_map") - if err != nil { - return nil, fmt.Errorf("error reading uid mappings: %w", err) - } - gidmappings, err := rootless.ReadMappingsProc("/proc/self/gid_map") - if err != nil { - return nil, fmt.Errorf("error reading gid mappings: %w", err) - } - idmappings := define.IDMappings{ - GIDMap: gidmappings, - UIDMap: uidmappings, - } - info.IDMappings = idmappings + Arch: runtime.GOARCH, + BuildahVersion: buildah.Version, + Linkmode: linkmode.Linkmode(), + CPUs: runtime.NumCPU(), + CPUUtilization: cpuUtil, + Distribution: hostDistributionInfo, + LogDriver: r.config.Containers.LogDriver, + EventLogger: r.eventer.String(), + Hostname: host, + Kernel: kv, + MemFree: mi.MemFree, + MemTotal: mi.MemTotal, + NetworkBackend: r.config.Network.NetworkBackend, + OS: runtime.GOOS, + SwapFree: mi.SwapFree, + SwapTotal: mi.SwapTotal, + } + if err := r.setPlatformHostInfo(&info); err != nil { + return nil, err } conmonInfo, ociruntimeInfo, err := r.defaultOCIRuntime.RuntimeInfo() @@ -199,9 +129,9 @@ func (r *Runtime) hostInfo() (*define.HostInfo, error) { info.OCIRuntime = ociruntimeInfo } - duration, err := procUptime() + duration, err := util.ReadUptime() if err != nil { - return nil, fmt.Errorf("error reading up time: %w", err) + return nil, fmt.Errorf("reading up time: %w", err) } uptime := struct { @@ -271,7 +201,7 @@ func (r *Runtime) storeInfo() (*define.StoreInfo, error) { } images, err := r.store.Images() if err != nil { - return nil, fmt.Errorf("error getting number of images: %w", err) + return nil, fmt.Errorf("getting number of images: %w", err) } conInfo, err := r.getContainerStoreInfo() if err != nil { @@ -329,31 +259,6 @@ func (r *Runtime) storeInfo() (*define.StoreInfo, error) { return &info, nil } -func readKernelVersion() (string, error) { - buf, err := ioutil.ReadFile("/proc/version") - if err != nil { - return "", err - } - f := bytes.Fields(buf) - if len(f) < 3 { - return string(bytes.TrimSpace(buf)), nil - } - return string(f[2]), nil -} - -func procUptime() (time.Duration, error) { - var zero time.Duration - buf, err := ioutil.ReadFile("/proc/uptime") - if err != nil { - return zero, err - } - f := bytes.Fields(buf) - if len(f) < 1 { - return zero, errors.New("unable to parse uptime from /proc/uptime") - } - return time.ParseDuration(string(f[0]) + "s") -} - // GetHostDistributionInfo returns a map containing the host's distribution and version func (r *Runtime) GetHostDistributionInfo() define.DistributionInfo { // Populate values in case we cannot find the values @@ -385,43 +290,3 @@ func (r *Runtime) GetHostDistributionInfo() define.DistributionInfo { } return dist } - -// getCPUUtilization Returns a CPUUsage object that summarizes CPU -// usage for userspace, system, and idle time. -func getCPUUtilization() (*define.CPUUsage, error) { - f, err := os.Open("/proc/stat") - if err != nil { - return nil, err - } - defer f.Close() - scanner := bufio.NewScanner(f) - // Read first line of /proc/stat that has entries for system ("cpu" line) - for scanner.Scan() { - break - } - // column 1 is user, column 3 is system, column 4 is idle - stats := strings.Fields(scanner.Text()) - return statToPercent(stats) -} - -func statToPercent(stats []string) (*define.CPUUsage, error) { - userTotal, err := strconv.ParseFloat(stats[1], 64) - if err != nil { - return nil, fmt.Errorf("unable to parse user value %q: %w", stats[1], err) - } - systemTotal, err := strconv.ParseFloat(stats[3], 64) - if err != nil { - return nil, fmt.Errorf("unable to parse system value %q: %w", stats[3], err) - } - idleTotal, err := strconv.ParseFloat(stats[4], 64) - if err != nil { - return nil, fmt.Errorf("unable to parse idle value %q: %w", stats[4], err) - } - total := userTotal + systemTotal + idleTotal - s := define.CPUUsage{ - UserPercent: math.Round((userTotal/total*100)*100) / 100, - SystemPercent: math.Round((systemTotal/total*100)*100) / 100, - IdlePercent: math.Round((idleTotal/total*100)*100) / 100, - } - return &s, nil -} diff --git a/libpod/info_freebsd.go b/libpod/info_freebsd.go new file mode 100644 index 000000000..1be988350 --- /dev/null +++ b/libpod/info_freebsd.go @@ -0,0 +1,40 @@ +package libpod + +import ( + "fmt" + "unsafe" + + "github.com/containers/podman/v4/libpod/define" + "golang.org/x/sys/unix" +) + +func (r *Runtime) setPlatformHostInfo(info *define.HostInfo) error { + return nil +} + +func timeToPercent(time uint64, total uint64) float64 { + return 100.0 * float64(time) / float64(total) +} + +// getCPUUtilization Returns a CPUUsage object that summarizes CPU +// usage for userspace, system, and idle time. +func getCPUUtilization() (*define.CPUUsage, error) { + buf, err := unix.SysctlRaw("kern.cp_time") + if err != nil { + return nil, fmt.Errorf("reading sysctl kern.cp_time: %w", err) + } + + var total uint64 = 0 + var times [unix.CPUSTATES]uint64 + + for i := 0; i < unix.CPUSTATES; i++ { + val := *(*uint64)(unsafe.Pointer(&buf[8*i])) + times[i] = val + total += val + } + return &define.CPUUsage{ + UserPercent: timeToPercent(times[unix.CP_USER], total), + SystemPercent: timeToPercent(times[unix.CP_SYS], total), + IdlePercent: timeToPercent(times[unix.CP_IDLE], total), + }, nil +} diff --git a/libpod/info_linux.go b/libpod/info_linux.go new file mode 100644 index 000000000..44beafa8c --- /dev/null +++ b/libpod/info_linux.go @@ -0,0 +1,132 @@ +package libpod + +import ( + "bufio" + "fmt" + "math" + "os" + "os/exec" + "strconv" + "strings" + + "github.com/containers/common/pkg/apparmor" + "github.com/containers/common/pkg/cgroups" + "github.com/containers/common/pkg/seccomp" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/opencontainers/selinux/go-selinux" + "github.com/sirupsen/logrus" +) + +func (r *Runtime) setPlatformHostInfo(info *define.HostInfo) error { + seccompProfilePath, err := DefaultSeccompPath() + if err != nil { + return fmt.Errorf("getting Seccomp profile path: %w", err) + } + + // Cgroups version + unified, err := cgroups.IsCgroup2UnifiedMode() + if err != nil { + return fmt.Errorf("reading cgroups mode: %w", err) + } + + // Get Map of all available controllers + availableControllers, err := cgroups.GetAvailableControllers(nil, unified) + if err != nil { + return fmt.Errorf("getting available cgroup controllers: %w", err) + } + + info.CgroupManager = r.config.Engine.CgroupManager + info.CgroupControllers = availableControllers + info.IDMappings = define.IDMappings{} + info.Security = define.SecurityInfo{ + AppArmorEnabled: apparmor.IsEnabled(), + DefaultCapabilities: strings.Join(r.config.Containers.DefaultCapabilities, ","), + Rootless: rootless.IsRootless(), + SECCOMPEnabled: seccomp.IsEnabled(), + SECCOMPProfilePath: seccompProfilePath, + SELinuxEnabled: selinux.GetEnabled(), + } + info.Slirp4NetNS = define.SlirpInfo{} + + cgroupVersion := "v1" + if unified { + cgroupVersion = "v2" + } + info.CgroupsVersion = cgroupVersion + + slirp4netnsPath := r.config.Engine.NetworkCmdPath + if slirp4netnsPath == "" { + slirp4netnsPath, _ = exec.LookPath("slirp4netns") + } + if slirp4netnsPath != "" { + version, err := programVersion(slirp4netnsPath) + if err != nil { + logrus.Warnf("Failed to retrieve program version for %s: %v", slirp4netnsPath, err) + } + program := define.SlirpInfo{ + Executable: slirp4netnsPath, + Package: packageVersion(slirp4netnsPath), + Version: version, + } + info.Slirp4NetNS = program + } + + if rootless.IsRootless() { + uidmappings, err := rootless.ReadMappingsProc("/proc/self/uid_map") + if err != nil { + return fmt.Errorf("reading uid mappings: %w", err) + } + gidmappings, err := rootless.ReadMappingsProc("/proc/self/gid_map") + if err != nil { + return fmt.Errorf("reading gid mappings: %w", err) + } + idmappings := define.IDMappings{ + GIDMap: gidmappings, + UIDMap: uidmappings, + } + info.IDMappings = idmappings + } + + return nil +} + +func statToPercent(stats []string) (*define.CPUUsage, error) { + userTotal, err := strconv.ParseFloat(stats[1], 64) + if err != nil { + return nil, fmt.Errorf("unable to parse user value %q: %w", stats[1], err) + } + systemTotal, err := strconv.ParseFloat(stats[3], 64) + if err != nil { + return nil, fmt.Errorf("unable to parse system value %q: %w", stats[3], err) + } + idleTotal, err := strconv.ParseFloat(stats[4], 64) + if err != nil { + return nil, fmt.Errorf("unable to parse idle value %q: %w", stats[4], err) + } + total := userTotal + systemTotal + idleTotal + s := define.CPUUsage{ + UserPercent: math.Round((userTotal/total*100)*100) / 100, + SystemPercent: math.Round((systemTotal/total*100)*100) / 100, + IdlePercent: math.Round((idleTotal/total*100)*100) / 100, + } + return &s, nil +} + +// getCPUUtilization Returns a CPUUsage object that summarizes CPU +// usage for userspace, system, and idle time. +func getCPUUtilization() (*define.CPUUsage, error) { + f, err := os.Open("/proc/stat") + if err != nil { + return nil, err + } + defer f.Close() + scanner := bufio.NewScanner(f) + // Read first line of /proc/stat that has entries for system ("cpu" line) + for scanner.Scan() { + break + } + // column 1 is user, column 3 is system, column 4 is idle + stats := strings.Fields(scanner.Text()) + return statToPercent(stats) +} diff --git a/libpod/info_unsupported.go b/libpod/info_unsupported.go new file mode 100644 index 000000000..0aed51247 --- /dev/null +++ b/libpod/info_unsupported.go @@ -0,0 +1,14 @@ +//go:build !linux && !freebsd +// +build !linux,!freebsd + +package libpod + +import ( + "errors" + + "github.com/containers/podman/v4/libpod/define" +) + +func (r *Runtime) info() (*define.Info, error) { + return nil, errors.New("not implemented (*Runtime) info") +} diff --git a/libpod/kube.go b/libpod/kube.go index 8c09a6bb5..1f4831006 100644 --- a/libpod/kube.go +++ b/libpod/kube.go @@ -62,6 +62,7 @@ func (p *Pod) GenerateForKube(ctx context.Context) (*v1.Pod, []v1.ServicePort, e extraHost := make([]v1.HostAlias, 0) hostNetwork := false + hostUsers := true if p.HasInfraContainer() { infraContainer, err := p.getInfraContainer() if err != nil { @@ -87,8 +88,9 @@ func (p *Pod) GenerateForKube(ctx context.Context) (*v1.Pod, []v1.ServicePort, e return nil, servicePorts, err } hostNetwork = infraContainer.NetworkMode() == string(namespaces.NetworkMode(specgen.Host)) + hostUsers = infraContainer.IDMappings().HostUIDMapping && infraContainer.IDMappings().HostGIDMapping } - pod, err := p.podWithContainers(ctx, allContainers, ports, hostNetwork) + pod, err := p.podWithContainers(ctx, allContainers, ports, hostNetwork, hostUsers) if err != nil { return nil, servicePorts, err } @@ -267,6 +269,8 @@ func GenerateKubeServiceFromV1Pod(pod *v1.Pod, servicePorts []v1.ServicePort) (Y } service.Spec = serviceSpec service.ObjectMeta = pod.ObjectMeta + // Reset the annotations for the service as the pod annotations are not needed for the service + service.ObjectMeta.Annotations = nil tm := v12.TypeMeta{ Kind: "Service", APIVersion: pod.TypeMeta.APIVersion, @@ -346,7 +350,7 @@ func containersToServicePorts(containers []v1.Container) ([]v1.ServicePort, erro return sps, nil } -func (p *Pod) podWithContainers(ctx context.Context, containers []*Container, ports []v1.ContainerPort, hostNetwork bool) (*v1.Pod, error) { +func (p *Pod) podWithContainers(ctx context.Context, containers []*Container, ports []v1.ContainerPort, hostNetwork, hostUsers bool) (*v1.Pod, error) { deDupPodVolumes := make(map[string]*v1.Volume) first := true podContainers := make([]v1.Container, 0, len(containers)) @@ -383,7 +387,7 @@ func (p *Pod) podWithContainers(ctx context.Context, containers []*Container, po return nil, err } for k, v := range annotations { - podAnnotations[define.BindMountPrefix+k] = TruncateKubeAnnotation(v) + podAnnotations[define.BindMountPrefix] = TruncateKubeAnnotation(k + ":" + v) } // Since port bindings for the pod are handled by the // infra container, wipe them here only if we are sharing the net namespace @@ -444,10 +448,11 @@ func (p *Pod) podWithContainers(ctx context.Context, containers []*Container, po podVolumes, &dnsInfo, hostNetwork, + hostUsers, hostname), nil } -func newPodObject(podName string, annotations map[string]string, initCtrs, containers []v1.Container, volumes []v1.Volume, dnsOptions *v1.PodDNSConfig, hostNetwork bool, hostname string) *v1.Pod { +func newPodObject(podName string, annotations map[string]string, initCtrs, containers []v1.Container, volumes []v1.Volume, dnsOptions *v1.PodDNSConfig, hostNetwork, hostUsers bool, hostname string) *v1.Pod { tm := v12.TypeMeta{ Kind: "Pod", APIVersion: "v1", @@ -466,12 +471,21 @@ func newPodObject(podName string, annotations map[string]string, initCtrs, conta CreationTimestamp: v12.Now(), Annotations: annotations, } + // Set enableServiceLinks to false as podman doesn't use the service port environment variables + enableServiceLinks := false + // Set automountServiceAccountToken to false as podman doesn't use service account tokens + automountServiceAccountToken := false ps := v1.PodSpec{ - Containers: containers, - Hostname: hostname, - HostNetwork: hostNetwork, - InitContainers: initCtrs, - Volumes: volumes, + Containers: containers, + Hostname: hostname, + HostNetwork: hostNetwork, + InitContainers: initCtrs, + Volumes: volumes, + EnableServiceLinks: &enableServiceLinks, + AutomountServiceAccountToken: &automountServiceAccountToken, + } + if !hostUsers { + ps.HostUsers = &hostUsers } if dnsOptions != nil && (len(dnsOptions.Nameservers)+len(dnsOptions.Searches)+len(dnsOptions.Options) > 0) { ps.DNSConfig = dnsOptions @@ -490,6 +504,7 @@ func simplePodWithV1Containers(ctx context.Context, ctrs []*Container) (*v1.Pod, kubeCtrs := make([]v1.Container, 0, len(ctrs)) kubeInitCtrs := []v1.Container{} kubeVolumes := make([]v1.Volume, 0) + hostUsers := true hostNetwork := true podDNS := v1.PodDNSConfig{} kubeAnnotations := make(map[string]string) @@ -519,12 +534,15 @@ func simplePodWithV1Containers(ctx context.Context, ctrs []*Container) (*v1.Pod, if !ctr.HostNetwork() { hostNetwork = false } + if !(ctr.IDMappings().HostUIDMapping && ctr.IDMappings().HostGIDMapping) { + hostUsers = false + } kubeCtr, kubeVols, ctrDNS, annotations, err := containerToV1Container(ctx, ctr) if err != nil { return nil, err } for k, v := range annotations { - kubeAnnotations[define.BindMountPrefix+k] = TruncateKubeAnnotation(v) + kubeAnnotations[define.BindMountPrefix] = TruncateKubeAnnotation(k + ":" + v) } if isInit { kubeInitCtrs = append(kubeInitCtrs, kubeCtr) @@ -580,6 +598,7 @@ func simplePodWithV1Containers(ctx context.Context, ctrs []*Container) (*v1.Pod, kubeVolumes, &podDNS, hostNetwork, + hostUsers, hostname), nil } diff --git a/libpod/lock/file/file_lock.go b/libpod/lock/file/file_lock.go index 1379e690a..bcbaea5e6 100644 --- a/libpod/lock/file/file_lock.go +++ b/libpod/lock/file/file_lock.go @@ -2,7 +2,6 @@ package file import ( "fmt" - "io/ioutil" "os" "path/filepath" "strconv" @@ -103,7 +102,7 @@ func (locks *FileLocks) AllocateGivenLock(lck uint32) error { f, err := os.OpenFile(locks.getLockPath(lck), os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666) if err != nil { - return fmt.Errorf("error creating lock %d: %w", lck, err) + return fmt.Errorf("creating lock %d: %w", lck, err) } f.Close() @@ -129,9 +128,9 @@ func (locks *FileLocks) DeallocateAllLocks() error { if !locks.valid { return fmt.Errorf("locks have already been closed: %w", syscall.EINVAL) } - files, err := ioutil.ReadDir(locks.lockPath) + files, err := os.ReadDir(locks.lockPath) if err != nil { - return fmt.Errorf("error reading directory %s: %w", locks.lockPath, err) + return fmt.Errorf("reading directory %s: %w", locks.lockPath, err) } var lastErr error for _, f := range files { @@ -153,7 +152,7 @@ func (locks *FileLocks) LockFileLock(lck uint32) error { l, err := storage.GetLockfile(locks.getLockPath(lck)) if err != nil { - return fmt.Errorf("error acquiring lock: %w", err) + return fmt.Errorf("acquiring lock: %w", err) } l.Lock() @@ -167,7 +166,7 @@ func (locks *FileLocks) UnlockFileLock(lck uint32) error { } l, err := storage.GetLockfile(locks.getLockPath(lck)) if err != nil { - return fmt.Errorf("error acquiring lock: %w", err) + return fmt.Errorf("acquiring lock: %w", err) } l.Unlock() diff --git a/libpod/networking_common.go b/libpod/networking_common.go new file mode 100644 index 000000000..fa444e26a --- /dev/null +++ b/libpod/networking_common.go @@ -0,0 +1,719 @@ +//go:build linux || freebsd +// +build linux freebsd + +package libpod + +import ( + "errors" + "fmt" + "regexp" + "sort" + + "github.com/containers/common/libnetwork/etchosts" + "github.com/containers/common/libnetwork/types" + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/machine" + "github.com/containers/common/pkg/util" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/libpod/events" + "github.com/containers/podman/v4/pkg/namespaces" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/storage/pkg/lockfile" + "github.com/sirupsen/logrus" +) + +// convertPortMappings will remove the HostIP part from the ports when running inside podman machine. +// This is need because a HostIP of 127.0.0.1 would now allow the gvproxy forwarder to reach to open ports. +// For machine the HostIP must only be used by gvproxy and never in the VM. +func (c *Container) convertPortMappings() []types.PortMapping { + if !machine.IsGvProxyBased() || len(c.config.PortMappings) == 0 { + return c.config.PortMappings + } + // if we run in a machine VM we have to ignore the host IP part + newPorts := make([]types.PortMapping, 0, len(c.config.PortMappings)) + for _, port := range c.config.PortMappings { + port.HostIP = "" + newPorts = append(newPorts, port) + } + return newPorts +} + +func (c *Container) getNetworkOptions(networkOpts map[string]types.PerNetworkOptions) types.NetworkOptions { + opts := types.NetworkOptions{ + ContainerID: c.config.ID, + ContainerName: getCNIPodName(c), + } + opts.PortMappings = c.convertPortMappings() + + // If the container requested special network options use this instead of the config. + // This is the case for container restore or network reload. + if c.perNetworkOpts != nil { + opts.Networks = c.perNetworkOpts + } else { + opts.Networks = networkOpts + } + return opts +} + +// setUpNetwork will set up the the networks, on error it will also tear down the cni +// networks. If rootless it will join/create the rootless network namespace. +func (r *Runtime) setUpNetwork(ns string, opts types.NetworkOptions) (map[string]types.StatusBlock, error) { + rootlessNetNS, err := r.GetRootlessNetNs(true) + if err != nil { + return nil, err + } + var results map[string]types.StatusBlock + setUpPod := func() error { + results, err = r.network.Setup(ns, types.SetupOptions{NetworkOptions: opts}) + return err + } + // rootlessNetNS is nil if we are root + if rootlessNetNS != nil { + // execute the setup in the rootless net ns + err = rootlessNetNS.Do(setUpPod) + rootlessNetNS.Lock.Unlock() + } else { + err = setUpPod() + } + return results, err +} + +// getCNIPodName return the pod name (hostname) used by CNI and the dnsname plugin. +// If we are in the pod network namespace use the pod name otherwise the container name +func getCNIPodName(c *Container) string { + if c.config.NetMode.IsPod() || c.IsInfra() { + pod, err := c.runtime.state.Pod(c.PodID()) + if err == nil { + return pod.Name() + } + } + return c.Name() +} + +// Tear down a container's network configuration and joins the +// rootless net ns as rootless user +func (r *Runtime) teardownNetwork(ns string, opts types.NetworkOptions) error { + rootlessNetNS, err := r.GetRootlessNetNs(false) + if err != nil { + return err + } + tearDownPod := func() error { + if err := r.network.Teardown(ns, types.TeardownOptions{NetworkOptions: opts}); err != nil { + return fmt.Errorf("tearing down network namespace configuration for container %s: %w", opts.ContainerID, err) + } + return nil + } + + // rootlessNetNS is nil if we are root + if rootlessNetNS != nil { + // execute the cni setup in the rootless net ns + err = rootlessNetNS.Do(tearDownPod) + if cerr := rootlessNetNS.Cleanup(r); cerr != nil { + logrus.WithError(err).Error("failed to clean up rootless netns") + } + rootlessNetNS.Lock.Unlock() + } else { + err = tearDownPod() + } + return err +} + +// Tear down a container's CNI network configuration, but do not tear down the +// namespace itself. +func (r *Runtime) teardownCNI(ctr *Container) error { + if ctr.state.NetNS == nil { + // The container has no network namespace, we're set + return nil + } + + logrus.Debugf("Tearing down network namespace at %s for container %s", ctr.state.NetNS.Path(), ctr.ID()) + + networks, err := ctr.networks() + if err != nil { + return err + } + + if !ctr.config.NetMode.IsSlirp4netns() && len(networks) > 0 { + netOpts := ctr.getNetworkOptions(networks) + return r.teardownNetwork(ctr.state.NetNS.Path(), netOpts) + } + return nil +} + +// isBridgeNetMode checks if the given network mode is bridge. +// It returns nil when it is set to bridge and an error otherwise. +func isBridgeNetMode(n namespaces.NetworkMode) error { + if !n.IsBridge() { + return fmt.Errorf("%q is not supported: %w", n, define.ErrNetworkModeInvalid) + } + return nil +} + +// Reload only works with containers with a configured network. +// It will tear down, and then reconfigure, the network of the container. +// This is mainly used when a reload of firewall rules wipes out existing +// firewall configuration. +// Efforts will be made to preserve MAC and IP addresses, but this only works if +// the container only joined a single CNI network, and was only assigned a +// single MAC or IP. +// Only works on root containers at present, though in the future we could +// extend this to stop + restart slirp4netns +func (r *Runtime) reloadContainerNetwork(ctr *Container) (map[string]types.StatusBlock, error) { + if ctr.state.NetNS == nil { + return nil, fmt.Errorf("container %s network is not configured, refusing to reload: %w", ctr.ID(), define.ErrCtrStateInvalid) + } + if err := isBridgeNetMode(ctr.config.NetMode); err != nil { + return nil, err + } + logrus.Infof("Going to reload container %s network", ctr.ID()) + + err := r.teardownCNI(ctr) + if err != nil { + // teardownCNI will error if the iptables rules do not exists and this is the case after + // a firewall reload. The purpose of network reload is to recreate the rules if they do + // not exists so we should not log this specific error as error. This would confuse users otherwise. + // iptables-legacy and iptables-nft will create different errors make sure to match both. + b, rerr := regexp.MatchString("Couldn't load target `CNI-[a-f0-9]{24}':No such file or directory|Chain 'CNI-[a-f0-9]{24}' does not exist", err.Error()) + if rerr == nil && !b { + logrus.Error(err) + } else { + logrus.Info(err) + } + } + + networkOpts, err := ctr.networks() + if err != nil { + return nil, err + } + + // Set the same network settings as before.. + netStatus := ctr.getNetworkStatus() + for network, perNetOpts := range networkOpts { + for name, netInt := range netStatus[network].Interfaces { + perNetOpts.InterfaceName = name + perNetOpts.StaticMAC = netInt.MacAddress + for _, netAddress := range netInt.Subnets { + perNetOpts.StaticIPs = append(perNetOpts.StaticIPs, netAddress.IPNet.IP) + } + // Normally interfaces have a length of 1, only for some special cni configs we could get more. + // For now just use the first interface to get the ips this should be good enough for most cases. + break + } + networkOpts[network] = perNetOpts + } + ctr.perNetworkOpts = networkOpts + + return r.configureNetNS(ctr, ctr.state.NetNS) +} + +// Produce an InspectNetworkSettings containing information on the container +// network. +func (c *Container) getContainerNetworkInfo() (*define.InspectNetworkSettings, error) { + if c.config.NetNsCtr != "" { + netNsCtr, err := c.runtime.GetContainer(c.config.NetNsCtr) + if err != nil { + return nil, err + } + // see https://github.com/containers/podman/issues/10090 + // the container has to be locked for syncContainer() + netNsCtr.lock.Lock() + defer netNsCtr.lock.Unlock() + // Have to sync to ensure that state is populated + if err := netNsCtr.syncContainer(); err != nil { + return nil, err + } + logrus.Debugf("Container %s shares network namespace, retrieving network info of container %s", c.ID(), c.config.NetNsCtr) + + return netNsCtr.getContainerNetworkInfo() + } + + settings := new(define.InspectNetworkSettings) + settings.Ports = makeInspectPortBindings(c.config.PortMappings, c.config.ExposedPorts) + + networks, err := c.networks() + if err != nil { + return nil, err + } + + if c.state.NetNS == nil { + if networkNSPath := c.joinedNetworkNSPath(); networkNSPath != "" { + if result, err := c.inspectJoinedNetworkNS(networkNSPath); err == nil { + // fallback to dummy configuration + settings.InspectBasicNetworkConfig = resultToBasicNetworkConfig(result) + return settings, nil + } + // do not propagate error inspecting a joined network ns + logrus.Errorf("Inspecting network namespace: %s of container %s: %v", networkNSPath, c.ID(), err) + } + // We can't do more if the network is down. + + // We still want to make dummy configurations for each CNI net + // the container joined. + if len(networks) > 0 { + settings.Networks = make(map[string]*define.InspectAdditionalNetwork, len(networks)) + for net, opts := range networks { + cniNet := new(define.InspectAdditionalNetwork) + cniNet.NetworkID = net + cniNet.Aliases = opts.Aliases + settings.Networks[net] = cniNet + } + } + + return settings, nil + } + + // Set network namespace path + settings.SandboxKey = c.state.NetNS.Path() + + netStatus := c.getNetworkStatus() + // If this is empty, we're probably slirp4netns + if len(netStatus) == 0 { + return settings, nil + } + + // If we have networks - handle that here + if len(networks) > 0 { + if len(networks) != len(netStatus) { + return nil, fmt.Errorf("network inspection mismatch: asked to join %d network(s) %v, but have information on %d network(s): %w", len(networks), networks, len(netStatus), define.ErrInternal) + } + + settings.Networks = make(map[string]*define.InspectAdditionalNetwork) + + for name, opts := range networks { + result := netStatus[name] + addedNet := new(define.InspectAdditionalNetwork) + addedNet.NetworkID = name + addedNet.Aliases = opts.Aliases + addedNet.InspectBasicNetworkConfig = resultToBasicNetworkConfig(result) + + settings.Networks[name] = addedNet + } + + // if not only the default network is connected we can return here + // otherwise we have to populate the InspectBasicNetworkConfig settings + _, isDefaultNet := networks[c.runtime.config.Network.DefaultNetwork] + if !(len(networks) == 1 && isDefaultNet) { + return settings, nil + } + } + + // If not joining networks, we should have at most 1 result + if len(netStatus) > 1 { + return nil, fmt.Errorf("should have at most 1 network status result if not joining networks, instead got %d: %w", len(netStatus), define.ErrInternal) + } + + if len(netStatus) == 1 { + for _, status := range netStatus { + settings.InspectBasicNetworkConfig = resultToBasicNetworkConfig(status) + } + } + return settings, nil +} + +// resultToBasicNetworkConfig produces an InspectBasicNetworkConfig from a CNI +// result +func resultToBasicNetworkConfig(result types.StatusBlock) define.InspectBasicNetworkConfig { + config := define.InspectBasicNetworkConfig{} + interfaceNames := make([]string, 0, len(result.Interfaces)) + for interfaceName := range result.Interfaces { + interfaceNames = append(interfaceNames, interfaceName) + } + // ensure consistent inspect results by sorting + sort.Strings(interfaceNames) + for _, interfaceName := range interfaceNames { + netInt := result.Interfaces[interfaceName] + for _, netAddress := range netInt.Subnets { + size, _ := netAddress.IPNet.Mask.Size() + if netAddress.IPNet.IP.To4() != nil { + // ipv4 + if config.IPAddress == "" { + config.IPAddress = netAddress.IPNet.IP.String() + config.IPPrefixLen = size + config.Gateway = netAddress.Gateway.String() + } else { + config.SecondaryIPAddresses = append(config.SecondaryIPAddresses, define.Address{Addr: netAddress.IPNet.IP.String(), PrefixLength: size}) + } + } else { + // ipv6 + if config.GlobalIPv6Address == "" { + config.GlobalIPv6Address = netAddress.IPNet.IP.String() + config.GlobalIPv6PrefixLen = size + config.IPv6Gateway = netAddress.Gateway.String() + } else { + config.SecondaryIPv6Addresses = append(config.SecondaryIPv6Addresses, define.Address{Addr: netAddress.IPNet.IP.String(), PrefixLength: size}) + } + } + } + if config.MacAddress == "" { + config.MacAddress = netInt.MacAddress.String() + } else { + config.AdditionalMacAddresses = append(config.AdditionalMacAddresses, netInt.MacAddress.String()) + } + } + return config +} + +// NetworkDisconnect removes a container from the network +func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) error { + // only the bridge mode supports cni networks + if err := isBridgeNetMode(c.config.NetMode); err != nil { + return err + } + + c.lock.Lock() + defer c.lock.Unlock() + + networks, err := c.networks() + if err != nil { + return err + } + + // check if network exists and if the input is a ID we get the name + // CNI only uses names so it is important that we only use the name + netName, err = c.runtime.normalizeNetworkName(netName) + if err != nil { + return err + } + + _, nameExists := networks[netName] + if !nameExists && len(networks) > 0 { + return fmt.Errorf("container %s is not connected to network %s", nameOrID, netName) + } + + if err := c.syncContainer(); err != nil { + return err + } + // get network status before we disconnect + networkStatus := c.getNetworkStatus() + + if err := c.runtime.state.NetworkDisconnect(c, netName); err != nil { + return err + } + + c.newNetworkEvent(events.NetworkDisconnect, netName) + if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { + return nil + } + + if c.state.NetNS == nil { + return fmt.Errorf("unable to disconnect %s from %s: %w", nameOrID, netName, define.ErrNoNetwork) + } + + opts := types.NetworkOptions{ + ContainerID: c.config.ID, + ContainerName: getCNIPodName(c), + } + opts.PortMappings = c.convertPortMappings() + opts.Networks = map[string]types.PerNetworkOptions{ + netName: networks[netName], + } + + if err := c.runtime.teardownNetwork(c.state.NetNS.Path(), opts); err != nil { + return err + } + + // update network status if container is running + oldStatus, statusExist := networkStatus[netName] + delete(networkStatus, netName) + c.state.NetworkStatus = networkStatus + err = c.save() + if err != nil { + return err + } + + // Reload ports when there are still connected networks, maybe we removed the network interface with the child ip. + // Reloading without connected networks does not make sense, so we can skip this step. + if rootless.IsRootless() && len(networkStatus) > 0 { + if err := c.reloadRootlessRLKPortMapping(); err != nil { + return err + } + } + + // Update resolv.conf if required + if statusExist { + stringIPs := make([]string, 0, len(oldStatus.DNSServerIPs)) + for _, ip := range oldStatus.DNSServerIPs { + stringIPs = append(stringIPs, ip.String()) + } + if len(stringIPs) > 0 { + logrus.Debugf("Removing DNS Servers %v from resolv.conf", stringIPs) + if err := c.removeNameserver(stringIPs); err != nil { + return err + } + } + + // update /etc/hosts file + if file, ok := c.state.BindMounts[config.DefaultHostsFile]; ok { + // sync the names with c.getHostsEntries() + names := []string{c.Hostname(), c.config.Name} + rm := etchosts.GetNetworkHostEntries(map[string]types.StatusBlock{netName: oldStatus}, names...) + if len(rm) > 0 { + // make sure to lock this file to prevent concurrent writes when + // this is used a net dependency container + lock, err := lockfile.GetLockfile(file) + if err != nil { + return fmt.Errorf("failed to lock hosts file: %w", err) + } + logrus.Debugf("Remove /etc/hosts entries %v", rm) + lock.Lock() + err = etchosts.Remove(file, rm) + lock.Unlock() + if err != nil { + return err + } + } + } + } + return nil +} + +// ConnectNetwork connects a container to a given network +func (c *Container) NetworkConnect(nameOrID, netName string, netOpts types.PerNetworkOptions) error { + // only the bridge mode supports cni networks + if err := isBridgeNetMode(c.config.NetMode); err != nil { + return err + } + + c.lock.Lock() + defer c.lock.Unlock() + + networks, err := c.networks() + if err != nil { + return err + } + + // check if network exists and if the input is a ID we get the name + // CNI only uses names so it is important that we only use the name + netName, err = c.runtime.normalizeNetworkName(netName) + if err != nil { + return err + } + + if err := c.syncContainer(); err != nil { + return err + } + + // get network status before we connect + networkStatus := c.getNetworkStatus() + + // always add the short id as alias for docker compat + netOpts.Aliases = append(netOpts.Aliases, c.config.ID[:12]) + + if netOpts.InterfaceName == "" { + netOpts.InterfaceName = getFreeInterfaceName(networks) + if netOpts.InterfaceName == "" { + return errors.New("could not find free network interface name") + } + } + + if err := c.runtime.state.NetworkConnect(c, netName, netOpts); err != nil { + // Docker compat: treat requests to attach already attached networks as a no-op, ignoring opts + if errors.Is(err, define.ErrNetworkConnected) && c.ensureState(define.ContainerStateConfigured) { + return nil + } + + return err + } + c.newNetworkEvent(events.NetworkConnect, netName) + if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { + return nil + } + if c.state.NetNS == nil { + return fmt.Errorf("unable to connect %s to %s: %w", nameOrID, netName, define.ErrNoNetwork) + } + + opts := types.NetworkOptions{ + ContainerID: c.config.ID, + ContainerName: getCNIPodName(c), + } + opts.PortMappings = c.convertPortMappings() + opts.Networks = map[string]types.PerNetworkOptions{ + netName: netOpts, + } + + results, err := c.runtime.setUpNetwork(c.state.NetNS.Path(), opts) + if err != nil { + return err + } + if len(results) != 1 { + return errors.New("when adding aliases, results must be of length 1") + } + + // we need to get the old host entries before we add the new one to the status + // if we do not add do it here we will get the wrong existing entries which will throw of the logic + // we could also copy the map but this does not seem worth it + // sync the hostNames with c.getHostsEntries() + hostNames := []string{c.Hostname(), c.config.Name} + oldHostEntries := etchosts.GetNetworkHostEntries(networkStatus, hostNames...) + + // update network status + if networkStatus == nil { + networkStatus = make(map[string]types.StatusBlock, 1) + } + networkStatus[netName] = results[netName] + c.state.NetworkStatus = networkStatus + + err = c.save() + if err != nil { + return err + } + + // The first network needs a port reload to set the correct child ip for the rootlessport process. + // Adding a second network does not require a port reload because the child ip is still valid. + if rootless.IsRootless() && len(networks) == 0 { + if err := c.reloadRootlessRLKPortMapping(); err != nil { + return err + } + } + + ipv6, err := c.checkForIPv6(networkStatus) + if err != nil { + return err + } + + // Update resolv.conf if required + stringIPs := make([]string, 0, len(results[netName].DNSServerIPs)) + for _, ip := range results[netName].DNSServerIPs { + if (ip.To4() == nil) && !ipv6 { + continue + } + stringIPs = append(stringIPs, ip.String()) + } + if len(stringIPs) > 0 { + logrus.Debugf("Adding DNS Servers %v to resolv.conf", stringIPs) + if err := c.addNameserver(stringIPs); err != nil { + return err + } + } + + // update /etc/hosts file + if file, ok := c.state.BindMounts[config.DefaultHostsFile]; ok { + // make sure to lock this file to prevent concurrent writes when + // this is used a net dependency container + lock, err := lockfile.GetLockfile(file) + if err != nil { + return fmt.Errorf("failed to lock hosts file: %w", err) + } + new := etchosts.GetNetworkHostEntries(results, hostNames...) + logrus.Debugf("Add /etc/hosts entries %v", new) + // use special AddIfExists API to make sure we only add new entries if an old one exists + // see the AddIfExists() comment for more information + lock.Lock() + err = etchosts.AddIfExists(file, oldHostEntries, new) + lock.Unlock() + if err != nil { + return err + } + } + + return nil +} + +// get a free interface name for a new network +// return an empty string if no free name was found +func getFreeInterfaceName(networks map[string]types.PerNetworkOptions) string { + ifNames := make([]string, 0, len(networks)) + for _, opts := range networks { + ifNames = append(ifNames, opts.InterfaceName) + } + for i := 0; i < 100000; i++ { + ifName := fmt.Sprintf("eth%d", i) + if !util.StringInSlice(ifName, ifNames) { + return ifName + } + } + return "" +} + +// DisconnectContainerFromNetwork removes a container from its CNI network +func (r *Runtime) DisconnectContainerFromNetwork(nameOrID, netName string, force bool) error { + ctr, err := r.LookupContainer(nameOrID) + if err != nil { + return err + } + return ctr.NetworkDisconnect(nameOrID, netName, force) +} + +// ConnectContainerToNetwork connects a container to a CNI network +func (r *Runtime) ConnectContainerToNetwork(nameOrID, netName string, netOpts types.PerNetworkOptions) error { + ctr, err := r.LookupContainer(nameOrID) + if err != nil { + return err + } + return ctr.NetworkConnect(nameOrID, netName, netOpts) +} + +// normalizeNetworkName takes a network name, a partial or a full network ID and returns the network name. +// If the network is not found a errors is returned. +func (r *Runtime) normalizeNetworkName(nameOrID string) (string, error) { + net, err := r.network.NetworkInspect(nameOrID) + if err != nil { + return "", err + } + return net.Name, nil +} + +// ocicniPortsToNetTypesPorts convert the old port format to the new one +// while deduplicating ports into ranges +func ocicniPortsToNetTypesPorts(ports []types.OCICNIPortMapping) []types.PortMapping { + if len(ports) == 0 { + return nil + } + + newPorts := make([]types.PortMapping, 0, len(ports)) + + // first sort the ports + sort.Slice(ports, func(i, j int) bool { + return compareOCICNIPorts(ports[i], ports[j]) + }) + + // we already check if the slice is empty so we can use the first element + currentPort := types.PortMapping{ + HostIP: ports[0].HostIP, + HostPort: uint16(ports[0].HostPort), + ContainerPort: uint16(ports[0].ContainerPort), + Protocol: ports[0].Protocol, + Range: 1, + } + + for i := 1; i < len(ports); i++ { + if ports[i].HostIP == currentPort.HostIP && + ports[i].Protocol == currentPort.Protocol && + ports[i].HostPort-int32(currentPort.Range) == int32(currentPort.HostPort) && + ports[i].ContainerPort-int32(currentPort.Range) == int32(currentPort.ContainerPort) { + currentPort.Range++ + } else { + newPorts = append(newPorts, currentPort) + currentPort = types.PortMapping{ + HostIP: ports[i].HostIP, + HostPort: uint16(ports[i].HostPort), + ContainerPort: uint16(ports[i].ContainerPort), + Protocol: ports[i].Protocol, + Range: 1, + } + } + } + newPorts = append(newPorts, currentPort) + return newPorts +} + +// compareOCICNIPorts will sort the ocicni ports by +// 1) host ip +// 2) protocol +// 3) hostPort +// 4) container port +func compareOCICNIPorts(i, j types.OCICNIPortMapping) bool { + if i.HostIP != j.HostIP { + return i.HostIP < j.HostIP + } + + if i.Protocol != j.Protocol { + return i.Protocol < j.Protocol + } + + if i.HostPort != j.HostPort { + return i.HostPort < j.HostPort + } + + return i.ContainerPort < j.ContainerPort +} diff --git a/libpod/networking_freebsd.go b/libpod/networking_freebsd.go new file mode 100644 index 000000000..230efc99d --- /dev/null +++ b/libpod/networking_freebsd.go @@ -0,0 +1,268 @@ +//go:build freebsd +// +build freebsd + +package libpod + +import ( + "crypto/rand" + jdec "encoding/json" + "errors" + "fmt" + "net" + "os/exec" + "path/filepath" + + "github.com/containers/buildah/pkg/jail" + "github.com/containers/common/libnetwork/types" + "github.com/containers/storage/pkg/lockfile" + "github.com/sirupsen/logrus" +) + +type Netstat struct { + Statistics NetstatInterface `json:"statistics"` +} + +type NetstatInterface struct { + Interface []NetstatAddress `json:"interface"` +} + +type NetstatAddress struct { + Name string `json:"name"` + Flags string `json:"flags"` + Mtu int `json:"mtu"` + Network string `json:"network"` + Address string `json:"address"` + + ReceivedPackets uint64 `json:"received-packets"` + ReceivedBytes uint64 `json:"received-bytes"` + ReceivedErrors uint64 `json:"received-errors"` + + SentPackets uint64 `json:"sent-packets"` + SentBytes uint64 `json:"sent-bytes"` + SentErrors uint64 `json:"send-errors"` + + DroppedPackets uint64 `json:"dropped-packets"` + + Collisions uint64 `json:"collisions"` +} + +// copied from github.com/vishvanada/netlink which does not build on freebsd +type LinkStatistics64 struct { + RxPackets uint64 + TxPackets uint64 + RxBytes uint64 + TxBytes uint64 + RxErrors uint64 + TxErrors uint64 + RxDropped uint64 + TxDropped uint64 + Multicast uint64 + Collisions uint64 + RxLengthErrors uint64 + RxOverErrors uint64 + RxCrcErrors uint64 + RxFrameErrors uint64 + RxFifoErrors uint64 + RxMissedErrors uint64 + TxAbortedErrors uint64 + TxCarrierErrors uint64 + TxFifoErrors uint64 + TxHeartbeatErrors uint64 + TxWindowErrors uint64 + RxCompressed uint64 + TxCompressed uint64 +} + +type RootlessNetNS struct { + dir string + Lock lockfile.Locker +} + +// getPath will join the given path to the rootless netns dir +func (r *RootlessNetNS) getPath(path string) string { + return filepath.Join(r.dir, path) +} + +// Do - run the given function in the rootless netns. +// It does not lock the rootlessCNI lock, the caller +// should only lock when needed, e.g. for cni operations. +func (r *RootlessNetNS) Do(toRun func() error) error { + return errors.New("not supported on freebsd") +} + +// Cleanup the rootless network namespace if needed. +// It checks if we have running containers with the bridge network mode. +// Cleanup() expects that r.Lock is locked +func (r *RootlessNetNS) Cleanup(runtime *Runtime) error { + return errors.New("not supported on freebsd") +} + +// GetRootlessNetNs returns the rootless netns object. If create is set to true +// the rootless network namespace will be created if it does not exists already. +// If called as root it returns always nil. +// On success the returned RootlessCNI lock is locked and must be unlocked by the caller. +func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { + return nil, nil +} + +func GetSlirp4netnsIP(subnet *net.IPNet) (*net.IP, error) { + return nil, errors.New("not implemented GetSlirp4netnsIP") +} + +// While there is code in container_internal.go which calls this, in +// my testing network creation always seems to go through createNetNS. +func (r *Runtime) setupNetNS(ctr *Container) error { + return errors.New("not implemented (*Runtime) setupNetNS") +} + +// Create and configure a new network namespace for a container +func (r *Runtime) configureNetNS(ctr *Container, ctrNS *jailNetNS) (status map[string]types.StatusBlock, rerr error) { + if err := r.exposeMachinePorts(ctr.config.PortMappings); err != nil { + return nil, err + } + defer func() { + // make sure to unexpose the gvproxy ports when an error happens + if rerr != nil { + if err := r.unexposeMachinePorts(ctr.config.PortMappings); err != nil { + logrus.Errorf("failed to free gvproxy machine ports: %v", err) + } + } + }() + networks, err := ctr.networks() + if err != nil { + return nil, err + } + // All networks have been removed from the container. + // This is effectively forcing net=none. + if len(networks) == 0 { + return nil, nil + } + + netOpts := ctr.getNetworkOptions(networks) + netStatus, err := r.setUpNetwork(ctrNS.Name, netOpts) + if err != nil { + return nil, err + } + + return netStatus, err +} + +// Create and configure a new network namespace for a container +func (r *Runtime) createNetNS(ctr *Container) (n *jailNetNS, q map[string]types.StatusBlock, retErr error) { + b := make([]byte, 16) + _, err := rand.Reader.Read(b) + if err != nil { + return nil, nil, fmt.Errorf("failed to generate random vnet name: %v", err) + } + ctrNS := &jailNetNS{Name: fmt.Sprintf("vnet-%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:])} + + jconf := jail.NewConfig() + jconf.Set("name", ctrNS.Name) + jconf.Set("vnet", jail.NEW) + jconf.Set("children.max", 1) + jconf.Set("persist", true) + jconf.Set("enforce_statfs", 0) + jconf.Set("devfs_ruleset", 4) + jconf.Set("allow.raw_sockets", true) + jconf.Set("allow.chflags", true) + jconf.Set("securelevel", -1) + if _, err := jail.Create(jconf); err != nil { + logrus.Debugf("Failed to create vnet jail %s for container %s", ctrNS.Name, ctr.ID()) + } + + logrus.Debugf("Created vnet jail %s for container %s", ctrNS.Name, ctr.ID()) + + var networkStatus map[string]types.StatusBlock + networkStatus, err = r.configureNetNS(ctr, ctrNS) + return ctrNS, networkStatus, err +} + +// Tear down a network namespace, undoing all state associated with it. +func (r *Runtime) teardownNetNS(ctr *Container) error { + if err := r.unexposeMachinePorts(ctr.config.PortMappings); err != nil { + // do not return an error otherwise we would prevent network cleanup + logrus.Errorf("failed to free gvproxy machine ports: %v", err) + } + if err := r.teardownCNI(ctr); err != nil { + return err + } + + if ctr.state.NetNS != nil { + // Rather than destroying the jail immediately, reset the + // persist flag so that it will live until the container is + // done. + netjail, err := jail.FindByName(ctr.state.NetNS.Name) + if err != nil { + return fmt.Errorf("finding network jail %s: %w", ctr.state.NetNS.Name, err) + } + jconf := jail.NewConfig() + jconf.Set("persist", false) + if err := netjail.Set(jconf); err != nil { + return fmt.Errorf("releasing network jail %s: %w", ctr.state.NetNS.Name, err) + } + + ctr.state.NetNS = nil + } + + return nil +} + +func getContainerNetIO(ctr *Container) (*LinkStatistics64, error) { + if ctr.state.NetNS == nil { + // If NetNS is nil, it was set as none, and no netNS + // was set up this is a valid state and thus return no + // error, nor any statistics + return nil, nil + } + + // FIXME get the interface from the container netstatus + cmd := exec.Command("jexec", ctr.state.NetNS.Name, "netstat", "-bI", "eth0", "--libxo", "json") + out, err := cmd.Output() + if err != nil { + return nil, err + } + stats := Netstat{} + if err := jdec.Unmarshal(out, &stats); err != nil { + return nil, err + } + + // Find the link stats + for _, ifaddr := range stats.Statistics.Interface { + if ifaddr.Mtu > 0 { + return &LinkStatistics64{ + RxPackets: ifaddr.ReceivedPackets, + TxPackets: ifaddr.SentPackets, + RxBytes: ifaddr.ReceivedBytes, + TxBytes: ifaddr.SentBytes, + RxErrors: ifaddr.ReceivedErrors, + TxErrors: ifaddr.SentErrors, + RxDropped: ifaddr.DroppedPackets, + Collisions: ifaddr.Collisions, + }, nil + } + } + + return &LinkStatistics64{}, nil +} + +func (c *Container) joinedNetworkNSPath() string { + if c.state.NetNS != nil { + return c.state.NetNS.Name + } else { + return "" + } +} + +func (c *Container) inspectJoinedNetworkNS(networkns string) (q types.StatusBlock, retErr error) { + // TODO: extract interface information from the vnet jail + return types.StatusBlock{}, nil + +} + +func (c *Container) reloadRootlessRLKPortMapping() error { + return errors.New("unsupported (*Container).reloadRootlessRLKPortMapping") +} + +func (c *Container) setupRootlessNetwork() error { + return nil +} diff --git a/libpod/networking_linux.go b/libpod/networking_linux.go index c05796768..e27ec8e9d 100644 --- a/libpod/networking_linux.go +++ b/libpod/networking_linux.go @@ -13,25 +13,17 @@ import ( "os" "os/exec" "path/filepath" - "regexp" - "sort" "strconv" "strings" "syscall" "time" "github.com/containernetworking/plugins/pkg/ns" - "github.com/containers/common/libnetwork/etchosts" "github.com/containers/common/libnetwork/resolvconf" "github.com/containers/common/libnetwork/types" - "github.com/containers/common/pkg/config" - "github.com/containers/common/pkg/machine" "github.com/containers/common/pkg/netns" "github.com/containers/common/pkg/util" - "github.com/containers/podman/v4/libpod/define" - "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/errorhandling" - "github.com/containers/podman/v4/pkg/namespaces" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/utils" "github.com/containers/storage/pkg/lockfile" @@ -59,39 +51,6 @@ const ( persistentCNIDir = "/var/lib/cni" ) -// convertPortMappings will remove the HostIP part from the ports when running inside podman machine. -// This is need because a HostIP of 127.0.0.1 would now allow the gvproxy forwarder to reach to open ports. -// For machine the HostIP must only be used by gvproxy and never in the VM. -func (c *Container) convertPortMappings() []types.PortMapping { - if !machine.IsGvProxyBased() || len(c.config.PortMappings) == 0 { - return c.config.PortMappings - } - // if we run in a machine VM we have to ignore the host IP part - newPorts := make([]types.PortMapping, 0, len(c.config.PortMappings)) - for _, port := range c.config.PortMappings { - port.HostIP = "" - newPorts = append(newPorts, port) - } - return newPorts -} - -func (c *Container) getNetworkOptions(networkOpts map[string]types.PerNetworkOptions) types.NetworkOptions { - opts := types.NetworkOptions{ - ContainerID: c.config.ID, - ContainerName: getCNIPodName(c), - } - opts.PortMappings = c.convertPortMappings() - - // If the container requested special network options use this instead of the config. - // This is the case for container restore or network reload. - if c.perNetworkOpts != nil { - opts.Networks = c.perNetworkOpts - } else { - opts.Networks = networkOpts - } - return opts -} - type RootlessNetNS struct { ns ns.NetNS dir string @@ -354,7 +313,7 @@ func (r *RootlessNetNS) Cleanup(runtime *Runtime) error { } } if err != nil { - logrus.Errorf("Failed to kill slirp4netns process: %s", err) + logrus.Errorf("Failed to kill slirp4netns process: %v", err) } err = os.RemoveAll(r.dir) if err != nil { @@ -411,13 +370,13 @@ func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { if err != nil { if !new { // return a error if we could not get the namespace and should no create one - return nil, fmt.Errorf("error getting rootless network namespace: %w", err) + return nil, fmt.Errorf("getting rootless network namespace: %w", err) } // create a new namespace logrus.Debugf("creating rootless network namespace with name %q", netnsName) ns, err = netns.NewNSWithName(netnsName) if err != nil { - return nil, fmt.Errorf("error creating rootless network namespace: %w", err) + return nil, fmt.Errorf("creating rootless network namespace: %w", err) } // set up slirp4netns here path := r.config.Engine.NetworkCmdPath @@ -442,7 +401,7 @@ func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { } slirpFeatures, err := checkSlirpFlags(path) if err != nil { - return nil, fmt.Errorf("error checking slirp4netns binary %s: %q: %w", path, err, err) + return nil, fmt.Errorf("checking slirp4netns binary %s: %q: %w", path, err, err) } cmdArgs, err := createBasicSlirp4netnsCmdArgs(netOptions, slirpFeatures) if err != nil { @@ -589,41 +548,6 @@ func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { return rootlessNetNS, nil } -// setUpNetwork will set up the the networks, on error it will also tear down the cni -// networks. If rootless it will join/create the rootless network namespace. -func (r *Runtime) setUpNetwork(ns string, opts types.NetworkOptions) (map[string]types.StatusBlock, error) { - rootlessNetNS, err := r.GetRootlessNetNs(true) - if err != nil { - return nil, err - } - var results map[string]types.StatusBlock - setUpPod := func() error { - results, err = r.network.Setup(ns, types.SetupOptions{NetworkOptions: opts}) - return err - } - // rootlessNetNS is nil if we are root - if rootlessNetNS != nil { - // execute the setup in the rootless net ns - err = rootlessNetNS.Do(setUpPod) - rootlessNetNS.Lock.Unlock() - } else { - err = setUpPod() - } - return results, err -} - -// getCNIPodName return the pod name (hostname) used by CNI and the dnsname plugin. -// If we are in the pod network namespace use the pod name otherwise the container name -func getCNIPodName(c *Container) string { - if c.config.NetMode.IsPod() || c.IsInfra() { - pod, err := c.runtime.state.Pod(c.PodID()) - if err == nil { - return pod.Name() - } - } - return c.Name() -} - // Create and configure a new network namespace for a container func (r *Runtime) configureNetNS(ctr *Container, ctrNS ns.NetNS) (status map[string]types.StatusBlock, rerr error) { if err := r.exposeMachinePorts(ctr.config.PortMappings); err != nil { @@ -675,7 +599,7 @@ func (r *Runtime) configureNetNS(ctr *Container, ctrNS ns.NetNS) (status map[str func (r *Runtime) createNetNS(ctr *Container) (n ns.NetNS, q map[string]types.StatusBlock, retErr error) { ctrNS, err := netns.NewNS() if err != nil { - return nil, nil, fmt.Errorf("error creating network namespace for container %s: %w", ctr.ID(), err) + return nil, nil, fmt.Errorf("creating network namespace for container %s: %w", ctr.ID(), err) } defer func() { if retErr != nil { @@ -742,7 +666,7 @@ func (r *Runtime) setupNetNS(ctr *Container) error { func joinNetNS(path string) (ns.NetNS, error) { netNS, err := ns.GetNS(path) if err != nil { - return nil, fmt.Errorf("error retrieving network namespace at %s: %w", path, err) + return nil, fmt.Errorf("retrieving network namespace at %s: %w", path, err) } return netNS, nil @@ -758,7 +682,7 @@ func (r *Runtime) closeNetNS(ctr *Container) error { } if err := ctr.state.NetNS.Close(); err != nil { - return fmt.Errorf("error closing network namespace for container %s: %w", ctr.ID(), err) + return fmt.Errorf("closing network namespace for container %s: %w", ctr.ID(), err) } ctr.state.NetNS = nil @@ -766,56 +690,6 @@ func (r *Runtime) closeNetNS(ctr *Container) error { return nil } -// Tear down a container's network configuration and joins the -// rootless net ns as rootless user -func (r *Runtime) teardownNetwork(ns string, opts types.NetworkOptions) error { - rootlessNetNS, err := r.GetRootlessNetNs(false) - if err != nil { - return err - } - tearDownPod := func() error { - if err := r.network.Teardown(ns, types.TeardownOptions{NetworkOptions: opts}); err != nil { - return fmt.Errorf("error tearing down network namespace configuration for container %s: %w", opts.ContainerID, err) - } - return nil - } - - // rootlessNetNS is nil if we are root - if rootlessNetNS != nil { - // execute the cni setup in the rootless net ns - err = rootlessNetNS.Do(tearDownPod) - if cerr := rootlessNetNS.Cleanup(r); cerr != nil { - logrus.WithError(err).Error("failed to clean up rootless netns") - } - rootlessNetNS.Lock.Unlock() - } else { - err = tearDownPod() - } - return err -} - -// Tear down a container's CNI network configuration, but do not tear down the -// namespace itself. -func (r *Runtime) teardownCNI(ctr *Container) error { - if ctr.state.NetNS == nil { - // The container has no network namespace, we're set - return nil - } - - logrus.Debugf("Tearing down network namespace at %s for container %s", ctr.state.NetNS.Path(), ctr.ID()) - - networks, err := ctr.networks() - if err != nil { - return err - } - - if !ctr.config.NetMode.IsSlirp4netns() && len(networks) > 0 { - netOpts := ctr.getNetworkOptions(networks) - return r.teardownNetwork(ctr.state.NetNS.Path(), netOpts) - } - return nil -} - // Tear down a network namespace, undoing all state associated with it. func (r *Runtime) teardownNetNS(ctr *Container) error { if err := r.unexposeMachinePorts(ctr.config.PortMappings); err != nil { @@ -828,12 +702,12 @@ func (r *Runtime) teardownNetNS(ctr *Container) error { // First unmount the namespace if err := netns.UnmountNS(ctr.state.NetNS); err != nil { - return fmt.Errorf("error unmounting network namespace for container %s: %w", ctr.ID(), err) + return fmt.Errorf("unmounting network namespace for container %s: %w", ctr.ID(), err) } // Now close the open file descriptor if err := ctr.state.NetNS.Close(); err != nil { - return fmt.Errorf("error closing network namespace for container %s: %w", ctr.ID(), err) + return fmt.Errorf("closing network namespace for container %s: %w", ctr.ID(), err) } ctr.state.NetNS = nil @@ -862,72 +736,6 @@ func getContainerNetNS(ctr *Container) (string, *Container, error) { return "", nil, nil } -// isBridgeNetMode checks if the given network mode is bridge. -// It returns nil when it is set to bridge and an error otherwise. -func isBridgeNetMode(n namespaces.NetworkMode) error { - if !n.IsBridge() { - return fmt.Errorf("%q is not supported: %w", n, define.ErrNetworkModeInvalid) - } - return nil -} - -// Reload only works with containers with a configured network. -// It will tear down, and then reconfigure, the network of the container. -// This is mainly used when a reload of firewall rules wipes out existing -// firewall configuration. -// Efforts will be made to preserve MAC and IP addresses, but this only works if -// the container only joined a single CNI network, and was only assigned a -// single MAC or IP. -// Only works on root containers at present, though in the future we could -// extend this to stop + restart slirp4netns -func (r *Runtime) reloadContainerNetwork(ctr *Container) (map[string]types.StatusBlock, error) { - if ctr.state.NetNS == nil { - return nil, fmt.Errorf("container %s network is not configured, refusing to reload: %w", ctr.ID(), define.ErrCtrStateInvalid) - } - if err := isBridgeNetMode(ctr.config.NetMode); err != nil { - return nil, err - } - logrus.Infof("Going to reload container %s network", ctr.ID()) - - err := r.teardownCNI(ctr) - if err != nil { - // teardownCNI will error if the iptables rules do not exists and this is the case after - // a firewall reload. The purpose of network reload is to recreate the rules if they do - // not exists so we should not log this specific error as error. This would confuse users otherwise. - // iptables-legacy and iptables-nft will create different errors make sure to match both. - b, rerr := regexp.MatchString("Couldn't load target `CNI-[a-f0-9]{24}':No such file or directory|Chain 'CNI-[a-f0-9]{24}' does not exist", err.Error()) - if rerr == nil && !b { - logrus.Error(err) - } else { - logrus.Info(err) - } - } - - networkOpts, err := ctr.networks() - if err != nil { - return nil, err - } - - // Set the same network settings as before.. - netStatus := ctr.getNetworkStatus() - for network, perNetOpts := range networkOpts { - for name, netInt := range netStatus[network].Interfaces { - perNetOpts.InterfaceName = name - perNetOpts.StaticMAC = netInt.MacAddress - for _, netAddress := range netInt.Subnets { - perNetOpts.StaticIPs = append(perNetOpts.StaticIPs, netAddress.IPNet.IP) - } - // Normally interfaces have a length of 1, only for some special cni configs we could get more. - // For now just use the first interface to get the ips this should be good enough for most cases. - break - } - networkOpts[network] = perNetOpts - } - ctr.perNetworkOpts = networkOpts - - return r.configureNetNS(ctr, ctr.state.NetNS) -} - // TODO (5.0): return the statistics per network interface // This would allow better compat with docker. func getContainerNetIO(ctr *Container) (*netlink.LinkStatistics, error) { @@ -981,110 +789,6 @@ func getContainerNetIO(ctr *Container) (*netlink.LinkStatistics, error) { return netStats, err } -// Produce an InspectNetworkSettings containing information on the container -// network. -func (c *Container) getContainerNetworkInfo() (*define.InspectNetworkSettings, error) { - if c.config.NetNsCtr != "" { - netNsCtr, err := c.runtime.GetContainer(c.config.NetNsCtr) - if err != nil { - return nil, err - } - // see https://github.com/containers/podman/issues/10090 - // the container has to be locked for syncContainer() - netNsCtr.lock.Lock() - defer netNsCtr.lock.Unlock() - // Have to sync to ensure that state is populated - if err := netNsCtr.syncContainer(); err != nil { - return nil, err - } - logrus.Debugf("Container %s shares network namespace, retrieving network info of container %s", c.ID(), c.config.NetNsCtr) - - return netNsCtr.getContainerNetworkInfo() - } - - settings := new(define.InspectNetworkSettings) - settings.Ports = makeInspectPortBindings(c.config.PortMappings, c.config.ExposedPorts) - - networks, err := c.networks() - if err != nil { - return nil, err - } - - if c.state.NetNS == nil { - if networkNSPath := c.joinedNetworkNSPath(); networkNSPath != "" { - if result, err := c.inspectJoinedNetworkNS(networkNSPath); err == nil { - // fallback to dummy configuration - settings.InspectBasicNetworkConfig = resultToBasicNetworkConfig(result) - return settings, nil - } - // do not propagate error inspecting a joined network ns - logrus.Errorf("Inspecting network namespace: %s of container %s: %v", networkNSPath, c.ID(), err) - } - // We can't do more if the network is down. - - // We still want to make dummy configurations for each CNI net - // the container joined. - if len(networks) > 0 { - settings.Networks = make(map[string]*define.InspectAdditionalNetwork, len(networks)) - for net, opts := range networks { - cniNet := new(define.InspectAdditionalNetwork) - cniNet.NetworkID = net - cniNet.Aliases = opts.Aliases - settings.Networks[net] = cniNet - } - } - - return settings, nil - } - - // Set network namespace path - settings.SandboxKey = c.state.NetNS.Path() - - netStatus := c.getNetworkStatus() - // If this is empty, we're probably slirp4netns - if len(netStatus) == 0 { - return settings, nil - } - - // If we have networks - handle that here - if len(networks) > 0 { - if len(networks) != len(netStatus) { - return nil, fmt.Errorf("network inspection mismatch: asked to join %d network(s) %v, but have information on %d network(s): %w", len(networks), networks, len(netStatus), define.ErrInternal) - } - - settings.Networks = make(map[string]*define.InspectAdditionalNetwork) - - for name, opts := range networks { - result := netStatus[name] - addedNet := new(define.InspectAdditionalNetwork) - addedNet.NetworkID = name - addedNet.Aliases = opts.Aliases - addedNet.InspectBasicNetworkConfig = resultToBasicNetworkConfig(result) - - settings.Networks[name] = addedNet - } - - // if not only the default network is connected we can return here - // otherwise we have to populate the InspectBasicNetworkConfig settings - _, isDefaultNet := networks[c.runtime.config.Network.DefaultNetwork] - if !(len(networks) == 1 && isDefaultNet) { - return settings, nil - } - } - - // If not joining networks, we should have at most 1 result - if len(netStatus) > 1 { - return nil, fmt.Errorf("should have at most 1 network status result if not joining networks, instead got %d: %w", len(netStatus), define.ErrInternal) - } - - if len(netStatus) == 1 { - for _, status := range netStatus { - settings.InspectBasicNetworkConfig = resultToBasicNetworkConfig(status) - } - } - return settings, nil -} - func (c *Container) joinedNetworkNSPath() string { for _, namespace := range c.config.Spec.Linux.Namespaces { if namespace.Type == specs.NetworkNamespace { @@ -1151,49 +855,6 @@ func (c *Container) inspectJoinedNetworkNS(networkns string) (q types.StatusBloc return result, err } -// resultToBasicNetworkConfig produces an InspectBasicNetworkConfig from a CNI -// result -func resultToBasicNetworkConfig(result types.StatusBlock) define.InspectBasicNetworkConfig { - config := define.InspectBasicNetworkConfig{} - interfaceNames := make([]string, 0, len(result.Interfaces)) - for interfaceName := range result.Interfaces { - interfaceNames = append(interfaceNames, interfaceName) - } - // ensure consistent inspect results by sorting - sort.Strings(interfaceNames) - for _, interfaceName := range interfaceNames { - netInt := result.Interfaces[interfaceName] - for _, netAddress := range netInt.Subnets { - size, _ := netAddress.IPNet.Mask.Size() - if netAddress.IPNet.IP.To4() != nil { - // ipv4 - if config.IPAddress == "" { - config.IPAddress = netAddress.IPNet.IP.String() - config.IPPrefixLen = size - config.Gateway = netAddress.Gateway.String() - } else { - config.SecondaryIPAddresses = append(config.SecondaryIPAddresses, define.Address{Addr: netAddress.IPNet.IP.String(), PrefixLength: size}) - } - } else { - // ipv6 - if config.GlobalIPv6Address == "" { - config.GlobalIPv6Address = netAddress.IPNet.IP.String() - config.GlobalIPv6PrefixLen = size - config.IPv6Gateway = netAddress.Gateway.String() - } else { - config.SecondaryIPv6Addresses = append(config.SecondaryIPv6Addresses, define.Address{Addr: netAddress.IPNet.IP.String(), PrefixLength: size}) - } - } - } - if config.MacAddress == "" { - config.MacAddress = netInt.MacAddress.String() - } else { - config.AdditionalMacAddresses = append(config.AdditionalMacAddresses, netInt.MacAddress.String()) - } - } - return config -} - type logrusDebugWriter struct { prefix string } @@ -1202,363 +863,3 @@ func (w *logrusDebugWriter) Write(p []byte) (int, error) { logrus.Debugf("%s%s", w.prefix, string(p)) return len(p), nil } - -// NetworkDisconnect removes a container from the network -func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) error { - // only the bridge mode supports cni networks - if err := isBridgeNetMode(c.config.NetMode); err != nil { - return err - } - - c.lock.Lock() - defer c.lock.Unlock() - - networks, err := c.networks() - if err != nil { - return err - } - - // check if network exists and if the input is a ID we get the name - // CNI only uses names so it is important that we only use the name - netName, err = c.runtime.normalizeNetworkName(netName) - if err != nil { - return err - } - - _, nameExists := networks[netName] - if !nameExists && len(networks) > 0 { - return fmt.Errorf("container %s is not connected to network %s", nameOrID, netName) - } - - if err := c.syncContainer(); err != nil { - return err - } - // get network status before we disconnect - networkStatus := c.getNetworkStatus() - - if err := c.runtime.state.NetworkDisconnect(c, netName); err != nil { - return err - } - - c.newNetworkEvent(events.NetworkDisconnect, netName) - if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { - return nil - } - - if c.state.NetNS == nil { - return fmt.Errorf("unable to disconnect %s from %s: %w", nameOrID, netName, define.ErrNoNetwork) - } - - opts := types.NetworkOptions{ - ContainerID: c.config.ID, - ContainerName: getCNIPodName(c), - } - opts.PortMappings = c.convertPortMappings() - opts.Networks = map[string]types.PerNetworkOptions{ - netName: networks[netName], - } - - if err := c.runtime.teardownNetwork(c.state.NetNS.Path(), opts); err != nil { - return err - } - - // update network status if container is running - oldStatus, statusExist := networkStatus[netName] - delete(networkStatus, netName) - c.state.NetworkStatus = networkStatus - err = c.save() - if err != nil { - return err - } - - // Reload ports when there are still connected networks, maybe we removed the network interface with the child ip. - // Reloading without connected networks does not make sense, so we can skip this step. - if rootless.IsRootless() && len(networkStatus) > 0 { - if err := c.reloadRootlessRLKPortMapping(); err != nil { - return err - } - } - - // Update resolv.conf if required - if statusExist { - stringIPs := make([]string, 0, len(oldStatus.DNSServerIPs)) - for _, ip := range oldStatus.DNSServerIPs { - stringIPs = append(stringIPs, ip.String()) - } - if len(stringIPs) > 0 { - logrus.Debugf("Removing DNS Servers %v from resolv.conf", stringIPs) - if err := c.removeNameserver(stringIPs); err != nil { - return err - } - } - - // update /etc/hosts file - if file, ok := c.state.BindMounts[config.DefaultHostsFile]; ok { - // sync the names with c.getHostsEntries() - names := []string{c.Hostname(), c.config.Name} - rm := etchosts.GetNetworkHostEntries(map[string]types.StatusBlock{netName: oldStatus}, names...) - if len(rm) > 0 { - // make sure to lock this file to prevent concurrent writes when - // this is used a net dependency container - lock, err := lockfile.GetLockfile(file) - if err != nil { - return fmt.Errorf("failed to lock hosts file: %w", err) - } - logrus.Debugf("Remove /etc/hosts entries %v", rm) - lock.Lock() - err = etchosts.Remove(file, rm) - lock.Unlock() - if err != nil { - return err - } - } - } - } - return nil -} - -// ConnectNetwork connects a container to a given network -func (c *Container) NetworkConnect(nameOrID, netName string, netOpts types.PerNetworkOptions) error { - // only the bridge mode supports cni networks - if err := isBridgeNetMode(c.config.NetMode); err != nil { - return err - } - - c.lock.Lock() - defer c.lock.Unlock() - - networks, err := c.networks() - if err != nil { - return err - } - - // check if network exists and if the input is a ID we get the name - // CNI only uses names so it is important that we only use the name - netName, err = c.runtime.normalizeNetworkName(netName) - if err != nil { - return err - } - - if err := c.syncContainer(); err != nil { - return err - } - - // get network status before we connect - networkStatus := c.getNetworkStatus() - - // always add the short id as alias for docker compat - netOpts.Aliases = append(netOpts.Aliases, c.config.ID[:12]) - - if netOpts.InterfaceName == "" { - netOpts.InterfaceName = getFreeInterfaceName(networks) - if netOpts.InterfaceName == "" { - return errors.New("could not find free network interface name") - } - } - - if err := c.runtime.state.NetworkConnect(c, netName, netOpts); err != nil { - return err - } - c.newNetworkEvent(events.NetworkConnect, netName) - if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) { - return nil - } - if c.state.NetNS == nil { - return fmt.Errorf("unable to connect %s to %s: %w", nameOrID, netName, define.ErrNoNetwork) - } - - opts := types.NetworkOptions{ - ContainerID: c.config.ID, - ContainerName: getCNIPodName(c), - } - opts.PortMappings = c.convertPortMappings() - opts.Networks = map[string]types.PerNetworkOptions{ - netName: netOpts, - } - - results, err := c.runtime.setUpNetwork(c.state.NetNS.Path(), opts) - if err != nil { - return err - } - if len(results) != 1 { - return errors.New("when adding aliases, results must be of length 1") - } - - // we need to get the old host entries before we add the new one to the status - // if we do not add do it here we will get the wrong existing entries which will throw of the logic - // we could also copy the map but this does not seem worth it - // sync the hostNames with c.getHostsEntries() - hostNames := []string{c.Hostname(), c.config.Name} - oldHostEntries := etchosts.GetNetworkHostEntries(networkStatus, hostNames...) - - // update network status - if networkStatus == nil { - networkStatus = make(map[string]types.StatusBlock, 1) - } - networkStatus[netName] = results[netName] - c.state.NetworkStatus = networkStatus - - err = c.save() - if err != nil { - return err - } - - // The first network needs a port reload to set the correct child ip for the rootlessport process. - // Adding a second network does not require a port reload because the child ip is still valid. - if rootless.IsRootless() && len(networks) == 0 { - if err := c.reloadRootlessRLKPortMapping(); err != nil { - return err - } - } - - ipv6, err := c.checkForIPv6(networkStatus) - if err != nil { - return err - } - - // Update resolv.conf if required - stringIPs := make([]string, 0, len(results[netName].DNSServerIPs)) - for _, ip := range results[netName].DNSServerIPs { - if (ip.To4() == nil) && !ipv6 { - continue - } - stringIPs = append(stringIPs, ip.String()) - } - if len(stringIPs) > 0 { - logrus.Debugf("Adding DNS Servers %v to resolv.conf", stringIPs) - if err := c.addNameserver(stringIPs); err != nil { - return err - } - } - - // update /etc/hosts file - if file, ok := c.state.BindMounts[config.DefaultHostsFile]; ok { - // make sure to lock this file to prevent concurrent writes when - // this is used a net dependency container - lock, err := lockfile.GetLockfile(file) - if err != nil { - return fmt.Errorf("failed to lock hosts file: %w", err) - } - new := etchosts.GetNetworkHostEntries(results, hostNames...) - logrus.Debugf("Add /etc/hosts entries %v", new) - // use special AddIfExists API to make sure we only add new entries if an old one exists - // see the AddIfExists() comment for more information - lock.Lock() - err = etchosts.AddIfExists(file, oldHostEntries, new) - lock.Unlock() - if err != nil { - return err - } - } - - return nil -} - -// get a free interface name for a new network -// return an empty string if no free name was found -func getFreeInterfaceName(networks map[string]types.PerNetworkOptions) string { - ifNames := make([]string, 0, len(networks)) - for _, opts := range networks { - ifNames = append(ifNames, opts.InterfaceName) - } - for i := 0; i < 100000; i++ { - ifName := fmt.Sprintf("eth%d", i) - if !util.StringInSlice(ifName, ifNames) { - return ifName - } - } - return "" -} - -// DisconnectContainerFromNetwork removes a container from its CNI network -func (r *Runtime) DisconnectContainerFromNetwork(nameOrID, netName string, force bool) error { - ctr, err := r.LookupContainer(nameOrID) - if err != nil { - return err - } - return ctr.NetworkDisconnect(nameOrID, netName, force) -} - -// ConnectContainerToNetwork connects a container to a CNI network -func (r *Runtime) ConnectContainerToNetwork(nameOrID, netName string, netOpts types.PerNetworkOptions) error { - ctr, err := r.LookupContainer(nameOrID) - if err != nil { - return err - } - return ctr.NetworkConnect(nameOrID, netName, netOpts) -} - -// normalizeNetworkName takes a network name, a partial or a full network ID and returns the network name. -// If the network is not found a errors is returned. -func (r *Runtime) normalizeNetworkName(nameOrID string) (string, error) { - net, err := r.network.NetworkInspect(nameOrID) - if err != nil { - return "", err - } - return net.Name, nil -} - -// ocicniPortsToNetTypesPorts convert the old port format to the new one -// while deduplicating ports into ranges -func ocicniPortsToNetTypesPorts(ports []types.OCICNIPortMapping) []types.PortMapping { - if len(ports) == 0 { - return nil - } - - newPorts := make([]types.PortMapping, 0, len(ports)) - - // first sort the ports - sort.Slice(ports, func(i, j int) bool { - return compareOCICNIPorts(ports[i], ports[j]) - }) - - // we already check if the slice is empty so we can use the first element - currentPort := types.PortMapping{ - HostIP: ports[0].HostIP, - HostPort: uint16(ports[0].HostPort), - ContainerPort: uint16(ports[0].ContainerPort), - Protocol: ports[0].Protocol, - Range: 1, - } - - for i := 1; i < len(ports); i++ { - if ports[i].HostIP == currentPort.HostIP && - ports[i].Protocol == currentPort.Protocol && - ports[i].HostPort-int32(currentPort.Range) == int32(currentPort.HostPort) && - ports[i].ContainerPort-int32(currentPort.Range) == int32(currentPort.ContainerPort) { - currentPort.Range++ - } else { - newPorts = append(newPorts, currentPort) - currentPort = types.PortMapping{ - HostIP: ports[i].HostIP, - HostPort: uint16(ports[i].HostPort), - ContainerPort: uint16(ports[i].ContainerPort), - Protocol: ports[i].Protocol, - Range: 1, - } - } - } - newPorts = append(newPorts, currentPort) - return newPorts -} - -// compareOCICNIPorts will sort the ocicni ports by -// 1) host ip -// 2) protocol -// 3) hostPort -// 4) container port -func compareOCICNIPorts(i, j types.OCICNIPortMapping) bool { - if i.HostIP != j.HostIP { - return i.HostIP < j.HostIP - } - - if i.Protocol != j.Protocol { - return i.Protocol < j.Protocol - } - - if i.HostPort != j.HostPort { - return i.HostPort < j.HostPort - } - - return i.ContainerPort < j.ContainerPort -} diff --git a/libpod/networking_slirp4netns.go b/libpod/networking_slirp4netns.go index 4a6462d46..d4ec9082b 100644 --- a/libpod/networking_slirp4netns.go +++ b/libpod/networking_slirp4netns.go @@ -243,7 +243,7 @@ func (r *Runtime) setupSlirp4netns(ctr *Container, netns ns.NetNS) error { } slirpFeatures, err := checkSlirpFlags(path) if err != nil { - return fmt.Errorf("error checking slirp4netns binary %s: %q: %w", path, err, err) + return fmt.Errorf("checking slirp4netns binary %s: %q: %w", path, err, err) } cmdArgs, err := createBasicSlirp4netnsCmdArgs(netOptions, slirpFeatures) if err != nil { @@ -405,7 +405,7 @@ func GetSlirp4netnsIP(subnet *net.IPNet) (*net.IP, error) { } expectedIP, err := addToIP(slirpSubnet, uint32(100)) if err != nil { - return nil, fmt.Errorf("error calculating expected ip for slirp4netns: %w", err) + return nil, fmt.Errorf("calculating expected ip for slirp4netns: %w", err) } return expectedIP, nil } @@ -419,7 +419,7 @@ func GetSlirp4netnsGateway(subnet *net.IPNet) (*net.IP, error) { } expectedGatewayIP, err := addToIP(slirpSubnet, uint32(2)) if err != nil { - return nil, fmt.Errorf("error calculating expected gateway ip for slirp4netns: %w", err) + return nil, fmt.Errorf("calculating expected gateway ip for slirp4netns: %w", err) } return expectedGatewayIP, nil } @@ -433,7 +433,7 @@ func GetSlirp4netnsDNS(subnet *net.IPNet) (*net.IP, error) { } expectedDNSIP, err := addToIP(slirpSubnet, uint32(3)) if err != nil { - return nil, fmt.Errorf("error calculating expected dns ip for slirp4netns: %w", err) + return nil, fmt.Errorf("calculating expected dns ip for slirp4netns: %w", err) } return expectedDNSIP, nil } @@ -465,7 +465,7 @@ func waitForSync(syncR *os.File, cmd *exec.Cmd, logFile io.ReadSeeker, timeout t b := make([]byte, 16) for { if err := syncR.SetDeadline(time.Now().Add(timeout)); err != nil { - return fmt.Errorf("error setting %s pipe timeout: %w", prog, err) + return fmt.Errorf("setting %s pipe timeout: %w", prog, err) } // FIXME: return err as soon as proc exits, without waiting for timeout if _, err := syncR.Read(b); err == nil { @@ -676,7 +676,7 @@ func openSlirp4netnsPort(apiSocket, proto, hostip string, hostport, guestport ui // successful. var y map[string]interface{} if err := json.Unmarshal(buf[0:readLength], &y); err != nil { - return fmt.Errorf("error parsing error status from slirp4netns: %w", err) + return fmt.Errorf("parsing error status from slirp4netns: %w", err) } if e, found := y["error"]; found { return fmt.Errorf("from slirp4netns while setting up port redirection: %v", e) diff --git a/libpod/networking_unsupported.go b/libpod/networking_unsupported.go new file mode 100644 index 000000000..e5a6d1456 --- /dev/null +++ b/libpod/networking_unsupported.go @@ -0,0 +1,91 @@ +//go:build !linux && !freebsd +// +build !linux,!freebsd + +package libpod + +import ( + "errors" + "net" + "path/filepath" + + "github.com/containers/common/libnetwork/types" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/storage/pkg/lockfile" +) + +type RootlessNetNS struct { + dir string + Lock lockfile.Locker +} + +// ocicniPortsToNetTypesPorts convert the old port format to the new one +// while deduplicating ports into ranges +func ocicniPortsToNetTypesPorts(ports []types.OCICNIPortMapping) []types.PortMapping { + return []types.PortMapping{} +} + +func (c *Container) getContainerNetworkInfo() (*define.InspectNetworkSettings, error) { + return nil, errors.New("not implemented (*Container) getContainerNetworkInfo") +} + +func (c *Container) setupRootlessNetwork() error { + return errors.New("not implemented (*Container) setupRootlessNetwork") +} + +func (r *Runtime) setupNetNS(ctr *Container) error { + return errors.New("not implemented (*Runtime) setupNetNS") +} + +// normalizeNetworkName takes a network name, a partial or a full network ID and returns the network name. +// If the network is not found a errors is returned. +func (r *Runtime) normalizeNetworkName(nameOrID string) (string, error) { + return "", errors.New("not implemented (*Runtime) normalizeNetworkName") +} + +// DisconnectContainerFromNetwork removes a container from its CNI network +func (r *Runtime) DisconnectContainerFromNetwork(nameOrID, netName string, force bool) error { + return errors.New("not implemented (*Runtime) DisconnectContainerFromNetwork") +} + +// ConnectContainerToNetwork connects a container to a CNI network +func (r *Runtime) ConnectContainerToNetwork(nameOrID, netName string, netOpts types.PerNetworkOptions) error { + return errors.New("not implemented (*Runtime) ConnectContainerToNetwork") +} + +// getPath will join the given path to the rootless netns dir +func (r *RootlessNetNS) getPath(path string) string { + return filepath.Join(r.dir, path) +} + +// Do - run the given function in the rootless netns. +// It does not lock the rootlessCNI lock, the caller +// should only lock when needed, e.g. for cni operations. +func (r *RootlessNetNS) Do(toRun func() error) error { + return errors.New("not implemented (*RootlessNetNS) Do") +} + +// Cleanup the rootless network namespace if needed. +// It checks if we have running containers with the bridge network mode. +// Cleanup() expects that r.Lock is locked +func (r *RootlessNetNS) Cleanup(runtime *Runtime) error { + return errors.New("not implemented (*RootlessNetNS) Cleanup") +} + +// GetRootlessNetNs returns the rootless netns object. If create is set to true +// the rootless network namespace will be created if it does not exists already. +// If called as root it returns always nil. +// On success the returned RootlessCNI lock is locked and must be unlocked by the caller. +func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { + return nil, errors.New("not implemented (*Runtime) GetRootlessNetNs") +} + +// convertPortMappings will remove the HostIP part from the ports when running inside podman machine. +// This is need because a HostIP of 127.0.0.1 would now allow the gvproxy forwarder to reach to open ports. +// For machine the HostIP must only be used by gvproxy and never in the VM. +func (c *Container) convertPortMappings() []types.PortMapping { + return []types.PortMapping{} +} + +func GetSlirp4netnsIP(subnet *net.IPNet) (*net.IP, error) { + return nil, errors.New("not implemented GetSlirp4netnsIP") +} diff --git a/libpod/oci.go b/libpod/oci.go index 70053db1b..e5b9a0dcd 100644 --- a/libpod/oci.go +++ b/libpod/oci.go @@ -5,6 +5,7 @@ import ( "github.com/containers/common/pkg/resize" "github.com/containers/podman/v4/libpod/define" + "github.com/opencontainers/runtime-spec/specs-go" ) // OCIRuntime is an implementation of an OCI runtime. @@ -148,6 +149,9 @@ type OCIRuntime interface { // RuntimeInfo returns verbose information about the runtime. RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) + + // UpdateContainer updates the given container's cgroup configuration. + UpdateContainer(ctr *Container, res *specs.LinuxResources) error } // AttachOptions are options used when attached to a container or an exec diff --git a/libpod/oci_conmon_attach_common.go b/libpod/oci_conmon_attach_common.go new file mode 100644 index 000000000..dec749837 --- /dev/null +++ b/libpod/oci_conmon_attach_common.go @@ -0,0 +1,305 @@ +//go:build linux || freebsd +// +build linux freebsd + +package libpod + +import ( + "errors" + "fmt" + "io" + "net" + "os" + "path/filepath" + "syscall" + + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/resize" + "github.com/containers/common/pkg/util" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/pkg/errorhandling" + "github.com/moby/term" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +/* Sync with stdpipe_t in conmon.c */ +const ( + AttachPipeStdin = 1 + AttachPipeStdout = 2 + AttachPipeStderr = 3 +) + +// Attach to the given container. +// Does not check if state is appropriate. +// started is only required if startContainer is true. +func (r *ConmonOCIRuntime) Attach(c *Container, params *AttachOptions) error { + passthrough := c.LogDriver() == define.PassthroughLogging + + if params == nil || params.Streams == nil { + return fmt.Errorf("must provide parameters to Attach: %w", define.ErrInternal) + } + + if !params.Streams.AttachOutput && !params.Streams.AttachError && !params.Streams.AttachInput && !passthrough { + return fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) + } + if params.Start && params.Started == nil { + return fmt.Errorf("started chan not passed when startContainer set: %w", define.ErrInternal) + } + + keys := config.DefaultDetachKeys + if params.DetachKeys != nil { + keys = *params.DetachKeys + } + + detachKeys, err := processDetachKeys(keys) + if err != nil { + return err + } + + var conn *net.UnixConn + if !passthrough { + logrus.Debugf("Attaching to container %s", c.ID()) + + // If we have a resize, do it. + if params.InitialSize != nil { + if err := r.AttachResize(c, *params.InitialSize); err != nil { + return err + } + } + + attachSock, err := c.AttachSocketPath() + if err != nil { + return err + } + + conn, err = openUnixSocket(attachSock) + if err != nil { + return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err) + } + defer func() { + if err := conn.Close(); err != nil { + logrus.Errorf("unable to close socket: %q", err) + } + }() + } + + // If starting was requested, start the container and notify when that's + // done. + if params.Start { + if err := c.start(); err != nil { + return err + } + params.Started <- true + } + + if passthrough { + return nil + } + + receiveStdoutError, stdinDone := setupStdioChannels(params.Streams, conn, detachKeys) + if params.AttachReady != nil { + params.AttachReady <- true + } + return readStdio(conn, params.Streams, receiveStdoutError, stdinDone) +} + +// Attach to the given container's exec session +// attachFd and startFd must be open file descriptors +// attachFd must be the output side of the fd. attachFd is used for two things: +// conmon will first send a nonce value across the pipe indicating it has set up its side of the console socket +// this ensures attachToExec gets all of the output of the called process +// conmon will then send the exit code of the exec process, or an error in the exec session +// startFd must be the input side of the fd. +// newSize resizes the tty to this size before the process is started, must be nil if the exec session has no tty +// conmon will wait to start the exec session until the parent process has set up the console socket. +// Once attachToExec successfully attaches to the console socket, the child conmon process responsible for calling runtime exec +// will read from the output side of start fd, thus learning to start the child process. +// Thus, the order goes as follow: +// 1. conmon parent process sets up its console socket. sends on attachFd +// 2. attachToExec attaches to the console socket after reading on attachFd and resizes the tty +// 3. child waits on startFd for attachToExec to attach to said console socket +// 4. attachToExec sends on startFd, signalling it has attached to the socket and child is ready to go +// 5. child receives on startFd, runs the runtime exec command +// attachToExec is responsible for closing startFd and attachFd +func (c *Container) attachToExec(streams *define.AttachStreams, keys *string, sessionID string, startFd, attachFd *os.File, newSize *resize.TerminalSize) error { + if !streams.AttachOutput && !streams.AttachError && !streams.AttachInput { + return fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) + } + if startFd == nil || attachFd == nil { + return fmt.Errorf("start sync pipe and attach sync pipe must be defined for exec attach: %w", define.ErrInvalidArg) + } + + defer errorhandling.CloseQuiet(startFd) + defer errorhandling.CloseQuiet(attachFd) + + detachString := config.DefaultDetachKeys + if keys != nil { + detachString = *keys + } + detachKeys, err := processDetachKeys(detachString) + if err != nil { + return err + } + + logrus.Debugf("Attaching to container %s exec session %s", c.ID(), sessionID) + + // set up the socket path, such that it is the correct length and location for exec + sockPath, err := c.execAttachSocketPath(sessionID) + if err != nil { + return err + } + + // 2: read from attachFd that the parent process has set up the console socket + if _, err := readConmonPipeData(c.ociRuntime.Name(), attachFd, ""); err != nil { + return err + } + + // resize before we start the container process + if newSize != nil { + err = c.ociRuntime.ExecAttachResize(c, sessionID, *newSize) + if err != nil { + logrus.Warnf("Resize failed: %v", err) + } + } + + // 2: then attach + conn, err := openUnixSocket(sockPath) + if err != nil { + return fmt.Errorf("failed to connect to container's attach socket: %v: %w", sockPath, err) + } + defer func() { + if err := conn.Close(); err != nil { + logrus.Errorf("Unable to close socket: %q", err) + } + }() + + // start listening on stdio of the process + receiveStdoutError, stdinDone := setupStdioChannels(streams, conn, detachKeys) + + // 4: send start message to child + if err := writeConmonPipeData(startFd); err != nil { + return err + } + + return readStdio(conn, streams, receiveStdoutError, stdinDone) +} + +func processDetachKeys(keys string) ([]byte, error) { + // Check the validity of the provided keys first + if len(keys) == 0 { + return []byte{}, nil + } + detachKeys, err := term.ToBytes(keys) + if err != nil { + return nil, fmt.Errorf("invalid detach keys: %w", err) + } + return detachKeys, nil +} + +func registerResizeFunc(r <-chan resize.TerminalSize, bundlePath string) { + resize.HandleResizing(r, func(size resize.TerminalSize) { + controlPath := filepath.Join(bundlePath, "ctl") + controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY, 0) + if err != nil { + logrus.Debugf("Could not open ctl file: %v", err) + return + } + defer controlFile.Close() + + logrus.Debugf("Received a resize event: %+v", size) + if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, size.Height, size.Width); err != nil { + logrus.Warnf("Failed to write to control file to resize terminal: %v", err) + } + }) +} + +func setupStdioChannels(streams *define.AttachStreams, conn *net.UnixConn, detachKeys []byte) (chan error, chan error) { + receiveStdoutError := make(chan error) + go func() { + receiveStdoutError <- redirectResponseToOutputStreams(streams.OutputStream, streams.ErrorStream, streams.AttachOutput, streams.AttachError, conn) + }() + + stdinDone := make(chan error) + go func() { + var err error + if streams.AttachInput { + _, err = util.CopyDetachable(conn, streams.InputStream, detachKeys) + } + stdinDone <- err + }() + + return receiveStdoutError, stdinDone +} + +func redirectResponseToOutputStreams(outputStream, errorStream io.Writer, writeOutput, writeError bool, conn io.Reader) error { + var err error + buf := make([]byte, 8192+1) /* Sync with conmon STDIO_BUF_SIZE */ + for { + nr, er := conn.Read(buf) + if nr > 0 { + var dst io.Writer + var doWrite bool + switch buf[0] { + case AttachPipeStdout: + dst = outputStream + doWrite = writeOutput + case AttachPipeStderr: + dst = errorStream + doWrite = writeError + default: + logrus.Infof("Received unexpected attach type %+d", buf[0]) + } + if dst == nil { + return errors.New("output destination cannot be nil") + } + + if doWrite { + nw, ew := dst.Write(buf[1:nr]) + if ew != nil { + err = ew + break + } + if nr != nw+1 { + err = io.ErrShortWrite + break + } + } + } + if errors.Is(er, io.EOF) || errors.Is(er, syscall.ECONNRESET) { + break + } + if er != nil { + err = er + break + } + } + return err +} + +func readStdio(conn *net.UnixConn, streams *define.AttachStreams, receiveStdoutError, stdinDone chan error) error { + var err error + select { + case err = <-receiveStdoutError: + if err := socketCloseWrite(conn); err != nil { + logrus.Errorf("Failed to close stdin: %v", err) + } + return err + case err = <-stdinDone: + if err == define.ErrDetach { + if err := socketCloseWrite(conn); err != nil { + logrus.Errorf("Failed to close stdin: %v", err) + } + return err + } + if err == nil { + // copy stdin is done, close it + if connErr := socketCloseWrite(conn); connErr != nil { + logrus.Errorf("Unable to close conn: %v", connErr) + } + } + if streams.AttachOutput || streams.AttachError { + return <-receiveStdoutError + } + } + return nil +} diff --git a/libpod/oci_conmon_attach_freebsd.go b/libpod/oci_conmon_attach_freebsd.go new file mode 100644 index 000000000..de0054381 --- /dev/null +++ b/libpod/oci_conmon_attach_freebsd.go @@ -0,0 +1,21 @@ +package libpod + +import ( + "net" + "os" + "path/filepath" +) + +func openUnixSocket(path string) (*net.UnixConn, error) { + // socket paths can be too long to fit into a sockaddr_un so we create a shorter symlink. + tmpdir, err := os.MkdirTemp("", "podman") + if err != nil { + return nil, err + } + defer os.RemoveAll(tmpdir) + tmpsockpath := filepath.Join(tmpdir, "sock") + if err := os.Symlink(path, tmpsockpath); err != nil { + return nil, err + } + return net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: tmpsockpath, Net: "unixpacket"}) +} diff --git a/libpod/oci_conmon_attach_linux.go b/libpod/oci_conmon_attach_linux.go index aa55aa6f5..f1aa89d3e 100644 --- a/libpod/oci_conmon_attach_linux.go +++ b/libpod/oci_conmon_attach_linux.go @@ -1,34 +1,12 @@ -//go:build linux -// +build linux - package libpod import ( - "errors" "fmt" - "io" "net" - "os" - "path/filepath" - "syscall" - "github.com/containers/common/pkg/config" - "github.com/containers/common/pkg/resize" - "github.com/containers/common/pkg/util" - "github.com/containers/podman/v4/libpod/define" - "github.com/containers/podman/v4/pkg/errorhandling" - "github.com/moby/term" - "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) -/* Sync with stdpipe_t in conmon.c */ -const ( - AttachPipeStdin = 1 - AttachPipeStdout = 2 - AttachPipeStderr = 3 -) - func openUnixSocket(path string) (*net.UnixConn, error) { fd, err := unix.Open(path, unix.O_PATH, 0) if err != nil { @@ -37,278 +15,3 @@ func openUnixSocket(path string) (*net.UnixConn, error) { defer unix.Close(fd) return net.DialUnix("unixpacket", nil, &net.UnixAddr{Name: fmt.Sprintf("/proc/self/fd/%d", fd), Net: "unixpacket"}) } - -// Attach to the given container. -// Does not check if state is appropriate. -// started is only required if startContainer is true. -func (r *ConmonOCIRuntime) Attach(c *Container, params *AttachOptions) error { - passthrough := c.LogDriver() == define.PassthroughLogging - - if params == nil || params.Streams == nil { - return fmt.Errorf("must provide parameters to Attach: %w", define.ErrInternal) - } - - if !params.Streams.AttachOutput && !params.Streams.AttachError && !params.Streams.AttachInput && !passthrough { - return fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) - } - if params.Start && params.Started == nil { - return fmt.Errorf("started chan not passed when startContainer set: %w", define.ErrInternal) - } - - keys := config.DefaultDetachKeys - if params.DetachKeys != nil { - keys = *params.DetachKeys - } - - detachKeys, err := processDetachKeys(keys) - if err != nil { - return err - } - - var conn *net.UnixConn - if !passthrough { - logrus.Debugf("Attaching to container %s", c.ID()) - - // If we have a resize, do it. - if params.InitialSize != nil { - if err := r.AttachResize(c, *params.InitialSize); err != nil { - return err - } - } - - attachSock, err := c.AttachSocketPath() - if err != nil { - return err - } - - conn, err = openUnixSocket(attachSock) - if err != nil { - return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err) - } - defer func() { - if err := conn.Close(); err != nil { - logrus.Errorf("unable to close socket: %q", err) - } - }() - } - - // If starting was requested, start the container and notify when that's - // done. - if params.Start { - if err := c.start(); err != nil { - return err - } - params.Started <- true - } - - if passthrough { - return nil - } - - receiveStdoutError, stdinDone := setupStdioChannels(params.Streams, conn, detachKeys) - if params.AttachReady != nil { - params.AttachReady <- true - } - return readStdio(conn, params.Streams, receiveStdoutError, stdinDone) -} - -// Attach to the given container's exec session -// attachFd and startFd must be open file descriptors -// attachFd must be the output side of the fd. attachFd is used for two things: -// conmon will first send a nonce value across the pipe indicating it has set up its side of the console socket -// this ensures attachToExec gets all of the output of the called process -// conmon will then send the exit code of the exec process, or an error in the exec session -// startFd must be the input side of the fd. -// newSize resizes the tty to this size before the process is started, must be nil if the exec session has no tty -// conmon will wait to start the exec session until the parent process has set up the console socket. -// Once attachToExec successfully attaches to the console socket, the child conmon process responsible for calling runtime exec -// will read from the output side of start fd, thus learning to start the child process. -// Thus, the order goes as follow: -// 1. conmon parent process sets up its console socket. sends on attachFd -// 2. attachToExec attaches to the console socket after reading on attachFd and resizes the tty -// 3. child waits on startFd for attachToExec to attach to said console socket -// 4. attachToExec sends on startFd, signalling it has attached to the socket and child is ready to go -// 5. child receives on startFd, runs the runtime exec command -// attachToExec is responsible for closing startFd and attachFd -func (c *Container) attachToExec(streams *define.AttachStreams, keys *string, sessionID string, startFd, attachFd *os.File, newSize *resize.TerminalSize) error { - if !streams.AttachOutput && !streams.AttachError && !streams.AttachInput { - return fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) - } - if startFd == nil || attachFd == nil { - return fmt.Errorf("start sync pipe and attach sync pipe must be defined for exec attach: %w", define.ErrInvalidArg) - } - - defer errorhandling.CloseQuiet(startFd) - defer errorhandling.CloseQuiet(attachFd) - - detachString := config.DefaultDetachKeys - if keys != nil { - detachString = *keys - } - detachKeys, err := processDetachKeys(detachString) - if err != nil { - return err - } - - logrus.Debugf("Attaching to container %s exec session %s", c.ID(), sessionID) - - // set up the socket path, such that it is the correct length and location for exec - sockPath, err := c.execAttachSocketPath(sessionID) - if err != nil { - return err - } - - // 2: read from attachFd that the parent process has set up the console socket - if _, err := readConmonPipeData(c.ociRuntime.Name(), attachFd, ""); err != nil { - return err - } - - // resize before we start the container process - if newSize != nil { - err = c.ociRuntime.ExecAttachResize(c, sessionID, *newSize) - if err != nil { - logrus.Warnf("Resize failed: %v", err) - } - } - - // 2: then attach - conn, err := openUnixSocket(sockPath) - if err != nil { - return fmt.Errorf("failed to connect to container's attach socket: %v: %w", sockPath, err) - } - defer func() { - if err := conn.Close(); err != nil { - logrus.Errorf("Unable to close socket: %q", err) - } - }() - - // start listening on stdio of the process - receiveStdoutError, stdinDone := setupStdioChannels(streams, conn, detachKeys) - - // 4: send start message to child - if err := writeConmonPipeData(startFd); err != nil { - return err - } - - return readStdio(conn, streams, receiveStdoutError, stdinDone) -} - -func processDetachKeys(keys string) ([]byte, error) { - // Check the validity of the provided keys first - if len(keys) == 0 { - return []byte{}, nil - } - detachKeys, err := term.ToBytes(keys) - if err != nil { - return nil, fmt.Errorf("invalid detach keys: %w", err) - } - return detachKeys, nil -} - -func registerResizeFunc(r <-chan resize.TerminalSize, bundlePath string) { - resize.HandleResizing(r, func(size resize.TerminalSize) { - controlPath := filepath.Join(bundlePath, "ctl") - controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY, 0) - if err != nil { - logrus.Debugf("Could not open ctl file: %v", err) - return - } - defer controlFile.Close() - - logrus.Debugf("Received a resize event: %+v", size) - if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, size.Height, size.Width); err != nil { - logrus.Warnf("Failed to write to control file to resize terminal: %v", err) - } - }) -} - -func setupStdioChannels(streams *define.AttachStreams, conn *net.UnixConn, detachKeys []byte) (chan error, chan error) { - receiveStdoutError := make(chan error) - go func() { - receiveStdoutError <- redirectResponseToOutputStreams(streams.OutputStream, streams.ErrorStream, streams.AttachOutput, streams.AttachError, conn) - }() - - stdinDone := make(chan error) - go func() { - var err error - if streams.AttachInput { - _, err = util.CopyDetachable(conn, streams.InputStream, detachKeys) - } - stdinDone <- err - }() - - return receiveStdoutError, stdinDone -} - -func redirectResponseToOutputStreams(outputStream, errorStream io.Writer, writeOutput, writeError bool, conn io.Reader) error { - var err error - buf := make([]byte, 8192+1) /* Sync with conmon STDIO_BUF_SIZE */ - for { - nr, er := conn.Read(buf) - if nr > 0 { - var dst io.Writer - var doWrite bool - switch buf[0] { - case AttachPipeStdout: - dst = outputStream - doWrite = writeOutput - case AttachPipeStderr: - dst = errorStream - doWrite = writeError - default: - logrus.Infof("Received unexpected attach type %+d", buf[0]) - } - if dst == nil { - return errors.New("output destination cannot be nil") - } - - if doWrite { - nw, ew := dst.Write(buf[1:nr]) - if ew != nil { - err = ew - break - } - if nr != nw+1 { - err = io.ErrShortWrite - break - } - } - } - if errors.Is(er, io.EOF) || errors.Is(er, syscall.ECONNRESET) { - break - } - if er != nil { - err = er - break - } - } - return err -} - -func readStdio(conn *net.UnixConn, streams *define.AttachStreams, receiveStdoutError, stdinDone chan error) error { - var err error - select { - case err = <-receiveStdoutError: - if err := conn.CloseWrite(); err != nil { - logrus.Errorf("Failed to close stdin: %v", err) - } - return err - case err = <-stdinDone: - if err == define.ErrDetach { - if err := conn.CloseWrite(); err != nil { - logrus.Errorf("Failed to close stdin: %v", err) - } - return err - } - if err == nil { - // copy stdin is done, close it - if connErr := conn.CloseWrite(); connErr != nil { - logrus.Errorf("Unable to close conn: %v", connErr) - } - } - if streams.AttachOutput || streams.AttachError { - return <-receiveStdoutError - } - } - return nil -} diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go new file mode 100644 index 000000000..53dddd064 --- /dev/null +++ b/libpod/oci_conmon_common.go @@ -0,0 +1,1650 @@ +//go:build linux || freebsd +// +build linux freebsd + +package libpod + +import ( + "bufio" + "bytes" + "context" + "errors" + "fmt" + "io" + "io/ioutil" + "net" + "net/http" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + "text/template" + "time" + + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/resize" + cutil "github.com/containers/common/pkg/util" + conmonConfig "github.com/containers/conmon/runner/config" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/libpod/logs" + "github.com/containers/podman/v4/pkg/checkpoint/crutils" + "github.com/containers/podman/v4/pkg/errorhandling" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/podman/v4/pkg/specgenutil" + "github.com/containers/podman/v4/pkg/util" + "github.com/containers/podman/v4/utils" + "github.com/containers/storage/pkg/homedir" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + // This is Conmon's STDIO_BUF_SIZE. I don't believe we have access to it + // directly from the Go code, so const it here + // Important: The conmon attach socket uses an extra byte at the beginning of each + // message to specify the STREAM so we have to increase the buffer size by one + bufferSize = conmonConfig.BufSize + 1 +) + +// ConmonOCIRuntime is an OCI runtime managed by Conmon. +// TODO: Make all calls to OCI runtime have a timeout. +type ConmonOCIRuntime struct { + name string + path string + conmonPath string + conmonEnv []string + tmpDir string + exitsDir string + logSizeMax int64 + noPivot bool + reservePorts bool + runtimeFlags []string + supportsJSON bool + supportsKVM bool + supportsNoCgroups bool + enableKeyring bool +} + +// Make a new Conmon-based OCI runtime with the given options. +// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or +// any runtime with a runc-compatible CLI. +// The first path that points to a valid executable will be used. +// Deliberately private. Someone should not be able to construct this outside of +// libpod. +func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeFlags []string, runtimeCfg *config.Config) (OCIRuntime, error) { + if name == "" { + return nil, fmt.Errorf("the OCI runtime must be provided a non-empty name: %w", define.ErrInvalidArg) + } + + // Make lookup tables for runtime support + supportsJSON := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsJSON)) + supportsNoCgroups := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsNoCgroups)) + supportsKVM := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsKVM)) + for _, r := range runtimeCfg.Engine.RuntimeSupportsJSON { + supportsJSON[r] = true + } + for _, r := range runtimeCfg.Engine.RuntimeSupportsNoCgroups { + supportsNoCgroups[r] = true + } + for _, r := range runtimeCfg.Engine.RuntimeSupportsKVM { + supportsKVM[r] = true + } + + runtime := new(ConmonOCIRuntime) + runtime.name = name + runtime.conmonPath = conmonPath + runtime.runtimeFlags = runtimeFlags + + runtime.conmonEnv = runtimeCfg.Engine.ConmonEnvVars + runtime.tmpDir = runtimeCfg.Engine.TmpDir + runtime.logSizeMax = runtimeCfg.Containers.LogSizeMax + runtime.noPivot = runtimeCfg.Engine.NoPivotRoot + runtime.reservePorts = runtimeCfg.Engine.EnablePortReservation + runtime.enableKeyring = runtimeCfg.Containers.EnableKeyring + + // TODO: probe OCI runtime for feature and enable automatically if + // available. + + base := filepath.Base(name) + runtime.supportsJSON = supportsJSON[base] + runtime.supportsNoCgroups = supportsNoCgroups[base] + runtime.supportsKVM = supportsKVM[base] + + foundPath := false + for _, path := range paths { + stat, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + continue + } + return nil, fmt.Errorf("cannot stat OCI runtime %s path: %w", name, err) + } + if !stat.Mode().IsRegular() { + continue + } + foundPath = true + logrus.Tracef("found runtime %q", path) + runtime.path = path + break + } + + // Search the $PATH as last fallback + if !foundPath { + if foundRuntime, err := exec.LookPath(name); err == nil { + foundPath = true + runtime.path = foundRuntime + logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime) + } + } + + if !foundPath { + return nil, fmt.Errorf("no valid executable found for OCI runtime %s: %w", name, define.ErrInvalidArg) + } + + runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits") + + // Create the exit files and attach sockets directories + if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil { + // The directory is allowed to exist + if !os.IsExist(err) { + return nil, fmt.Errorf("creating OCI runtime exit files directory: %w", err) + } + } + return runtime, nil +} + +// Name returns the name of the runtime being wrapped by Conmon. +func (r *ConmonOCIRuntime) Name() string { + return r.name +} + +// Path returns the path of the OCI runtime being wrapped by Conmon. +func (r *ConmonOCIRuntime) Path() string { + return r.path +} + +// hasCurrentUserMapped checks whether the current user is mapped inside the container user namespace +func hasCurrentUserMapped(ctr *Container) bool { + if len(ctr.config.IDMappings.UIDMap) == 0 && len(ctr.config.IDMappings.GIDMap) == 0 { + return true + } + uid := os.Geteuid() + for _, m := range ctr.config.IDMappings.UIDMap { + if uid >= m.HostID && uid < m.HostID+m.Size { + return true + } + } + return false +} + +// CreateContainer creates a container. +func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + // always make the run dir accessible to the current user so that the PID files can be read without + // being in the rootless user namespace. + if err := makeAccessible(ctr.state.RunDir, 0, 0); err != nil { + return 0, err + } + if !hasCurrentUserMapped(ctr) { + for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.Engine.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.Engine.VolumePath} { + if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { + return 0, err + } + } + + // if we are running a non privileged container, be sure to umount some kernel paths so they are not + // bind mounted inside the container at all. + if !ctr.config.Privileged && !rootless.IsRootless() { + return r.createRootlessContainer(ctr, restoreOptions) + } + } + return r.createOCIContainer(ctr, restoreOptions) +} + +// UpdateContainerStatus retrieves the current status of the container from the +// runtime. It updates the container's state but does not save it. +// If useRuntime is false, we will not directly hit runc to see the container's +// status, but will instead only check for the existence of the conmon exit file +// and update state to stopped if it exists. +func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + + // Store old state so we know if we were already stopped + oldState := ctr.state.State + + state := new(spec.State) + + cmd := exec.Command(r.path, "state", ctr.ID()) + cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) + + outPipe, err := cmd.StdoutPipe() + if err != nil { + return fmt.Errorf("getting stdout pipe: %w", err) + } + errPipe, err := cmd.StderrPipe() + if err != nil { + return fmt.Errorf("getting stderr pipe: %w", err) + } + + if err := cmd.Start(); err != nil { + out, err2 := ioutil.ReadAll(errPipe) + if err2 != nil { + return fmt.Errorf("getting container %s state: %w", ctr.ID(), err) + } + if strings.Contains(string(out), "does not exist") || strings.Contains(string(out), "No such file") { + if err := ctr.removeConmonFiles(); err != nil { + logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) + } + ctr.state.ExitCode = -1 + ctr.state.FinishedTime = time.Now() + ctr.state.State = define.ContainerStateExited + return ctr.runtime.state.AddContainerExitCode(ctr.ID(), ctr.state.ExitCode) + } + return fmt.Errorf("getting container %s state. stderr/out: %s: %w", ctr.ID(), out, err) + } + defer func() { + _ = cmd.Wait() + }() + + if err := errPipe.Close(); err != nil { + return err + } + out, err := ioutil.ReadAll(outPipe) + if err != nil { + return fmt.Errorf("reading stdout: %s: %w", ctr.ID(), err) + } + if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil { + return fmt.Errorf("decoding container status for container %s: %w", ctr.ID(), err) + } + ctr.state.PID = state.Pid + + switch state.Status { + case "created": + ctr.state.State = define.ContainerStateCreated + case "paused": + ctr.state.State = define.ContainerStatePaused + case "running": + ctr.state.State = define.ContainerStateRunning + case "stopped": + ctr.state.State = define.ContainerStateStopped + default: + return fmt.Errorf("unrecognized status returned by runtime for container %s: %s: %w", + ctr.ID(), state.Status, define.ErrInternal) + } + + // Handle ContainerStateStopping - keep it unless the container + // transitioned to no longer running. + if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) { + ctr.state.State = define.ContainerStateStopping + } + + return nil +} + +// StartContainer starts the given container. +// Sets time the container was started, but does not save it. +func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { + // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers? + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + if path, ok := os.LookupEnv("PATH"); ok { + env = append(env, fmt.Sprintf("PATH=%s", path)) + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "start", ctr.ID())...); err != nil { + return err + } + + ctr.state.StartedTime = time.Now() + + return nil +} + +// UpdateContainer updates the given container's cgroup configuration +func (r *ConmonOCIRuntime) UpdateContainer(ctr *Container, resources *spec.LinuxResources) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + if path, ok := os.LookupEnv("PATH"); ok { + env = append(env, fmt.Sprintf("PATH=%s", path)) + } + args := r.runtimeFlags + args = append(args, "update") + tempFile, additionalArgs, err := generateResourceFile(resources) + if err != nil { + return err + } + defer os.Remove(tempFile) + + args = append(args, additionalArgs...) + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(args, ctr.ID())...) +} + +func generateResourceFile(res *spec.LinuxResources) (string, []string, error) { + flags := []string{} + if res == nil { + return "", flags, nil + } + + f, err := ioutil.TempFile("", "podman") + if err != nil { + return "", nil, err + } + + j, err := json.Marshal(res) + if err != nil { + return "", nil, err + } + _, err = f.WriteString(string(j)) + if err != nil { + return "", nil, err + } + + flags = append(flags, "--resources="+f.Name()) + return f.Name(), flags, nil +} + +// KillContainer sends the given signal to the given container. +// If all is set, send to all PIDs in the container. +// All is only supported if the container created cgroups. +func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error { + logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + var args []string + args = append(args, r.runtimeFlags...) + if all { + args = append(args, "kill", "--all", ctr.ID(), fmt.Sprintf("%d", signal)) + } else { + args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal)) + } + if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { + // Update container state - there's a chance we failed because + // the container exited in the meantime. + if err2 := r.UpdateContainerStatus(ctr); err2 != nil { + logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2) + } + if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { + return define.ErrCtrStateInvalid + } + return fmt.Errorf("sending signal to container %s: %w", ctr.ID(), err) + } + + return nil +} + +// StopContainer stops a container, first using its given stop signal (or +// SIGTERM if no signal was specified), then using SIGKILL. +// Timeout is given in seconds. If timeout is 0, the container will be +// immediately kill with SIGKILL. +// Does not set finished time for container, assumes you will run updateStatus +// after to pull the exit code. +func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error { + logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) + + // Ping the container to see if it's alive + // If it's not, it's already stopped, return + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + stopSignal := ctr.config.StopSignal + if stopSignal == 0 { + stopSignal = uint(syscall.SIGTERM) + } + + if timeout > 0 { + if err := r.KillContainer(ctr, stopSignal, all); err != nil { + // Is the container gone? + // If so, it probably died between the first check and + // our sending the signal + // The container is stopped, so exit cleanly + err := unix.Kill(ctr.state.PID, 0) + if err == unix.ESRCH { + return nil + } + + return err + } + + if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { + logrus.Debugf("Timed out stopping container %s with %s, resorting to SIGKILL: %v", ctr.ID(), unix.SignalName(syscall.Signal(stopSignal)), err) + logrus.Warnf("StopSignal %s failed to stop container %s in %d seconds, resorting to SIGKILL", unix.SignalName(syscall.Signal(stopSignal)), ctr.Name(), timeout) + } else { + // No error, the container is dead + return nil + } + } + + if err := r.KillContainer(ctr, uint(unix.SIGKILL), all); err != nil { + // Again, check if the container is gone. If it is, exit cleanly. + if aliveErr := unix.Kill(ctr.state.PID, 0); errors.Is(aliveErr, unix.ESRCH) { + return nil + } + return fmt.Errorf("sending SIGKILL to container %s: %w", ctr.ID(), err) + } + + // Give runtime a few seconds to make it happen + if err := waitContainerStop(ctr, killContainerTimeout); err != nil { + return err + } + + return nil +} + +// DeleteContainer deletes a container from the OCI runtime. +func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "delete", "--force", ctr.ID())...) +} + +// PauseContainer pauses the given container. +func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "pause", ctr.ID())...) +} + +// UnpauseContainer unpauses the given container. +func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error { + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "resume", ctr.ID())...) +} + +// This filters out ENOTCONN errors which can happen on FreeBSD if the +// other side of the connection is already closed. +func socketCloseWrite(conn *net.UnixConn) error { + err := conn.CloseWrite() + if err != nil && errors.Is(err, syscall.ENOTCONN) { + return nil + } + return err +} + +// HTTPAttach performs an attach for the HTTP API. +// The caller must handle closing the HTTP connection after this returns. +// The cancel channel is not closed; it is up to the caller to do so after +// this function returns. +// If this is a container with a terminal, we will stream raw. If it is not, we +// will stream with an 8-byte header to multiplex STDOUT and STDERR. +// Returns any errors that occurred, and whether the connection was successfully +// hijacked before that error occurred. +func (r *ConmonOCIRuntime) HTTPAttach(ctr *Container, req *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, detachKeys *string, cancel <-chan bool, hijackDone chan<- bool, streamAttach, streamLogs bool) (deferredErr error) { + isTerminal := false + if ctr.config.Spec.Process != nil { + isTerminal = ctr.config.Spec.Process.Terminal + } + + if streams != nil { + if !streams.Stdin && !streams.Stdout && !streams.Stderr { + return fmt.Errorf("must specify at least one stream to attach to: %w", define.ErrInvalidArg) + } + } + + attachSock, err := r.AttachSocketPath(ctr) + if err != nil { + return err + } + + var conn *net.UnixConn + if streamAttach { + newConn, err := openUnixSocket(attachSock) + if err != nil { + return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err) + } + conn = newConn + defer func() { + if err := conn.Close(); err != nil { + logrus.Errorf("Unable to close container %s attach socket: %q", ctr.ID(), err) + } + }() + + logrus.Debugf("Successfully connected to container %s attach socket %s", ctr.ID(), attachSock) + } + + detachString := ctr.runtime.config.Engine.DetachKeys + if detachKeys != nil { + detachString = *detachKeys + } + detach, err := processDetachKeys(detachString) + if err != nil { + return err + } + + attachStdout := true + attachStderr := true + attachStdin := true + if streams != nil { + attachStdout = streams.Stdout + attachStderr = streams.Stderr + attachStdin = streams.Stdin + } + + logrus.Debugf("Going to hijack container %s attach connection", ctr.ID()) + + // Alright, let's hijack. + hijacker, ok := w.(http.Hijacker) + if !ok { + return fmt.Errorf("unable to hijack connection") + } + + httpCon, httpBuf, err := hijacker.Hijack() + if err != nil { + return fmt.Errorf("hijacking connection: %w", err) + } + + hijackDone <- true + + writeHijackHeader(req, httpBuf) + + // Force a flush after the header is written. + if err := httpBuf.Flush(); err != nil { + return fmt.Errorf("flushing HTTP hijack header: %w", err) + } + + defer func() { + hijackWriteErrorAndClose(deferredErr, ctr.ID(), isTerminal, httpCon, httpBuf) + }() + + logrus.Debugf("Hijack for container %s attach session done, ready to stream", ctr.ID()) + + // TODO: This is gross. Really, really gross. + // I want to say we should read all the logs into an array before + // calling this, in container_api.go, but that could take a lot of + // memory... + // On the whole, we need to figure out a better way of doing this, + // though. + logSize := 0 + if streamLogs { + logrus.Debugf("Will stream logs for container %s attach session", ctr.ID()) + + // Get all logs for the container + logChan := make(chan *logs.LogLine) + logOpts := new(logs.LogOptions) + logOpts.Tail = -1 + logOpts.WaitGroup = new(sync.WaitGroup) + errChan := make(chan error) + go func() { + var err error + // In non-terminal mode we need to prepend with the + // stream header. + logrus.Debugf("Writing logs for container %s to HTTP attach", ctr.ID()) + for logLine := range logChan { + if !isTerminal { + device := logLine.Device + var header []byte + headerLen := uint32(len(logLine.Msg)) + logSize += len(logLine.Msg) + switch strings.ToLower(device) { + case "stdin": + header = makeHTTPAttachHeader(0, headerLen) + case "stdout": + header = makeHTTPAttachHeader(1, headerLen) + case "stderr": + header = makeHTTPAttachHeader(2, headerLen) + default: + logrus.Errorf("Unknown device for log line: %s", device) + header = makeHTTPAttachHeader(1, headerLen) + } + _, err = httpBuf.Write(header) + if err != nil { + break + } + } + _, err = httpBuf.Write([]byte(logLine.Msg)) + if err != nil { + break + } + if !logLine.Partial() { + _, err = httpBuf.Write([]byte("\n")) + if err != nil { + break + } + } + err = httpBuf.Flush() + if err != nil { + break + } + } + errChan <- err + }() + if err := ctr.ReadLog(context.Background(), logOpts, logChan, 0); err != nil { + return err + } + go func() { + logOpts.WaitGroup.Wait() + close(logChan) + }() + logrus.Debugf("Done reading logs for container %s, %d bytes", ctr.ID(), logSize) + if err := <-errChan; err != nil { + return err + } + } + if !streamAttach { + logrus.Debugf("Done streaming logs for container %s attach, exiting as attach streaming not requested", ctr.ID()) + return nil + } + + logrus.Debugf("Forwarding attach output for container %s", ctr.ID()) + + stdoutChan := make(chan error) + stdinChan := make(chan error) + + // Handle STDOUT/STDERR + go func() { + var err error + if isTerminal { + // Hack: return immediately if attachStdout not set to + // emulate Docker. + // Basically, when terminal is set, STDERR goes nowhere. + // Everything does over STDOUT. + // Therefore, if not attaching STDOUT - we'll never copy + // anything from here. + logrus.Debugf("Performing terminal HTTP attach for container %s", ctr.ID()) + if attachStdout { + err = httpAttachTerminalCopy(conn, httpBuf, ctr.ID()) + } + } else { + logrus.Debugf("Performing non-terminal HTTP attach for container %s", ctr.ID()) + err = httpAttachNonTerminalCopy(conn, httpBuf, ctr.ID(), attachStdin, attachStdout, attachStderr) + } + stdoutChan <- err + logrus.Debugf("STDOUT/ERR copy completed") + }() + // Next, STDIN. Avoid entirely if attachStdin unset. + if attachStdin { + go func() { + _, err := cutil.CopyDetachable(conn, httpBuf, detach) + logrus.Debugf("STDIN copy completed") + stdinChan <- err + }() + } + + for { + select { + case err := <-stdoutChan: + if err != nil { + return err + } + + return nil + case err := <-stdinChan: + if err != nil { + return err + } + // copy stdin is done, close it + if connErr := socketCloseWrite(conn); connErr != nil { + logrus.Errorf("Unable to close conn: %v", connErr) + } + case <-cancel: + return nil + } + } +} + +// isRetryable returns whether the error was caused by a blocked syscall or the +// specified operation on a non blocking file descriptor wasn't ready for completion. +func isRetryable(err error) bool { + var errno syscall.Errno + if errors.As(err, &errno) { + return errno == syscall.EINTR || errno == syscall.EAGAIN + } + return false +} + +// openControlFile opens the terminal control file. +func openControlFile(ctr *Container, parentDir string) (*os.File, error) { + controlPath := filepath.Join(parentDir, "ctl") + for i := 0; i < 600; i++ { + controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY|unix.O_NONBLOCK, 0) + if err == nil { + return controlFile, nil + } + if !isRetryable(err) { + return nil, fmt.Errorf("could not open ctl file for terminal resize for container %s: %w", ctr.ID(), err) + } + time.Sleep(time.Second / 10) + } + return nil, fmt.Errorf("timeout waiting for %q", controlPath) +} + +// AttachResize resizes the terminal used by the given container. +func (r *ConmonOCIRuntime) AttachResize(ctr *Container, newSize resize.TerminalSize) error { + controlFile, err := openControlFile(ctr, ctr.bundlePath()) + if err != nil { + return err + } + defer controlFile.Close() + + logrus.Debugf("Received a resize event for container %s: %+v", ctr.ID(), newSize) + if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil { + return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err) + } + + return nil +} + +// CheckpointContainer checkpoints the given container. +func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error) { + // imagePath is used by CRIU to store the actual checkpoint files + imagePath := ctr.CheckpointPath() + if options.PreCheckPoint { + imagePath = ctr.PreCheckPointPath() + } + // workPath will be used to store dump.log and stats-dump + workPath := ctr.bundlePath() + logrus.Debugf("Writing checkpoint to %s", imagePath) + logrus.Debugf("Writing checkpoint logs to %s", workPath) + logrus.Debugf("Pre-dump the container %t", options.PreCheckPoint) + args := []string{} + args = append(args, r.runtimeFlags...) + args = append(args, "checkpoint") + args = append(args, "--image-path") + args = append(args, imagePath) + args = append(args, "--work-path") + args = append(args, workPath) + if options.KeepRunning { + args = append(args, "--leave-running") + } + if options.TCPEstablished { + args = append(args, "--tcp-established") + } + if options.FileLocks { + args = append(args, "--file-locks") + } + if !options.PreCheckPoint && options.KeepRunning { + args = append(args, "--leave-running") + } + if options.PreCheckPoint { + args = append(args, "--pre-dump") + } + if !options.PreCheckPoint && options.WithPrevious { + args = append( + args, + "--parent-path", + filepath.Join("..", preCheckpointDir), + ) + } + + args = append(args, ctr.ID()) + logrus.Debugf("the args to checkpoint: %s %s", r.path, strings.Join(args, " ")) + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return 0, err + } + env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} + if path, ok := os.LookupEnv("PATH"); ok { + env = append(env, fmt.Sprintf("PATH=%s", path)) + } + + var runtimeCheckpointStarted time.Time + err = r.withContainerSocketLabel(ctr, func() error { + runtimeCheckpointStarted = time.Now() + return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...) + }) + + runtimeCheckpointDuration := func() int64 { + if options.PrintStats { + return time.Since(runtimeCheckpointStarted).Microseconds() + } + return 0 + }() + + return runtimeCheckpointDuration, err +} + +func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) { + if ctr.state.ConmonPID == 0 { + // If the container is running or paused, assume Conmon is + // running. We didn't record Conmon PID on some old versions, so + // that is likely what's going on... + // Unusual enough that we should print a warning message though. + if ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) { + logrus.Warnf("Conmon PID is not set, but container is running!") + return true, nil + } + // Container's not running, so conmon PID being unset is + // expected. Conmon is not running. + return false, nil + } + + // We have a conmon PID. Ping it with signal 0. + if err := unix.Kill(ctr.state.ConmonPID, 0); err != nil { + if err == unix.ESRCH { + return false, nil + } + return false, fmt.Errorf("pinging container %s conmon with signal 0: %w", ctr.ID(), err) + } + return true, nil +} + +// SupportsCheckpoint checks if the OCI runtime supports checkpointing +// containers. +func (r *ConmonOCIRuntime) SupportsCheckpoint() bool { + return crutils.CRRuntimeSupportsCheckpointRestore(r.path) +} + +// SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error +// messages. +func (r *ConmonOCIRuntime) SupportsJSONErrors() bool { + return r.supportsJSON +} + +// SupportsNoCgroups checks if the OCI runtime supports running containers +// without cgroups (the --cgroup-manager=disabled flag). +func (r *ConmonOCIRuntime) SupportsNoCgroups() bool { + return r.supportsNoCgroups +} + +// SupportsKVM checks if the OCI runtime supports running containers +// without KVM separation +func (r *ConmonOCIRuntime) SupportsKVM() bool { + return r.supportsKVM +} + +// AttachSocketPath is the path to a single container's attach socket. +func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) { + if ctr == nil { + return "", fmt.Errorf("must provide a valid container to get attach socket path: %w", define.ErrInvalidArg) + } + + return filepath.Join(ctr.bundlePath(), "attach"), nil +} + +// ExitFilePath is the path to a container's exit file. +func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) { + if ctr == nil { + return "", fmt.Errorf("must provide a valid container to get exit file path: %w", define.ErrInvalidArg) + } + return filepath.Join(r.exitsDir, ctr.ID()), nil +} + +// RuntimeInfo provides information on the runtime. +func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) { + runtimePackage := packageVersion(r.path) + conmonPackage := packageVersion(r.conmonPath) + runtimeVersion, err := r.getOCIRuntimeVersion() + if err != nil { + return nil, nil, fmt.Errorf("getting version of OCI runtime %s: %w", r.name, err) + } + conmonVersion, err := r.getConmonVersion() + if err != nil { + return nil, nil, fmt.Errorf("getting conmon version: %w", err) + } + + conmon := define.ConmonInfo{ + Package: conmonPackage, + Path: r.conmonPath, + Version: conmonVersion, + } + ocirt := define.OCIRuntimeInfo{ + Name: r.name, + Path: r.path, + Package: runtimePackage, + Version: runtimeVersion, + } + return &conmon, &ocirt, nil +} + +// makeAccessible changes the path permission and each parent directory to have --x--x--x +func makeAccessible(path string, uid, gid int) error { + for ; path != "/"; path = filepath.Dir(path) { + st, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { + continue + } + if st.Mode()&0111 != 0111 { + if err := os.Chmod(path, st.Mode()|0111); err != nil { + return err + } + } + } + return nil +} + +// Wait for a container which has been sent a signal to stop +func waitContainerStop(ctr *Container, timeout time.Duration) error { + return waitPidStop(ctr.state.PID, timeout) +} + +// Wait for a given PID to stop +func waitPidStop(pid int, timeout time.Duration) error { + done := make(chan struct{}) + chControl := make(chan struct{}) + go func() { + for { + select { + case <-chControl: + return + default: + if err := unix.Kill(pid, 0); err != nil { + if err == unix.ESRCH { + close(done) + return + } + logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err) + } + time.Sleep(100 * time.Millisecond) + } + } + }() + select { + case <-done: + return nil + case <-time.After(timeout): + close(chControl) + return fmt.Errorf("given PIDs did not die within timeout") + } +} + +func (r *ConmonOCIRuntime) getLogTag(ctr *Container) (string, error) { + logTag := ctr.LogTag() + if logTag == "" { + return "", nil + } + data, err := ctr.inspectLocked(false) + if err != nil { + // FIXME: this error should probably be returned + return "", nil //nolint: nilerr + } + tmpl, err := template.New("container").Parse(logTag) + if err != nil { + return "", fmt.Errorf("template parsing error %s: %w", logTag, err) + } + var b bytes.Buffer + err = tmpl.Execute(&b, data) + if err != nil { + return "", err + } + return b.String(), nil +} + +// createOCIContainer generates this container's main conmon instance and prepares it for starting +func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + var stderrBuf bytes.Buffer + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return 0, err + } + + parentSyncPipe, childSyncPipe, err := newPipe() + if err != nil { + return 0, fmt.Errorf("creating socket pair: %w", err) + } + defer errorhandling.CloseQuiet(parentSyncPipe) + + childStartPipe, parentStartPipe, err := newPipe() + if err != nil { + return 0, fmt.Errorf("creating socket pair for start pipe: %w", err) + } + + defer errorhandling.CloseQuiet(parentStartPipe) + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = filepath.Join(ctr.state.RunDir, "oci-log") + } + + logTag, err := r.getLogTag(ctr) + if err != nil { + return 0, err + } + + if ctr.config.CgroupsMode == cgroupSplit { + if err := utils.MoveUnderCgroupSubtree("runtime"); err != nil { + return 0, err + } + } + + pidfile := ctr.config.PidFile + if pidfile == "" { + pidfile = filepath.Join(ctr.state.RunDir, "pidfile") + } + + args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) + + if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.config.SdNotifySocket != "" { + args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.config.SdNotifySocket)) + } + + if ctr.config.Spec.Process.Terminal { + args = append(args, "-t") + } else if ctr.config.Stdin { + args = append(args, "-i") + } + + if ctr.config.Timeout > 0 { + args = append(args, fmt.Sprintf("--timeout=%d", ctr.config.Timeout)) + } + + if !r.enableKeyring { + args = append(args, "--no-new-keyring") + } + if ctr.config.ConmonPidFile != "" { + args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile) + } + + if r.noPivot { + args = append(args, "--no-pivot") + } + + exitCommand, err := specgenutil.CreateExitCommandArgs(ctr.runtime.storageConfig, ctr.runtime.config, logrus.IsLevelEnabled(logrus.DebugLevel), ctr.AutoRemove(), false) + if err != nil { + return 0, err + } + exitCommand = append(exitCommand, ctr.config.ID) + + args = append(args, "--exit-command", exitCommand[0]) + for _, arg := range exitCommand[1:] { + args = append(args, []string{"--exit-command-arg", arg}...) + } + + // Pass down the LISTEN_* environment (see #10443). + preserveFDs := ctr.config.PreserveFDs + if val := os.Getenv("LISTEN_FDS"); val != "" { + if ctr.config.PreserveFDs > 0 { + logrus.Warnf("Ignoring LISTEN_FDS to preserve custom user-specified FDs") + } else { + fds, err := strconv.Atoi(val) + if err != nil { + return 0, fmt.Errorf("converting LISTEN_FDS=%s: %w", val, err) + } + preserveFDs = uint(fds) + } + } + + if preserveFDs > 0 { + args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", preserveFDs))...) + } + + if restoreOptions != nil { + args = append(args, "--restore", ctr.CheckpointPath()) + if restoreOptions.TCPEstablished { + args = append(args, "--runtime-opt", "--tcp-established") + } + if restoreOptions.FileLocks { + args = append(args, "--runtime-opt", "--file-locks") + } + if restoreOptions.Pod != "" { + mountLabel := ctr.config.MountLabel + processLabel := ctr.config.ProcessLabel + if mountLabel != "" { + args = append( + args, + "--runtime-opt", + fmt.Sprintf( + "--lsm-mount-context=%s", + mountLabel, + ), + ) + } + if processLabel != "" { + args = append( + args, + "--runtime-opt", + fmt.Sprintf( + "--lsm-profile=selinux:%s", + processLabel, + ), + ) + } + } + } + + logrus.WithFields(logrus.Fields{ + "args": args, + }).Debugf("running conmon: %s", r.conmonPath) + + cmd := exec.Command(r.conmonPath, args...) + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + // TODO this is probably a really bad idea for some uses + // Make this configurable + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if ctr.config.Spec.Process.Terminal { + cmd.Stderr = &stderrBuf + } + + // 0, 1 and 2 are stdin, stdout and stderr + conmonEnv := r.configureConmonEnv(runtimeDir) + + var filesToClose []*os.File + if preserveFDs > 0 { + for fd := 3; fd < int(3+preserveFDs); fd++ { + f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)) + filesToClose = append(filesToClose, f) + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + } + } + + cmd.Env = r.conmonEnv + // we don't want to step on users fds they asked to preserve + // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 + cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4)) + cmd.Env = append(cmd.Env, conmonEnv...) + cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe) + + if r.reservePorts && !rootless.IsRootless() && !ctr.config.NetMode.IsSlirp4netns() { + ports, err := bindPorts(ctr.convertPortMappings()) + if err != nil { + return 0, err + } + filesToClose = append(filesToClose, ports...) + + // Leak the port we bound in the conmon process. These fd's won't be used + // by the container and conmon will keep the ports busy so that another + // process cannot use them. + cmd.ExtraFiles = append(cmd.ExtraFiles, ports...) + } + + if ctr.config.NetMode.IsSlirp4netns() || rootless.IsRootless() { + if ctr.config.PostConfigureNetNS { + havePortMapping := len(ctr.config.PortMappings) > 0 + if havePortMapping { + ctr.rootlessPortSyncR, ctr.rootlessPortSyncW, err = os.Pipe() + if err != nil { + return 0, fmt.Errorf("failed to create rootless port sync pipe: %w", err) + } + } + ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe() + if err != nil { + return 0, fmt.Errorf("failed to create rootless network sync pipe: %w", err) + } + } else { + if ctr.rootlessSlirpSyncR != nil { + defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR) + } + if ctr.rootlessSlirpSyncW != nil { + defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW) + } + } + // Leak one end in conmon, the other one will be leaked into slirp4netns + cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW) + + if ctr.rootlessPortSyncW != nil { + defer errorhandling.CloseQuiet(ctr.rootlessPortSyncW) + // Leak one end in conmon, the other one will be leaked into rootlessport + cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessPortSyncW) + } + } + var runtimeRestoreStarted time.Time + if restoreOptions != nil { + runtimeRestoreStarted = time.Now() + } + err = startCommand(cmd, ctr) + + // regardless of whether we errored or not, we no longer need the children pipes + childSyncPipe.Close() + childStartPipe.Close() + if err != nil { + return 0, err + } + if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe); err != nil { + return 0, err + } + /* Wait for initial setup and fork, and reap child */ + err = cmd.Wait() + if err != nil { + return 0, err + } + + pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog) + if err != nil { + if err2 := r.DeleteContainer(ctr); err2 != nil { + logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID()) + } + return 0, err + } + ctr.state.PID = pid + + conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile) + if err != nil { + logrus.Warnf("Error reading conmon pid file for container %s: %v", ctr.ID(), err) + } else if conmonPID > 0 { + // conmon not having a pid file is a valid state, so don't set it if we don't have it + logrus.Infof("Got Conmon PID as %d", conmonPID) + ctr.state.ConmonPID = conmonPID + } + + runtimeRestoreDuration := func() int64 { + if restoreOptions != nil && restoreOptions.PrintStats { + return time.Since(runtimeRestoreStarted).Microseconds() + } + return 0 + }() + + // These fds were passed down to the runtime. Close them + // and not interfere + for _, f := range filesToClose { + errorhandling.CloseQuiet(f) + } + + return runtimeRestoreDuration, nil +} + +// configureConmonEnv gets the environment values to add to conmon's exec struct +// TODO this may want to be less hardcoded/more configurable in the future +func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) []string { + var env []string + for _, e := range os.Environ() { + if strings.HasPrefix(e, "LC_") { + env = append(env, e) + } + } + if path, ok := os.LookupEnv("PATH"); ok { + env = append(env, fmt.Sprintf("PATH=%s", path)) + } + if conf, ok := os.LookupEnv("CONTAINERS_CONF"); ok { + env = append(env, fmt.Sprintf("CONTAINERS_CONF=%s", conf)) + } + if conf, ok := os.LookupEnv("CONTAINERS_HELPER_BINARY_DIR"); ok { + env = append(env, fmt.Sprintf("CONTAINERS_HELPER_BINARY_DIR=%s", conf)) + } + env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) + env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED"))) + env = append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID"))) + home := homedir.Get() + if home != "" { + env = append(env, fmt.Sprintf("HOME=%s", home)) + } + + return env +} + +// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI +func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath, logDriver, logTag string) []string { + // set the conmon API version to be able to use the correct sync struct keys + args := []string{ + "--api-version", "1", + "-c", ctr.ID(), + "-u", cuuid, + "-r", r.path, + "-b", bundlePath, + "-p", pidPath, + "-n", ctr.Name(), + "--exit-dir", exitDir, + "--full-attach", + } + if len(r.runtimeFlags) > 0 { + rFlags := []string{} + for _, arg := range r.runtimeFlags { + rFlags = append(rFlags, "--runtime-arg", arg) + } + args = append(args, rFlags...) + } + + if ctr.CgroupManager() == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit { + args = append(args, "-s") + } + + var logDriverArg string + switch logDriver { + case define.JournaldLogging: + logDriverArg = define.JournaldLogging + case define.NoLogging: + logDriverArg = define.NoLogging + case define.PassthroughLogging: + logDriverArg = define.PassthroughLogging + //lint:ignore ST1015 the default case has to be here + default: //nolint:stylecheck,gocritic + // No case here should happen except JSONLogging, but keep this here in case the options are extended + logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver()) + fallthrough + case "": + // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod + // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough + fallthrough + case define.JSONLogging: + fallthrough + case define.KubernetesLogging: + logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath) + } + + args = append(args, "-l", logDriverArg) + logLevel := logrus.GetLevel() + args = append(args, "--log-level", logLevel.String()) + + if logLevel == logrus.DebugLevel { + logrus.Debugf("%s messages will be logged to syslog", r.conmonPath) + args = append(args, "--syslog") + } + + size := r.logSizeMax + if ctr.config.LogSize > 0 { + size = ctr.config.LogSize + } + if size > 0 { + args = append(args, "--log-size-max", fmt.Sprintf("%v", size)) + } + + if ociLogPath != "" { + args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath)) + } + if logTag != "" { + args = append(args, "--log-tag", logTag) + } + if ctr.config.NoCgroups { + logrus.Debugf("Running with no Cgroups") + args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled") + } + return args +} + +func startCommand(cmd *exec.Cmd, ctr *Container) error { + // Make sure to unset the NOTIFY_SOCKET and reset it afterwards if needed. + switch ctr.config.SdNotifyMode { + case define.SdNotifyModeContainer, define.SdNotifyModeIgnore: + if prev := os.Getenv("NOTIFY_SOCKET"); prev != "" { + if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil { + logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err) + } + defer func() { + if err := os.Setenv("NOTIFY_SOCKET", prev); err != nil { + logrus.Errorf("Resetting NOTIFY_SOCKET=%s", prev) + } + }() + } + } + + return cmd.Start() +} + +// newPipe creates a unix socket pair for communication. +// Returns two files - first is parent, second is child. +func newPipe() (*os.File, *os.File, error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} + +// readConmonPidFile attempts to read conmon's pid from its pid file +func readConmonPidFile(pidFile string) (int, error) { + // Let's try reading the Conmon pid at the same time. + if pidFile != "" { + contents, err := ioutil.ReadFile(pidFile) + if err != nil { + return -1, err + } + // Convert it to an int + conmonPID, err := strconv.Atoi(string(contents)) + if err != nil { + return -1, err + } + return conmonPID, nil + } + return 0, nil +} + +// readConmonPipeData attempts to read a syncInfo struct from the pipe +func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) { + // syncInfo is used to return data from monitor process to daemon + type syncInfo struct { + Data int `json:"data"` + Message string `json:"message,omitempty"` + } + + // Wait to get container pid from conmon + type syncStruct struct { + si *syncInfo + err error + } + ch := make(chan syncStruct) + go func() { + var si *syncInfo + rdr := bufio.NewReader(pipe) + b, err := rdr.ReadBytes('\n') + // ignore EOF here, error is returned even when data was read + // if it is no valid json unmarshal will fail below + if err != nil && !errors.Is(err, io.EOF) { + ch <- syncStruct{err: err} + } + if err := json.Unmarshal(b, &si); err != nil { + ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)} + return + } + ch <- syncStruct{si: si} + }() + + data := -1 //nolint: wastedassign + select { + case ss := <-ch: + if ss.err != nil { + if ociLog != "" { + ociLogData, err := ioutil.ReadFile(ociLog) + if err == nil { + var ociErr ociError + if err := json.Unmarshal(ociLogData, &ociErr); err == nil { + return -1, getOCIRuntimeError(runtimeName, ociErr.Msg) + } + } + } + return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err) + } + logrus.Debugf("Received: %d", ss.si.Data) + if ss.si.Data < 0 { + if ociLog != "" { + ociLogData, err := ioutil.ReadFile(ociLog) + if err == nil { + var ociErr ociError + if err := json.Unmarshal(ociLogData, &ociErr); err == nil { + return ss.si.Data, getOCIRuntimeError(runtimeName, ociErr.Msg) + } + } + } + // If we failed to parse the JSON errors, then print the output as it is + if ss.si.Message != "" { + return ss.si.Data, getOCIRuntimeError(runtimeName, ss.si.Message) + } + return ss.si.Data, fmt.Errorf("container create failed: %w", define.ErrInternal) + } + data = ss.si.Data + case <-time.After(define.ContainerCreateTimeout): + return -1, fmt.Errorf("container creation timeout: %w", define.ErrInternal) + } + return data, nil +} + +// writeConmonPipeData writes nonce data to a pipe +func writeConmonPipeData(pipe *os.File) error { + someData := []byte{0} + _, err := pipe.Write(someData) + return err +} + +// formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon +func formatRuntimeOpts(opts ...string) []string { + args := make([]string, 0, len(opts)*2) + for _, o := range opts { + args = append(args, "--runtime-opt", o) + } + return args +} + +// getConmonVersion returns a string representation of the conmon version. +func (r *ConmonOCIRuntime) getConmonVersion() (string, error) { + output, err := utils.ExecCmd(r.conmonPath, "--version") + if err != nil { + return "", err + } + return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil +} + +// getOCIRuntimeVersion returns a string representation of the OCI runtime's +// version. +func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) { + output, err := utils.ExecCmd(r.path, "--version") + if err != nil { + return "", err + } + return strings.TrimSuffix(output, "\n"), nil +} + +// Copy data from container to HTTP connection, for terminal attach. +// Container is the container's attach socket connection, http is a buffer for +// the HTTP connection. cid is the ID of the container the attach session is +// running for (used solely for error messages). +func httpAttachTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string) error { + buf := make([]byte, bufferSize) + for { + numR, err := container.Read(buf) + logrus.Debugf("Read fd(%d) %d/%d bytes for container %s", int(buf[0]), numR, len(buf), cid) + + if numR > 0 { + switch buf[0] { + case AttachPipeStdout: + // Do nothing + default: + logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) + continue + } + + numW, err2 := http.Write(buf[1:numR]) + if err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s STDOUT: %v", cid, err) + } + return err2 + } else if numW+1 != numR { + return io.ErrShortWrite + } + // We need to force the buffer to write immediately, so + // there isn't a delay on the terminal side. + if err2 := http.Flush(); err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s STDOUT: %v", cid, err) + } + return err2 + } + } + if err != nil { + if err == io.EOF { + return nil + } + return err + } + } +} + +// Copy data from a container to an HTTP connection, for non-terminal attach. +// Appends a header to multiplex input. +func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string, stdin, stdout, stderr bool) error { + buf := make([]byte, bufferSize) + for { + numR, err := container.Read(buf) + if numR > 0 { + var headerBuf []byte + + // Subtract 1 because we strip the first byte (used for + // multiplexing by Conmon). + headerLen := uint32(numR - 1) + // Practically speaking, we could make this buf[0] - 1, + // but we need to validate it anyway. + switch buf[0] { + case AttachPipeStdin: + headerBuf = makeHTTPAttachHeader(0, headerLen) + if !stdin { + continue + } + case AttachPipeStdout: + if !stdout { + continue + } + headerBuf = makeHTTPAttachHeader(1, headerLen) + case AttachPipeStderr: + if !stderr { + continue + } + headerBuf = makeHTTPAttachHeader(2, headerLen) + default: + logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) + continue + } + + numH, err2 := http.Write(headerBuf) + if err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return err2 + } + // Hardcoding header length is pretty gross, but + // fast. Should be safe, as this is a fixed part + // of the protocol. + if numH != 8 { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return io.ErrShortWrite + } + + numW, err2 := http.Write(buf[1:numR]) + if err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return err2 + } else if numW+1 != numR { + if err != nil { + logrus.Errorf("Reading container %s standard streams: %v", cid, err) + } + + return io.ErrShortWrite + } + // We need to force the buffer to write immediately, so + // there isn't a delay on the terminal side. + if err2 := http.Flush(); err2 != nil { + if err != nil { + logrus.Errorf("Reading container %s STDOUT: %v", cid, err) + } + return err2 + } + } + if err != nil { + if err == io.EOF { + return nil + } + + return err + } + } +} diff --git a/libpod/oci_conmon_exec_common.go b/libpod/oci_conmon_exec_common.go new file mode 100644 index 000000000..e5080942b --- /dev/null +++ b/libpod/oci_conmon_exec_common.go @@ -0,0 +1,771 @@ +package libpod + +import ( + "errors" + "fmt" + "io/ioutil" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/resize" + cutil "github.com/containers/common/pkg/util" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/pkg/errorhandling" + "github.com/containers/podman/v4/pkg/lookup" + "github.com/containers/podman/v4/pkg/util" + spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// ExecContainer executes a command in a running container +func (r *ConmonOCIRuntime) ExecContainer(c *Container, sessionID string, options *ExecOptions, streams *define.AttachStreams, newSize *resize.TerminalSize) (int, chan error, error) { + if options == nil { + return -1, nil, fmt.Errorf("must provide an ExecOptions struct to ExecContainer: %w", define.ErrInvalidArg) + } + if len(options.Cmd) == 0 { + return -1, nil, fmt.Errorf("must provide a command to execute: %w", define.ErrInvalidArg) + } + + if sessionID == "" { + return -1, nil, fmt.Errorf("must provide a session ID for exec: %w", define.ErrEmptyID) + } + + // TODO: Should we default this to false? + // Or maybe make streams mandatory? + attachStdin := true + if streams != nil { + attachStdin = streams.AttachInput + } + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = c.execOCILog(sessionID) + } + + execCmd, pipes, err := r.startExec(c, sessionID, options, attachStdin, ociLog) + if err != nil { + return -1, nil, err + } + + // Only close sync pipe. Start and attach are consumed in the attach + // goroutine. + defer func() { + if pipes.syncPipe != nil && !pipes.syncClosed { + errorhandling.CloseQuiet(pipes.syncPipe) + pipes.syncClosed = true + } + }() + + // TODO Only create if !detach + // Attach to the container before starting it + attachChan := make(chan error) + go func() { + // attachToExec is responsible for closing pipes + attachChan <- c.attachToExec(streams, options.DetachKeys, sessionID, pipes.startPipe, pipes.attachPipe, newSize) + close(attachChan) + }() + + if err := execCmd.Wait(); err != nil { + return -1, nil, fmt.Errorf("cannot run conmon: %w", err) + } + + pid, err := readConmonPipeData(r.name, pipes.syncPipe, ociLog) + + return pid, attachChan, err +} + +// ExecContainerHTTP executes a new command in an existing container and +// forwards its standard streams over an attach +func (r *ConmonOCIRuntime) ExecContainerHTTP(ctr *Container, sessionID string, options *ExecOptions, req *http.Request, w http.ResponseWriter, + streams *HTTPAttachStreams, cancel <-chan bool, hijackDone chan<- bool, holdConnOpen <-chan bool, newSize *resize.TerminalSize) (int, chan error, error) { + if streams != nil { + if !streams.Stdin && !streams.Stdout && !streams.Stderr { + return -1, nil, fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) + } + } + + if options == nil { + return -1, nil, fmt.Errorf("must provide exec options to ExecContainerHTTP: %w", define.ErrInvalidArg) + } + + detachString := config.DefaultDetachKeys + if options.DetachKeys != nil { + detachString = *options.DetachKeys + } + detachKeys, err := processDetachKeys(detachString) + if err != nil { + return -1, nil, err + } + + // TODO: Should we default this to false? + // Or maybe make streams mandatory? + attachStdin := true + if streams != nil { + attachStdin = streams.Stdin + } + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = ctr.execOCILog(sessionID) + } + + execCmd, pipes, err := r.startExec(ctr, sessionID, options, attachStdin, ociLog) + if err != nil { + return -1, nil, err + } + + // Only close sync pipe. Start and attach are consumed in the attach + // goroutine. + defer func() { + if pipes.syncPipe != nil && !pipes.syncClosed { + errorhandling.CloseQuiet(pipes.syncPipe) + pipes.syncClosed = true + } + }() + + attachChan := make(chan error) + conmonPipeDataChan := make(chan conmonPipeData) + go func() { + // attachToExec is responsible for closing pipes + attachChan <- attachExecHTTP(ctr, sessionID, req, w, streams, pipes, detachKeys, options.Terminal, cancel, hijackDone, holdConnOpen, execCmd, conmonPipeDataChan, ociLog, newSize, r.name) + close(attachChan) + }() + + // NOTE: the channel is needed to communicate conmon's data. In case + // of an error, the error will be written on the hijacked http + // connection such that remote clients will receive the error. + pipeData := <-conmonPipeDataChan + + return pipeData.pid, attachChan, pipeData.err +} + +// conmonPipeData contains the data when reading from conmon's pipe. +type conmonPipeData struct { + pid int + err error +} + +// ExecContainerDetached executes a command in a running container, but does +// not attach to it. +func (r *ConmonOCIRuntime) ExecContainerDetached(ctr *Container, sessionID string, options *ExecOptions, stdin bool) (int, error) { + if options == nil { + return -1, fmt.Errorf("must provide exec options to ExecContainerHTTP: %w", define.ErrInvalidArg) + } + + var ociLog string + if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { + ociLog = ctr.execOCILog(sessionID) + } + + execCmd, pipes, err := r.startExec(ctr, sessionID, options, stdin, ociLog) + if err != nil { + return -1, err + } + + defer func() { + pipes.cleanup() + }() + + // Wait for Conmon to tell us we're ready to attach. + // We aren't actually *going* to attach, but this means that we're good + // to proceed. + if _, err := readConmonPipeData(r.name, pipes.attachPipe, ""); err != nil { + return -1, err + } + + // Start the exec session + if err := writeConmonPipeData(pipes.startPipe); err != nil { + return -1, err + } + + // Wait for conmon to succeed, when return. + if err := execCmd.Wait(); err != nil { + return -1, fmt.Errorf("cannot run conmon: %w", err) + } + + pid, err := readConmonPipeData(r.name, pipes.syncPipe, ociLog) + + return pid, err +} + +// ExecAttachResize resizes the TTY of the given exec session. +func (r *ConmonOCIRuntime) ExecAttachResize(ctr *Container, sessionID string, newSize resize.TerminalSize) error { + controlFile, err := openControlFile(ctr, ctr.execBundlePath(sessionID)) + if err != nil { + return err + } + defer controlFile.Close() + + if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil { + return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err) + } + + return nil +} + +// ExecStopContainer stops a given exec session in a running container. +func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, timeout uint) error { + pid, err := ctr.getExecSessionPID(sessionID) + if err != nil { + return err + } + + logrus.Debugf("Going to stop container %s exec session %s", ctr.ID(), sessionID) + + // Is the session dead? + // Ping the PID with signal 0 to see if it still exists. + if err := unix.Kill(pid, 0); err != nil { + if err == unix.ESRCH { + return nil + } + return fmt.Errorf("pinging container %s exec session %s PID %d with signal 0: %w", ctr.ID(), sessionID, pid, err) + } + + if timeout > 0 { + // Use SIGTERM by default, then SIGSTOP after timeout. + logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGTERM", sessionID, pid, ctr.ID()) + if err := unix.Kill(pid, unix.SIGTERM); err != nil { + if err == unix.ESRCH { + return nil + } + return fmt.Errorf("killing container %s exec session %s PID %d with SIGTERM: %w", ctr.ID(), sessionID, pid, err) + } + + // Wait for the PID to stop + if err := waitPidStop(pid, time.Duration(timeout)*time.Second); err != nil { + logrus.Infof("Timed out waiting for container %s exec session %s to stop, resorting to SIGKILL: %v", ctr.ID(), sessionID, err) + } else { + // No error, container is dead + return nil + } + } + + // SIGTERM did not work. On to SIGKILL. + logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID()) + if err := unix.Kill(pid, unix.SIGTERM); err != nil { + if err == unix.ESRCH { + return nil + } + return fmt.Errorf("killing container %s exec session %s PID %d with SIGKILL: %w", ctr.ID(), sessionID, pid, err) + } + + // Wait for the PID to stop + if err := waitPidStop(pid, killContainerTimeout); err != nil { + return fmt.Errorf("timed out waiting for container %s exec session %s PID %d to stop after SIGKILL: %w", ctr.ID(), sessionID, pid, err) + } + + return nil +} + +// ExecUpdateStatus checks if the given exec session is still running. +func (r *ConmonOCIRuntime) ExecUpdateStatus(ctr *Container, sessionID string) (bool, error) { + pid, err := ctr.getExecSessionPID(sessionID) + if err != nil { + return false, err + } + + logrus.Debugf("Checking status of container %s exec session %s", ctr.ID(), sessionID) + + // Is the session dead? + // Ping the PID with signal 0 to see if it still exists. + if err := unix.Kill(pid, 0); err != nil { + if err == unix.ESRCH { + return false, nil + } + return false, fmt.Errorf("pinging container %s exec session %s PID %d with signal 0: %w", ctr.ID(), sessionID, pid, err) + } + + return true, nil +} + +// ExecAttachSocketPath is the path to a container's exec session attach socket. +func (r *ConmonOCIRuntime) ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) { + // We don't even use container, so don't validity check it + if sessionID == "" { + return "", fmt.Errorf("must provide a valid session ID to get attach socket path: %w", define.ErrInvalidArg) + } + + return filepath.Join(ctr.execBundlePath(sessionID), "attach"), nil +} + +// This contains pipes used by the exec API. +type execPipes struct { + syncPipe *os.File + syncClosed bool + startPipe *os.File + startClosed bool + attachPipe *os.File + attachClosed bool +} + +func (p *execPipes) cleanup() { + if p.syncPipe != nil && !p.syncClosed { + errorhandling.CloseQuiet(p.syncPipe) + p.syncClosed = true + } + if p.startPipe != nil && !p.startClosed { + errorhandling.CloseQuiet(p.startPipe) + p.startClosed = true + } + if p.attachPipe != nil && !p.attachClosed { + errorhandling.CloseQuiet(p.attachPipe) + p.attachClosed = true + } +} + +// Start an exec session's conmon parent from the given options. +func (r *ConmonOCIRuntime) startExec(c *Container, sessionID string, options *ExecOptions, attachStdin bool, ociLog string) (_ *exec.Cmd, _ *execPipes, deferredErr error) { + pipes := new(execPipes) + + if options == nil { + return nil, nil, fmt.Errorf("must provide an ExecOptions struct to ExecContainer: %w", define.ErrInvalidArg) + } + if len(options.Cmd) == 0 { + return nil, nil, fmt.Errorf("must provide a command to execute: %w", define.ErrInvalidArg) + } + + if sessionID == "" { + return nil, nil, fmt.Errorf("must provide a session ID for exec: %w", define.ErrEmptyID) + } + + // create sync pipe to receive the pid + parentSyncPipe, childSyncPipe, err := newPipe() + if err != nil { + return nil, nil, fmt.Errorf("creating socket pair: %w", err) + } + pipes.syncPipe = parentSyncPipe + + defer func() { + if deferredErr != nil { + pipes.cleanup() + } + }() + + // create start pipe to set the cgroup before running + // attachToExec is responsible for closing parentStartPipe + childStartPipe, parentStartPipe, err := newPipe() + if err != nil { + return nil, nil, fmt.Errorf("creating socket pair: %w", err) + } + pipes.startPipe = parentStartPipe + + // create the attach pipe to allow attach socket to be created before + // $RUNTIME exec starts running. This is to make sure we can capture all output + // from the process through that socket, rather than half reading the log, half attaching to the socket + // attachToExec is responsible for closing parentAttachPipe + parentAttachPipe, childAttachPipe, err := newPipe() + if err != nil { + return nil, nil, fmt.Errorf("creating socket pair: %w", err) + } + pipes.attachPipe = parentAttachPipe + + childrenClosed := false + defer func() { + if !childrenClosed { + errorhandling.CloseQuiet(childSyncPipe) + errorhandling.CloseQuiet(childAttachPipe) + errorhandling.CloseQuiet(childStartPipe) + } + }() + + runtimeDir, err := util.GetRuntimeDir() + if err != nil { + return nil, nil, err + } + + finalEnv := make([]string, 0, len(options.Env)) + for k, v := range options.Env { + finalEnv = append(finalEnv, fmt.Sprintf("%s=%s", k, v)) + } + + processFile, err := c.prepareProcessExec(options, finalEnv, sessionID) + if err != nil { + return nil, nil, err + } + defer processFile.Close() + + args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog, define.NoLogging, c.config.LogTag) + + if options.PreserveFDs > 0 { + args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", options.PreserveFDs))...) + } + + if options.Terminal { + args = append(args, "-t") + } + + if attachStdin { + args = append(args, "-i") + } + + // Append container ID and command + args = append(args, "-e") + // TODO make this optional when we can detach + args = append(args, "--exec-attach") + args = append(args, "--exec-process-spec", processFile.Name()) + + if len(options.ExitCommand) > 0 { + args = append(args, "--exit-command", options.ExitCommand[0]) + for _, arg := range options.ExitCommand[1:] { + args = append(args, []string{"--exit-command-arg", arg}...) + } + if options.ExitCommandDelay > 0 { + args = append(args, []string{"--exit-delay", fmt.Sprintf("%d", options.ExitCommandDelay)}...) + } + } + + logrus.WithFields(logrus.Fields{ + "args": args, + }).Debugf("running conmon: %s", r.conmonPath) + execCmd := exec.Command(r.conmonPath, args...) + + // TODO: This is commented because it doesn't make much sense in HTTP + // attach, and I'm not certain it does for non-HTTP attach as well. + // if streams != nil { + // // Don't add the InputStream to the execCmd. Instead, the data should be passed + // // through CopyDetachable + // if streams.AttachOutput { + // execCmd.Stdout = options.Streams.OutputStream + // } + // if streams.AttachError { + // execCmd.Stderr = options.Streams.ErrorStream + // } + // } + + conmonEnv := r.configureConmonEnv(runtimeDir) + + var filesToClose []*os.File + if options.PreserveFDs > 0 { + for fd := 3; fd < int(3+options.PreserveFDs); fd++ { + f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)) + filesToClose = append(filesToClose, f) + execCmd.ExtraFiles = append(execCmd.ExtraFiles, f) + } + } + + // we don't want to step on users fds they asked to preserve + // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 + execCmd.Env = r.conmonEnv + execCmd.Env = append(execCmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", options.PreserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", options.PreserveFDs+4), fmt.Sprintf("_OCI_ATTACHPIPE=%d", options.PreserveFDs+5)) + execCmd.Env = append(execCmd.Env, conmonEnv...) + + execCmd.ExtraFiles = append(execCmd.ExtraFiles, childSyncPipe, childStartPipe, childAttachPipe) + execCmd.Dir = c.execBundlePath(sessionID) + execCmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + + err = startCommand(execCmd, c) + + // We don't need children pipes on the parent side + errorhandling.CloseQuiet(childSyncPipe) + errorhandling.CloseQuiet(childAttachPipe) + errorhandling.CloseQuiet(childStartPipe) + childrenClosed = true + + if err != nil { + return nil, nil, fmt.Errorf("cannot start container %s: %w", c.ID(), err) + } + if err := r.moveConmonToCgroupAndSignal(c, execCmd, parentStartPipe); err != nil { + return nil, nil, err + } + + // These fds were passed down to the runtime. Close them + // and not interfere + for _, f := range filesToClose { + errorhandling.CloseQuiet(f) + } + + return execCmd, pipes, nil +} + +// Attach to a container over HTTP +func attachExecHTTP(c *Container, sessionID string, r *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, pipes *execPipes, detachKeys []byte, isTerminal bool, cancel <-chan bool, hijackDone chan<- bool, holdConnOpen <-chan bool, execCmd *exec.Cmd, conmonPipeDataChan chan<- conmonPipeData, ociLog string, newSize *resize.TerminalSize, runtimeName string) (deferredErr error) { + // NOTE: As you may notice, the attach code is quite complex. + // Many things happen concurrently and yet are interdependent. + // If you ever change this function, make sure to write to the + // conmonPipeDataChan in case of an error. + + if pipes == nil || pipes.startPipe == nil || pipes.attachPipe == nil { + err := fmt.Errorf("must provide a start and attach pipe to finish an exec attach: %w", define.ErrInvalidArg) + conmonPipeDataChan <- conmonPipeData{-1, err} + return err + } + + defer func() { + if !pipes.startClosed { + errorhandling.CloseQuiet(pipes.startPipe) + pipes.startClosed = true + } + if !pipes.attachClosed { + errorhandling.CloseQuiet(pipes.attachPipe) + pipes.attachClosed = true + } + }() + + logrus.Debugf("Attaching to container %s exec session %s", c.ID(), sessionID) + + // set up the socket path, such that it is the correct length and location for exec + sockPath, err := c.execAttachSocketPath(sessionID) + if err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + return err + } + + // 2: read from attachFd that the parent process has set up the console socket + if _, err := readConmonPipeData(runtimeName, pipes.attachPipe, ""); err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + return err + } + + // resize before we start the container process + if newSize != nil { + err = c.ociRuntime.ExecAttachResize(c, sessionID, *newSize) + if err != nil { + logrus.Warnf("Resize failed: %v", err) + } + } + + // 2: then attach + conn, err := openUnixSocket(sockPath) + if err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + return fmt.Errorf("failed to connect to container's attach socket: %v: %w", sockPath, err) + } + defer func() { + if err := conn.Close(); err != nil { + logrus.Errorf("Unable to close socket: %q", err) + } + }() + + attachStdout := true + attachStderr := true + attachStdin := true + if streams != nil { + attachStdout = streams.Stdout + attachStderr = streams.Stderr + attachStdin = streams.Stdin + } + + // Perform hijack + hijacker, ok := w.(http.Hijacker) + if !ok { + conmonPipeDataChan <- conmonPipeData{-1, err} + return errors.New("unable to hijack connection") + } + + httpCon, httpBuf, err := hijacker.Hijack() + if err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + return fmt.Errorf("hijacking connection: %w", err) + } + + hijackDone <- true + + // Write a header to let the client know what happened + writeHijackHeader(r, httpBuf) + + // Force a flush after the header is written. + if err := httpBuf.Flush(); err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + return fmt.Errorf("flushing HTTP hijack header: %w", err) + } + + go func() { + // Wait for conmon to succeed, when return. + if err := execCmd.Wait(); err != nil { + conmonPipeDataChan <- conmonPipeData{-1, err} + } else { + pid, err := readConmonPipeData(runtimeName, pipes.syncPipe, ociLog) + if err != nil { + hijackWriteError(err, c.ID(), isTerminal, httpBuf) + conmonPipeDataChan <- conmonPipeData{pid, err} + } else { + conmonPipeDataChan <- conmonPipeData{pid, err} + } + } + // We need to hold the connection open until the complete exec + // function has finished. This channel will be closed in a defer + // in that function, so we can wait for it here. + // Can't be a defer, because this would block the function from + // returning. + <-holdConnOpen + hijackWriteErrorAndClose(deferredErr, c.ID(), isTerminal, httpCon, httpBuf) + }() + + stdoutChan := make(chan error) + stdinChan := make(chan error) + + // Next, STDIN. Avoid entirely if attachStdin unset. + if attachStdin { + go func() { + logrus.Debugf("Beginning STDIN copy") + _, err := cutil.CopyDetachable(conn, httpBuf, detachKeys) + logrus.Debugf("STDIN copy completed") + stdinChan <- err + }() + } + + // 4: send start message to child + if err := writeConmonPipeData(pipes.startPipe); err != nil { + return err + } + + // Handle STDOUT/STDERR *after* start message is sent + go func() { + var err error + if isTerminal { + // Hack: return immediately if attachStdout not set to + // emulate Docker. + // Basically, when terminal is set, STDERR goes nowhere. + // Everything does over STDOUT. + // Therefore, if not attaching STDOUT - we'll never copy + // anything from here. + logrus.Debugf("Performing terminal HTTP attach for container %s", c.ID()) + if attachStdout { + err = httpAttachTerminalCopy(conn, httpBuf, c.ID()) + } + } else { + logrus.Debugf("Performing non-terminal HTTP attach for container %s", c.ID()) + err = httpAttachNonTerminalCopy(conn, httpBuf, c.ID(), attachStdin, attachStdout, attachStderr) + } + stdoutChan <- err + logrus.Debugf("STDOUT/ERR copy completed") + }() + + for { + select { + case err := <-stdoutChan: + if err != nil { + return err + } + + return nil + case err := <-stdinChan: + if err != nil { + return err + } + // copy stdin is done, close it + if connErr := socketCloseWrite(conn); connErr != nil { + logrus.Errorf("Unable to close conn: %v", connErr) + } + case <-cancel: + return nil + } + } +} + +// prepareProcessExec returns the path of the process.json used in runc exec -p +// caller is responsible to close the returned *os.File if needed. +func (c *Container) prepareProcessExec(options *ExecOptions, env []string, sessionID string) (*os.File, error) { + f, err := ioutil.TempFile(c.execBundlePath(sessionID), "exec-process-") + if err != nil { + return nil, err + } + pspec := new(spec.Process) + if err := JSONDeepCopy(c.config.Spec.Process, pspec); err != nil { + return nil, err + } + pspec.SelinuxLabel = c.config.ProcessLabel + pspec.Args = options.Cmd + + // We need to default this to false else it will inherit terminal as true + // from the container. + pspec.Terminal = false + if options.Terminal { + pspec.Terminal = true + } + if len(env) > 0 { + pspec.Env = append(pspec.Env, env...) + } + + // Add secret envs if they exist + manager, err := c.runtime.SecretsManager() + if err != nil { + return nil, err + } + for name, secr := range c.config.EnvSecrets { + _, data, err := manager.LookupSecretData(secr.Name) + if err != nil { + return nil, err + } + pspec.Env = append(pspec.Env, fmt.Sprintf("%s=%s", name, string(data))) + } + + if options.Cwd != "" { + pspec.Cwd = options.Cwd + } + + var addGroups []string + var sgids []uint32 + + // if the user is empty, we should inherit the user that the container is currently running with + user := options.User + if user == "" { + logrus.Debugf("Set user to %s", c.config.User) + user = c.config.User + addGroups = c.config.Groups + } + + overrides := c.getUserOverrides() + execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, user, overrides) + if err != nil { + return nil, err + } + + if len(addGroups) > 0 { + sgids, err = lookup.GetContainerGroups(addGroups, c.state.Mountpoint, overrides) + if err != nil { + return nil, fmt.Errorf("looking up supplemental groups for container %s exec session %s: %w", c.ID(), sessionID, err) + } + } + + // If user was set, look it up in the container to get a UID to use on + // the host + if user != "" || len(sgids) > 0 { + if user != "" { + for _, sgid := range execUser.Sgids { + sgids = append(sgids, uint32(sgid)) + } + } + processUser := spec.User{ + UID: uint32(execUser.Uid), + GID: uint32(execUser.Gid), + AdditionalGids: sgids, + } + + pspec.User = processUser + } + + if err := c.setProcessCapabilitiesExec(options, user, execUser, pspec); err != nil { + return nil, err + } + + hasHomeSet := false + for _, s := range pspec.Env { + if strings.HasPrefix(s, "HOME=") { + hasHomeSet = true + break + } + } + if !hasHomeSet { + pspec.Env = append(pspec.Env, fmt.Sprintf("HOME=%s", execUser.Home)) + } + + processJSON, err := json.Marshal(pspec) + if err != nil { + return nil, err + } + + if err := ioutil.WriteFile(f.Name(), processJSON, 0644); err != nil { + return nil, err + } + return f, nil +} diff --git a/libpod/oci_conmon_exec_freebsd.go b/libpod/oci_conmon_exec_freebsd.go new file mode 100644 index 000000000..bf30404a1 --- /dev/null +++ b/libpod/oci_conmon_exec_freebsd.go @@ -0,0 +1,10 @@ +package libpod + +import ( + "github.com/opencontainers/runc/libcontainer/user" + spec "github.com/opencontainers/runtime-spec/specs-go" +) + +func (c *Container) setProcessCapabilitiesExec(options *ExecOptions, user string, execUser *user.ExecUser, pspec *spec.Process) error { + return nil +} diff --git a/libpod/oci_conmon_exec_linux.go b/libpod/oci_conmon_exec_linux.go index 16cd7ef9f..617e8d601 100644 --- a/libpod/oci_conmon_exec_linux.go +++ b/libpod/oci_conmon_exec_linux.go @@ -1,758 +1,20 @@ package libpod import ( - "errors" - "fmt" - "io/ioutil" - "net/http" - "os" - "os/exec" - "path/filepath" - "strings" - "syscall" - "time" - "github.com/containers/common/pkg/capabilities" - "github.com/containers/common/pkg/config" - "github.com/containers/common/pkg/resize" - cutil "github.com/containers/common/pkg/util" - "github.com/containers/podman/v4/libpod/define" - "github.com/containers/podman/v4/pkg/errorhandling" - "github.com/containers/podman/v4/pkg/lookup" - "github.com/containers/podman/v4/pkg/util" + "github.com/opencontainers/runc/libcontainer/user" spec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" ) -// ExecContainer executes a command in a running container -func (r *ConmonOCIRuntime) ExecContainer(c *Container, sessionID string, options *ExecOptions, streams *define.AttachStreams, newSize *resize.TerminalSize) (int, chan error, error) { - if options == nil { - return -1, nil, fmt.Errorf("must provide an ExecOptions struct to ExecContainer: %w", define.ErrInvalidArg) - } - if len(options.Cmd) == 0 { - return -1, nil, fmt.Errorf("must provide a command to execute: %w", define.ErrInvalidArg) - } - - if sessionID == "" { - return -1, nil, fmt.Errorf("must provide a session ID for exec: %w", define.ErrEmptyID) - } - - // TODO: Should we default this to false? - // Or maybe make streams mandatory? - attachStdin := true - if streams != nil { - attachStdin = streams.AttachInput - } - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = c.execOCILog(sessionID) - } - - execCmd, pipes, err := r.startExec(c, sessionID, options, attachStdin, ociLog) - if err != nil { - return -1, nil, err - } - - // Only close sync pipe. Start and attach are consumed in the attach - // goroutine. - defer func() { - if pipes.syncPipe != nil && !pipes.syncClosed { - errorhandling.CloseQuiet(pipes.syncPipe) - pipes.syncClosed = true - } - }() - - // TODO Only create if !detach - // Attach to the container before starting it - attachChan := make(chan error) - go func() { - // attachToExec is responsible for closing pipes - attachChan <- c.attachToExec(streams, options.DetachKeys, sessionID, pipes.startPipe, pipes.attachPipe, newSize) - close(attachChan) - }() - - if err := execCmd.Wait(); err != nil { - return -1, nil, fmt.Errorf("cannot run conmon: %w", err) - } - - pid, err := readConmonPipeData(r.name, pipes.syncPipe, ociLog) - - return pid, attachChan, err -} - -// ExecContainerHTTP executes a new command in an existing container and -// forwards its standard streams over an attach -func (r *ConmonOCIRuntime) ExecContainerHTTP(ctr *Container, sessionID string, options *ExecOptions, req *http.Request, w http.ResponseWriter, - streams *HTTPAttachStreams, cancel <-chan bool, hijackDone chan<- bool, holdConnOpen <-chan bool, newSize *resize.TerminalSize) (int, chan error, error) { - if streams != nil { - if !streams.Stdin && !streams.Stdout && !streams.Stderr { - return -1, nil, fmt.Errorf("must provide at least one stream to attach to: %w", define.ErrInvalidArg) - } - } - - if options == nil { - return -1, nil, fmt.Errorf("must provide exec options to ExecContainerHTTP: %w", define.ErrInvalidArg) - } - - detachString := config.DefaultDetachKeys - if options.DetachKeys != nil { - detachString = *options.DetachKeys - } - detachKeys, err := processDetachKeys(detachString) - if err != nil { - return -1, nil, err - } - - // TODO: Should we default this to false? - // Or maybe make streams mandatory? - attachStdin := true - if streams != nil { - attachStdin = streams.Stdin - } - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = ctr.execOCILog(sessionID) - } - - execCmd, pipes, err := r.startExec(ctr, sessionID, options, attachStdin, ociLog) - if err != nil { - return -1, nil, err - } - - // Only close sync pipe. Start and attach are consumed in the attach - // goroutine. - defer func() { - if pipes.syncPipe != nil && !pipes.syncClosed { - errorhandling.CloseQuiet(pipes.syncPipe) - pipes.syncClosed = true - } - }() - - attachChan := make(chan error) - conmonPipeDataChan := make(chan conmonPipeData) - go func() { - // attachToExec is responsible for closing pipes - attachChan <- attachExecHTTP(ctr, sessionID, req, w, streams, pipes, detachKeys, options.Terminal, cancel, hijackDone, holdConnOpen, execCmd, conmonPipeDataChan, ociLog, newSize, r.name) - close(attachChan) - }() - - // NOTE: the channel is needed to communicate conmon's data. In case - // of an error, the error will be written on the hijacked http - // connection such that remote clients will receive the error. - pipeData := <-conmonPipeDataChan - - return pipeData.pid, attachChan, pipeData.err -} - -// conmonPipeData contains the data when reading from conmon's pipe. -type conmonPipeData struct { - pid int - err error -} - -// ExecContainerDetached executes a command in a running container, but does -// not attach to it. -func (r *ConmonOCIRuntime) ExecContainerDetached(ctr *Container, sessionID string, options *ExecOptions, stdin bool) (int, error) { - if options == nil { - return -1, fmt.Errorf("must provide exec options to ExecContainerHTTP: %w", define.ErrInvalidArg) - } - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = ctr.execOCILog(sessionID) - } - - execCmd, pipes, err := r.startExec(ctr, sessionID, options, stdin, ociLog) - if err != nil { - return -1, err - } - - defer func() { - pipes.cleanup() - }() - - // Wait for Conmon to tell us we're ready to attach. - // We aren't actually *going* to attach, but this means that we're good - // to proceed. - if _, err := readConmonPipeData(r.name, pipes.attachPipe, ""); err != nil { - return -1, err - } - - // Start the exec session - if err := writeConmonPipeData(pipes.startPipe); err != nil { - return -1, err - } - - // Wait for conmon to succeed, when return. - if err := execCmd.Wait(); err != nil { - return -1, fmt.Errorf("cannot run conmon: %w", err) - } - - pid, err := readConmonPipeData(r.name, pipes.syncPipe, ociLog) - - return pid, err -} - -// ExecAttachResize resizes the TTY of the given exec session. -func (r *ConmonOCIRuntime) ExecAttachResize(ctr *Container, sessionID string, newSize resize.TerminalSize) error { - controlFile, err := openControlFile(ctr, ctr.execBundlePath(sessionID)) - if err != nil { - return err - } - defer controlFile.Close() - - if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil { - return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err) - } - - return nil -} - -// ExecStopContainer stops a given exec session in a running container. -func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, timeout uint) error { - pid, err := ctr.getExecSessionPID(sessionID) - if err != nil { - return err - } - - logrus.Debugf("Going to stop container %s exec session %s", ctr.ID(), sessionID) - - // Is the session dead? - // Ping the PID with signal 0 to see if it still exists. - if err := unix.Kill(pid, 0); err != nil { - if err == unix.ESRCH { - return nil - } - return fmt.Errorf("error pinging container %s exec session %s PID %d with signal 0: %w", ctr.ID(), sessionID, pid, err) - } - - if timeout > 0 { - // Use SIGTERM by default, then SIGSTOP after timeout. - logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGTERM", sessionID, pid, ctr.ID()) - if err := unix.Kill(pid, unix.SIGTERM); err != nil { - if err == unix.ESRCH { - return nil - } - return fmt.Errorf("error killing container %s exec session %s PID %d with SIGTERM: %w", ctr.ID(), sessionID, pid, err) - } - - // Wait for the PID to stop - if err := waitPidStop(pid, time.Duration(timeout)*time.Second); err != nil { - logrus.Infof("Timed out waiting for container %s exec session %s to stop, resorting to SIGKILL: %v", ctr.ID(), sessionID, err) - } else { - // No error, container is dead - return nil - } - } - - // SIGTERM did not work. On to SIGKILL. - logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID()) - if err := unix.Kill(pid, unix.SIGTERM); err != nil { - if err == unix.ESRCH { - return nil - } - return fmt.Errorf("error killing container %s exec session %s PID %d with SIGKILL: %w", ctr.ID(), sessionID, pid, err) - } - - // Wait for the PID to stop - if err := waitPidStop(pid, killContainerTimeout); err != nil { - return fmt.Errorf("timed out waiting for container %s exec session %s PID %d to stop after SIGKILL: %w", ctr.ID(), sessionID, pid, err) - } - - return nil -} - -// ExecUpdateStatus checks if the given exec session is still running. -func (r *ConmonOCIRuntime) ExecUpdateStatus(ctr *Container, sessionID string) (bool, error) { - pid, err := ctr.getExecSessionPID(sessionID) - if err != nil { - return false, err - } - - logrus.Debugf("Checking status of container %s exec session %s", ctr.ID(), sessionID) - - // Is the session dead? - // Ping the PID with signal 0 to see if it still exists. - if err := unix.Kill(pid, 0); err != nil { - if err == unix.ESRCH { - return false, nil - } - return false, fmt.Errorf("error pinging container %s exec session %s PID %d with signal 0: %w", ctr.ID(), sessionID, pid, err) - } - - return true, nil -} - -// ExecAttachSocketPath is the path to a container's exec session attach socket. -func (r *ConmonOCIRuntime) ExecAttachSocketPath(ctr *Container, sessionID string) (string, error) { - // We don't even use container, so don't validity check it - if sessionID == "" { - return "", fmt.Errorf("must provide a valid session ID to get attach socket path: %w", define.ErrInvalidArg) - } - - return filepath.Join(ctr.execBundlePath(sessionID), "attach"), nil -} - -// This contains pipes used by the exec API. -type execPipes struct { - syncPipe *os.File - syncClosed bool - startPipe *os.File - startClosed bool - attachPipe *os.File - attachClosed bool -} - -func (p *execPipes) cleanup() { - if p.syncPipe != nil && !p.syncClosed { - errorhandling.CloseQuiet(p.syncPipe) - p.syncClosed = true - } - if p.startPipe != nil && !p.startClosed { - errorhandling.CloseQuiet(p.startPipe) - p.startClosed = true - } - if p.attachPipe != nil && !p.attachClosed { - errorhandling.CloseQuiet(p.attachPipe) - p.attachClosed = true - } -} - -// Start an exec session's conmon parent from the given options. -func (r *ConmonOCIRuntime) startExec(c *Container, sessionID string, options *ExecOptions, attachStdin bool, ociLog string) (_ *exec.Cmd, _ *execPipes, deferredErr error) { - pipes := new(execPipes) - - if options == nil { - return nil, nil, fmt.Errorf("must provide an ExecOptions struct to ExecContainer: %w", define.ErrInvalidArg) - } - if len(options.Cmd) == 0 { - return nil, nil, fmt.Errorf("must provide a command to execute: %w", define.ErrInvalidArg) - } - - if sessionID == "" { - return nil, nil, fmt.Errorf("must provide a session ID for exec: %w", define.ErrEmptyID) - } - - // create sync pipe to receive the pid - parentSyncPipe, childSyncPipe, err := newPipe() - if err != nil { - return nil, nil, fmt.Errorf("error creating socket pair: %w", err) - } - pipes.syncPipe = parentSyncPipe - - defer func() { - if deferredErr != nil { - pipes.cleanup() - } - }() - - // create start pipe to set the cgroup before running - // attachToExec is responsible for closing parentStartPipe - childStartPipe, parentStartPipe, err := newPipe() - if err != nil { - return nil, nil, fmt.Errorf("error creating socket pair: %w", err) - } - pipes.startPipe = parentStartPipe - - // create the attach pipe to allow attach socket to be created before - // $RUNTIME exec starts running. This is to make sure we can capture all output - // from the process through that socket, rather than half reading the log, half attaching to the socket - // attachToExec is responsible for closing parentAttachPipe - parentAttachPipe, childAttachPipe, err := newPipe() - if err != nil { - return nil, nil, fmt.Errorf("error creating socket pair: %w", err) - } - pipes.attachPipe = parentAttachPipe - - childrenClosed := false - defer func() { - if !childrenClosed { - errorhandling.CloseQuiet(childSyncPipe) - errorhandling.CloseQuiet(childAttachPipe) - errorhandling.CloseQuiet(childStartPipe) - } - }() - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return nil, nil, err - } - - finalEnv := make([]string, 0, len(options.Env)) - for k, v := range options.Env { - finalEnv = append(finalEnv, fmt.Sprintf("%s=%s", k, v)) - } - - processFile, err := prepareProcessExec(c, options, finalEnv, sessionID) - if err != nil { - return nil, nil, err - } - defer processFile.Close() - - args := r.sharedConmonArgs(c, sessionID, c.execBundlePath(sessionID), c.execPidPath(sessionID), c.execLogPath(sessionID), c.execExitFileDir(sessionID), ociLog, define.NoLogging, c.config.LogTag) - - if options.PreserveFDs > 0 { - args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", options.PreserveFDs))...) - } - - if options.Terminal { - args = append(args, "-t") - } - - if attachStdin { - args = append(args, "-i") - } - - // Append container ID and command - args = append(args, "-e") - // TODO make this optional when we can detach - args = append(args, "--exec-attach") - args = append(args, "--exec-process-spec", processFile.Name()) - - if len(options.ExitCommand) > 0 { - args = append(args, "--exit-command", options.ExitCommand[0]) - for _, arg := range options.ExitCommand[1:] { - args = append(args, []string{"--exit-command-arg", arg}...) - } - if options.ExitCommandDelay > 0 { - args = append(args, []string{"--exit-delay", fmt.Sprintf("%d", options.ExitCommandDelay)}...) - } - } - - logrus.WithFields(logrus.Fields{ - "args": args, - }).Debugf("running conmon: %s", r.conmonPath) - execCmd := exec.Command(r.conmonPath, args...) - - // TODO: This is commented because it doesn't make much sense in HTTP - // attach, and I'm not certain it does for non-HTTP attach as well. - // if streams != nil { - // // Don't add the InputStream to the execCmd. Instead, the data should be passed - // // through CopyDetachable - // if streams.AttachOutput { - // execCmd.Stdout = options.Streams.OutputStream - // } - // if streams.AttachError { - // execCmd.Stderr = options.Streams.ErrorStream - // } - // } - - conmonEnv := r.configureConmonEnv(runtimeDir) - - var filesToClose []*os.File - if options.PreserveFDs > 0 { - for fd := 3; fd < int(3+options.PreserveFDs); fd++ { - f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)) - filesToClose = append(filesToClose, f) - execCmd.ExtraFiles = append(execCmd.ExtraFiles, f) - } - } - - // we don't want to step on users fds they asked to preserve - // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 - execCmd.Env = r.conmonEnv - execCmd.Env = append(execCmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", options.PreserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", options.PreserveFDs+4), fmt.Sprintf("_OCI_ATTACHPIPE=%d", options.PreserveFDs+5)) - execCmd.Env = append(execCmd.Env, conmonEnv...) - - execCmd.ExtraFiles = append(execCmd.ExtraFiles, childSyncPipe, childStartPipe, childAttachPipe) - execCmd.Dir = c.execBundlePath(sessionID) - execCmd.SysProcAttr = &syscall.SysProcAttr{ - Setpgid: true, - } - - err = startCommand(execCmd, c) - - // We don't need children pipes on the parent side - errorhandling.CloseQuiet(childSyncPipe) - errorhandling.CloseQuiet(childAttachPipe) - errorhandling.CloseQuiet(childStartPipe) - childrenClosed = true - - if err != nil { - return nil, nil, fmt.Errorf("cannot start container %s: %w", c.ID(), err) - } - if err := r.moveConmonToCgroupAndSignal(c, execCmd, parentStartPipe); err != nil { - return nil, nil, err - } - - // These fds were passed down to the runtime. Close them - // and not interfere - for _, f := range filesToClose { - errorhandling.CloseQuiet(f) - } - - return execCmd, pipes, nil -} - -// Attach to a container over HTTP -func attachExecHTTP(c *Container, sessionID string, r *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, pipes *execPipes, detachKeys []byte, isTerminal bool, cancel <-chan bool, hijackDone chan<- bool, holdConnOpen <-chan bool, execCmd *exec.Cmd, conmonPipeDataChan chan<- conmonPipeData, ociLog string, newSize *resize.TerminalSize, runtimeName string) (deferredErr error) { - // NOTE: As you may notice, the attach code is quite complex. - // Many things happen concurrently and yet are interdependent. - // If you ever change this function, make sure to write to the - // conmonPipeDataChan in case of an error. - - if pipes == nil || pipes.startPipe == nil || pipes.attachPipe == nil { - err := fmt.Errorf("must provide a start and attach pipe to finish an exec attach: %w", define.ErrInvalidArg) - conmonPipeDataChan <- conmonPipeData{-1, err} - return err - } - - defer func() { - if !pipes.startClosed { - errorhandling.CloseQuiet(pipes.startPipe) - pipes.startClosed = true - } - if !pipes.attachClosed { - errorhandling.CloseQuiet(pipes.attachPipe) - pipes.attachClosed = true - } - }() - - logrus.Debugf("Attaching to container %s exec session %s", c.ID(), sessionID) - - // set up the socket path, such that it is the correct length and location for exec - sockPath, err := c.execAttachSocketPath(sessionID) - if err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - return err - } - - // 2: read from attachFd that the parent process has set up the console socket - if _, err := readConmonPipeData(runtimeName, pipes.attachPipe, ""); err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - return err - } - - // resize before we start the container process - if newSize != nil { - err = c.ociRuntime.ExecAttachResize(c, sessionID, *newSize) - if err != nil { - logrus.Warnf("Resize failed: %v", err) - } - } - - // 2: then attach - conn, err := openUnixSocket(sockPath) - if err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - return fmt.Errorf("failed to connect to container's attach socket: %v: %w", sockPath, err) - } - defer func() { - if err := conn.Close(); err != nil { - logrus.Errorf("Unable to close socket: %q", err) - } - }() - - attachStdout := true - attachStderr := true - attachStdin := true - if streams != nil { - attachStdout = streams.Stdout - attachStderr = streams.Stderr - attachStdin = streams.Stdin - } - - // Perform hijack - hijacker, ok := w.(http.Hijacker) - if !ok { - conmonPipeDataChan <- conmonPipeData{-1, err} - return errors.New("unable to hijack connection") - } - - httpCon, httpBuf, err := hijacker.Hijack() - if err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - return fmt.Errorf("error hijacking connection: %w", err) - } - - hijackDone <- true - - // Write a header to let the client know what happened - writeHijackHeader(r, httpBuf) - - // Force a flush after the header is written. - if err := httpBuf.Flush(); err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - return fmt.Errorf("error flushing HTTP hijack header: %w", err) - } - - go func() { - // Wait for conmon to succeed, when return. - if err := execCmd.Wait(); err != nil { - conmonPipeDataChan <- conmonPipeData{-1, err} - } else { - pid, err := readConmonPipeData(runtimeName, pipes.syncPipe, ociLog) - if err != nil { - hijackWriteError(err, c.ID(), isTerminal, httpBuf) - conmonPipeDataChan <- conmonPipeData{pid, err} - } else { - conmonPipeDataChan <- conmonPipeData{pid, err} - } - } - // We need to hold the connection open until the complete exec - // function has finished. This channel will be closed in a defer - // in that function, so we can wait for it here. - // Can't be a defer, because this would block the function from - // returning. - <-holdConnOpen - hijackWriteErrorAndClose(deferredErr, c.ID(), isTerminal, httpCon, httpBuf) - }() - - stdoutChan := make(chan error) - stdinChan := make(chan error) - - // Next, STDIN. Avoid entirely if attachStdin unset. - if attachStdin { - go func() { - logrus.Debugf("Beginning STDIN copy") - _, err := cutil.CopyDetachable(conn, httpBuf, detachKeys) - logrus.Debugf("STDIN copy completed") - stdinChan <- err - }() - } - - // 4: send start message to child - if err := writeConmonPipeData(pipes.startPipe); err != nil { - return err - } - - // Handle STDOUT/STDERR *after* start message is sent - go func() { - var err error - if isTerminal { - // Hack: return immediately if attachStdout not set to - // emulate Docker. - // Basically, when terminal is set, STDERR goes nowhere. - // Everything does over STDOUT. - // Therefore, if not attaching STDOUT - we'll never copy - // anything from here. - logrus.Debugf("Performing terminal HTTP attach for container %s", c.ID()) - if attachStdout { - err = httpAttachTerminalCopy(conn, httpBuf, c.ID()) - } - } else { - logrus.Debugf("Performing non-terminal HTTP attach for container %s", c.ID()) - err = httpAttachNonTerminalCopy(conn, httpBuf, c.ID(), attachStdin, attachStdout, attachStderr) - } - stdoutChan <- err - logrus.Debugf("STDOUT/ERR copy completed") - }() - - for { - select { - case err := <-stdoutChan: - if err != nil { - return err - } - - return nil - case err := <-stdinChan: - if err != nil { - return err - } - // copy stdin is done, close it - if connErr := conn.CloseWrite(); connErr != nil { - logrus.Errorf("Unable to close conn: %v", connErr) - } - case <-cancel: - return nil - } - } -} - -// prepareProcessExec returns the path of the process.json used in runc exec -p -// caller is responsible to close the returned *os.File if needed. -func prepareProcessExec(c *Container, options *ExecOptions, env []string, sessionID string) (*os.File, error) { - f, err := ioutil.TempFile(c.execBundlePath(sessionID), "exec-process-") - if err != nil { - return nil, err - } - pspec := new(spec.Process) - if err := JSONDeepCopy(c.config.Spec.Process, pspec); err != nil { - return nil, err - } - pspec.SelinuxLabel = c.config.ProcessLabel - pspec.Args = options.Cmd - - // We need to default this to false else it will inherit terminal as true - // from the container. - pspec.Terminal = false - if options.Terminal { - pspec.Terminal = true - } - if len(env) > 0 { - pspec.Env = append(pspec.Env, env...) - } - - // Add secret envs if they exist - manager, err := c.runtime.SecretsManager() - if err != nil { - return nil, err - } - for name, secr := range c.config.EnvSecrets { - _, data, err := manager.LookupSecretData(secr.Name) - if err != nil { - return nil, err - } - pspec.Env = append(pspec.Env, fmt.Sprintf("%s=%s", name, string(data))) - } - - if options.Cwd != "" { - pspec.Cwd = options.Cwd - } - - var addGroups []string - var sgids []uint32 - - // if the user is empty, we should inherit the user that the container is currently running with - user := options.User - if user == "" { - logrus.Debugf("Set user to %s", c.config.User) - user = c.config.User - addGroups = c.config.Groups - } - - overrides := c.getUserOverrides() - execUser, err := lookup.GetUserGroupInfo(c.state.Mountpoint, user, overrides) - if err != nil { - return nil, err - } - - if len(addGroups) > 0 { - sgids, err = lookup.GetContainerGroups(addGroups, c.state.Mountpoint, overrides) - if err != nil { - return nil, fmt.Errorf("error looking up supplemental groups for container %s exec session %s: %w", c.ID(), sessionID, err) - } - } - - // If user was set, look it up in the container to get a UID to use on - // the host - if user != "" || len(sgids) > 0 { - if user != "" { - for _, sgid := range execUser.Sgids { - sgids = append(sgids, uint32(sgid)) - } - } - processUser := spec.User{ - UID: uint32(execUser.Uid), - GID: uint32(execUser.Gid), - AdditionalGids: sgids, - } - - pspec.User = processUser - } - +func (c *Container) setProcessCapabilitiesExec(options *ExecOptions, user string, execUser *user.ExecUser, pspec *spec.Process) error { ctrSpec, err := c.specFromState() if err != nil { - return nil, err + return err } allCaps, err := capabilities.BoundingSet() if err != nil { - return nil, err + return err } if options.Privileged { pspec.Capabilities.Bounding = allCaps @@ -773,25 +35,5 @@ func prepareProcessExec(c *Container, options *ExecOptions, env []string, sessio pspec.Capabilities.Permitted = ctrSpec.Process.Capabilities.Effective pspec.Capabilities.Ambient = ctrSpec.Process.Capabilities.Effective } - - hasHomeSet := false - for _, s := range pspec.Env { - if strings.HasPrefix(s, "HOME=") { - hasHomeSet = true - break - } - } - if !hasHomeSet { - pspec.Env = append(pspec.Env, fmt.Sprintf("HOME=%s", execUser.Home)) - } - - processJSON, err := json.Marshal(pspec) - if err != nil { - return nil, err - } - - if err := ioutil.WriteFile(f.Name(), processJSON, 0644); err != nil { - return nil, err - } - return f, nil + return nil } diff --git a/libpod/oci_conmon_freebsd.go b/libpod/oci_conmon_freebsd.go new file mode 100644 index 000000000..d74f2af01 --- /dev/null +++ b/libpod/oci_conmon_freebsd.go @@ -0,0 +1,27 @@ +package libpod + +import ( + "errors" + "os" + "os/exec" +) + +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + return -1, errors.New("unsupported (*ConmonOCIRuntime) createRootlessContainer") +} + +// Run the closure with the container's socket label set +func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func() error) error { + // No label support yet + return closure() +} + +// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup +// it then signals for conmon to start by sending nonce data down the start fd +func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error { + // No equivalent to cgroup on FreeBSD, just signal conmon to start + if err := writeConmonPipeData(startFd); err != nil { + return err + } + return nil +} diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index cb76de72c..0964d4ea3 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -1,46 +1,21 @@ -//go:build linux -// +build linux - package libpod import ( - "bufio" - "bytes" - "context" - "errors" "fmt" - "io" - "io/ioutil" - "net" - "net/http" "os" "os/exec" "path/filepath" "runtime" - "strconv" "strings" - "sync" - "syscall" - "text/template" - "time" runcconfig "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/config" - "github.com/containers/common/pkg/resize" - cutil "github.com/containers/common/pkg/util" - conmonConfig "github.com/containers/conmon/runner/config" - "github.com/containers/podman/v4/libpod/define" - "github.com/containers/podman/v4/libpod/logs" - "github.com/containers/podman/v4/pkg/checkpoint/crutils" "github.com/containers/podman/v4/pkg/errorhandling" "github.com/containers/podman/v4/pkg/rootless" - "github.com/containers/podman/v4/pkg/specgenutil" - "github.com/containers/podman/v4/pkg/util" "github.com/containers/podman/v4/utils" - "github.com/containers/storage/pkg/homedir" pmount "github.com/containers/storage/pkg/mount" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" @@ -48,782 +23,70 @@ import ( "golang.org/x/sys/unix" ) -const ( - // This is Conmon's STDIO_BUF_SIZE. I don't believe we have access to it - // directly from the Go code, so const it here - // Important: The conmon attach socket uses an extra byte at the beginning of each - // message to specify the STREAM so we have to increase the buffer size by one - bufferSize = conmonConfig.BufSize + 1 -) - -// ConmonOCIRuntime is an OCI runtime managed by Conmon. -// TODO: Make all calls to OCI runtime have a timeout. -type ConmonOCIRuntime struct { - name string - path string - conmonPath string - conmonEnv []string - tmpDir string - exitsDir string - logSizeMax int64 - noPivot bool - reservePorts bool - runtimeFlags []string - supportsJSON bool - supportsKVM bool - supportsNoCgroups bool - enableKeyring bool -} - -// Make a new Conmon-based OCI runtime with the given options. -// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or -// any runtime with a runc-compatible CLI. -// The first path that points to a valid executable will be used. -// Deliberately private. Someone should not be able to construct this outside of -// libpod. -func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeFlags []string, runtimeCfg *config.Config) (OCIRuntime, error) { - if name == "" { - return nil, fmt.Errorf("the OCI runtime must be provided a non-empty name: %w", define.ErrInvalidArg) - } - - // Make lookup tables for runtime support - supportsJSON := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsJSON)) - supportsNoCgroups := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsNoCgroups)) - supportsKVM := make(map[string]bool, len(runtimeCfg.Engine.RuntimeSupportsKVM)) - for _, r := range runtimeCfg.Engine.RuntimeSupportsJSON { - supportsJSON[r] = true - } - for _, r := range runtimeCfg.Engine.RuntimeSupportsNoCgroups { - supportsNoCgroups[r] = true - } - for _, r := range runtimeCfg.Engine.RuntimeSupportsKVM { - supportsKVM[r] = true - } - - runtime := new(ConmonOCIRuntime) - runtime.name = name - runtime.conmonPath = conmonPath - runtime.runtimeFlags = runtimeFlags - - runtime.conmonEnv = runtimeCfg.Engine.ConmonEnvVars - runtime.tmpDir = runtimeCfg.Engine.TmpDir - runtime.logSizeMax = runtimeCfg.Containers.LogSizeMax - runtime.noPivot = runtimeCfg.Engine.NoPivotRoot - runtime.reservePorts = runtimeCfg.Engine.EnablePortReservation - runtime.enableKeyring = runtimeCfg.Containers.EnableKeyring - - // TODO: probe OCI runtime for feature and enable automatically if - // available. - - base := filepath.Base(name) - runtime.supportsJSON = supportsJSON[base] - runtime.supportsNoCgroups = supportsNoCgroups[base] - runtime.supportsKVM = supportsKVM[base] - - foundPath := false - for _, path := range paths { - stat, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - continue - } - return nil, fmt.Errorf("cannot stat OCI runtime %s path: %w", name, err) - } - if !stat.Mode().IsRegular() { - continue - } - foundPath = true - logrus.Tracef("found runtime %q", path) - runtime.path = path - break - } - - // Search the $PATH as last fallback - if !foundPath { - if foundRuntime, err := exec.LookPath(name); err == nil { - foundPath = true - runtime.path = foundRuntime - logrus.Debugf("using runtime %q from $PATH: %q", name, foundRuntime) - } - } - - if !foundPath { - return nil, fmt.Errorf("no valid executable found for OCI runtime %s: %w", name, define.ErrInvalidArg) - } - - runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits") - - // Create the exit files and attach sockets directories - if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil { - // The directory is allowed to exist - if !os.IsExist(err) { - return nil, fmt.Errorf("error creating OCI runtime exit files directory: %w", err) - } +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { + type result struct { + restoreDuration int64 + err error } - return runtime, nil -} - -// Name returns the name of the runtime being wrapped by Conmon. -func (r *ConmonOCIRuntime) Name() string { - return r.name -} - -// Path returns the path of the OCI runtime being wrapped by Conmon. -func (r *ConmonOCIRuntime) Path() string { - return r.path -} - -// hasCurrentUserMapped checks whether the current user is mapped inside the container user namespace -func hasCurrentUserMapped(ctr *Container) bool { - if len(ctr.config.IDMappings.UIDMap) == 0 && len(ctr.config.IDMappings.GIDMap) == 0 { - return true - } - uid := os.Geteuid() - for _, m := range ctr.config.IDMappings.UIDMap { - if uid >= m.HostID && uid < m.HostID+m.Size { - return true - } - } - return false -} - -// CreateContainer creates a container. -func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { - // always make the run dir accessible to the current user so that the PID files can be read without - // being in the rootless user namespace. - if err := makeAccessible(ctr.state.RunDir, 0, 0); err != nil { - return 0, err - } - if !hasCurrentUserMapped(ctr) { - for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.Engine.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.Engine.VolumePath} { - if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { + ch := make(chan result) + go func() { + runtime.LockOSThread() + restoreDuration, err := func() (int64, error) { + fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) + if err != nil { return 0, err } - } + defer errorhandling.CloseQuiet(fd) - // if we are running a non privileged container, be sure to umount some kernel paths so they are not - // bind mounted inside the container at all. - if !ctr.config.Privileged && !rootless.IsRootless() { - type result struct { - restoreDuration int64 - err error + // create a new mountns on the current thread + if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { + return 0, err } - ch := make(chan result) - go func() { - runtime.LockOSThread() - restoreDuration, err := func() (int64, error) { - fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid())) - if err != nil { - return 0, err - } - defer errorhandling.CloseQuiet(fd) - - // create a new mountns on the current thread - if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { - return 0, err - } - defer func() { - if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { - logrus.Errorf("Unable to clone new namespace: %q", err) - } - }() - - // don't spread our mounts around. We are setting only /sys to be slave - // so that the cleanup process is still able to umount the storage and the - // changes are propagated to the host. - err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") - if err != nil { - return 0, fmt.Errorf("cannot make /sys slave: %w", err) - } - - mounts, err := pmount.GetMounts() - if err != nil { - return 0, err - } - for _, m := range mounts { - if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { - continue - } - err = unix.Unmount(m.Mountpoint, 0) - if err != nil && !os.IsNotExist(err) { - return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) - } - } - return r.createOCIContainer(ctr, restoreOptions) - }() - ch <- result{ - restoreDuration: restoreDuration, - err: err, + defer func() { + if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { + logrus.Errorf("Unable to clone new namespace: %q", err) } }() - r := <-ch - return r.restoreDuration, r.err - } - } - return r.createOCIContainer(ctr, restoreOptions) -} - -// UpdateContainerStatus retrieves the current status of the container from the -// runtime. It updates the container's state but does not save it. -// If useRuntime is false, we will not directly hit runc to see the container's -// status, but will instead only check for the existence of the conmon exit file -// and update state to stopped if it exists. -func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - - // Store old state so we know if we were already stopped - oldState := ctr.state.State - - state := new(spec.State) - - cmd := exec.Command(r.path, "state", ctr.ID()) - cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) - - outPipe, err := cmd.StdoutPipe() - if err != nil { - return fmt.Errorf("getting stdout pipe: %w", err) - } - errPipe, err := cmd.StderrPipe() - if err != nil { - return fmt.Errorf("getting stderr pipe: %w", err) - } - - if err := cmd.Start(); err != nil { - out, err2 := ioutil.ReadAll(errPipe) - if err2 != nil { - return fmt.Errorf("error getting container %s state: %w", ctr.ID(), err) - } - if strings.Contains(string(out), "does not exist") || strings.Contains(string(out), "No such file") { - if err := ctr.removeConmonFiles(); err != nil { - logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) - } - ctr.state.ExitCode = -1 - ctr.state.FinishedTime = time.Now() - ctr.state.State = define.ContainerStateExited - return ctr.runtime.state.AddContainerExitCode(ctr.ID(), ctr.state.ExitCode) - } - return fmt.Errorf("error getting container %s state. stderr/out: %s: %w", ctr.ID(), out, err) - } - defer func() { - _ = cmd.Wait() - }() - - if err := errPipe.Close(); err != nil { - return err - } - out, err := ioutil.ReadAll(outPipe) - if err != nil { - return fmt.Errorf("error reading stdout: %s: %w", ctr.ID(), err) - } - if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil { - return fmt.Errorf("error decoding container status for container %s: %w", ctr.ID(), err) - } - ctr.state.PID = state.Pid - - switch state.Status { - case "created": - ctr.state.State = define.ContainerStateCreated - case "paused": - ctr.state.State = define.ContainerStatePaused - case "running": - ctr.state.State = define.ContainerStateRunning - case "stopped": - ctr.state.State = define.ContainerStateStopped - default: - return fmt.Errorf("unrecognized status returned by runtime for container %s: %s: %w", - ctr.ID(), state.Status, define.ErrInternal) - } - - // Only grab exit status if we were not already stopped - // If we were, it should already be in the database - if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { - if _, err := ctr.Wait(context.Background()); err != nil { - logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err) - } - return nil - } - - // Handle ContainerStateStopping - keep it unless the container - // transitioned to no longer running. - if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) { - ctr.state.State = define.ContainerStateStopping - } - return nil -} - -// StartContainer starts the given container. -// Sets time the container was started, but does not save it. -func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { - // TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers? - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if path, ok := os.LookupEnv("PATH"); ok { - env = append(env, fmt.Sprintf("PATH=%s", path)) - } - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "start", ctr.ID())...); err != nil { - return err - } - - ctr.state.StartedTime = time.Now() - - return nil -} - -// KillContainer sends the given signal to the given container. -// If all is set, send to all PIDs in the container. -// All is only supported if the container created cgroups. -func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error { - logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - var args []string - args = append(args, r.runtimeFlags...) - if all { - args = append(args, "kill", "--all", ctr.ID(), fmt.Sprintf("%d", signal)) - } else { - args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal)) - } - if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil { - // Update container state - there's a chance we failed because - // the container exited in the meantime. - if err2 := r.UpdateContainerStatus(ctr); err2 != nil { - logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2) - } - if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { - return define.ErrCtrStateInvalid - } - return fmt.Errorf("error sending signal to container %s: %w", ctr.ID(), err) - } - - return nil -} - -// StopContainer stops a container, first using its given stop signal (or -// SIGTERM if no signal was specified), then using SIGKILL. -// Timeout is given in seconds. If timeout is 0, the container will be -// immediately kill with SIGKILL. -// Does not set finished time for container, assumes you will run updateStatus -// after to pull the exit code. -func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error { - logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) - - // Ping the container to see if it's alive - // If it's not, it's already stopped, return - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - stopSignal := ctr.config.StopSignal - if stopSignal == 0 { - stopSignal = uint(syscall.SIGTERM) - } - - if timeout > 0 { - if err := r.KillContainer(ctr, stopSignal, all); err != nil { - // Is the container gone? - // If so, it probably died between the first check and - // our sending the signal - // The container is stopped, so exit cleanly - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil + // don't spread our mounts around. We are setting only /sys to be slave + // so that the cleanup process is still able to umount the storage and the + // changes are propagated to the host. + err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") + if err != nil { + return 0, fmt.Errorf("cannot make /sys slave: %w", err) } - return err - } - - if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil { - logrus.Debugf("Timed out stopping container %s with %s, resorting to SIGKILL: %v", ctr.ID(), unix.SignalName(syscall.Signal(stopSignal)), err) - logrus.Warnf("StopSignal %s failed to stop container %s in %d seconds, resorting to SIGKILL", unix.SignalName(syscall.Signal(stopSignal)), ctr.Name(), timeout) - } else { - // No error, the container is dead - return nil - } - } - - if err := r.KillContainer(ctr, 9, all); err != nil { - // Again, check if the container is gone. If it is, exit cleanly. - err := unix.Kill(ctr.state.PID, 0) - if err == unix.ESRCH { - return nil - } - - return fmt.Errorf("error sending SIGKILL to container %s: %w", ctr.ID(), err) - } - - // Give runtime a few seconds to make it happen - if err := waitContainerStop(ctr, killContainerTimeout); err != nil { - return err - } - - return nil -} - -// DeleteContainer deletes a container from the OCI runtime. -func (r *ConmonOCIRuntime) DeleteContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "delete", "--force", ctr.ID())...) -} - -// PauseContainer pauses the given container. -func (r *ConmonOCIRuntime) PauseContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "pause", ctr.ID())...) -} - -// UnpauseContainer unpauses the given container. -func (r *ConmonOCIRuntime) UnpauseContainer(ctr *Container) error { - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, append(r.runtimeFlags, "resume", ctr.ID())...) -} - -// HTTPAttach performs an attach for the HTTP API. -// The caller must handle closing the HTTP connection after this returns. -// The cancel channel is not closed; it is up to the caller to do so after -// this function returns. -// If this is a container with a terminal, we will stream raw. If it is not, we -// will stream with an 8-byte header to multiplex STDOUT and STDERR. -// Returns any errors that occurred, and whether the connection was successfully -// hijacked before that error occurred. -func (r *ConmonOCIRuntime) HTTPAttach(ctr *Container, req *http.Request, w http.ResponseWriter, streams *HTTPAttachStreams, detachKeys *string, cancel <-chan bool, hijackDone chan<- bool, streamAttach, streamLogs bool) (deferredErr error) { - isTerminal := false - if ctr.config.Spec.Process != nil { - isTerminal = ctr.config.Spec.Process.Terminal - } - - if streams != nil { - if !streams.Stdin && !streams.Stdout && !streams.Stderr { - return fmt.Errorf("must specify at least one stream to attach to: %w", define.ErrInvalidArg) - } - } - - attachSock, err := r.AttachSocketPath(ctr) - if err != nil { - return err - } - - var conn *net.UnixConn - if streamAttach { - newConn, err := openUnixSocket(attachSock) - if err != nil { - return fmt.Errorf("failed to connect to container's attach socket: %v: %w", attachSock, err) - } - conn = newConn - defer func() { - if err := conn.Close(); err != nil { - logrus.Errorf("Unable to close container %s attach socket: %q", ctr.ID(), err) + mounts, err := pmount.GetMounts() + if err != nil { + return 0, err } - }() - - logrus.Debugf("Successfully connected to container %s attach socket %s", ctr.ID(), attachSock) - } - - detachString := ctr.runtime.config.Engine.DetachKeys - if detachKeys != nil { - detachString = *detachKeys - } - detach, err := processDetachKeys(detachString) - if err != nil { - return err - } - - attachStdout := true - attachStderr := true - attachStdin := true - if streams != nil { - attachStdout = streams.Stdout - attachStderr = streams.Stderr - attachStdin = streams.Stdin - } - - logrus.Debugf("Going to hijack container %s attach connection", ctr.ID()) - - // Alright, let's hijack. - hijacker, ok := w.(http.Hijacker) - if !ok { - return fmt.Errorf("unable to hijack connection") - } - - httpCon, httpBuf, err := hijacker.Hijack() - if err != nil { - return fmt.Errorf("error hijacking connection: %w", err) - } - - hijackDone <- true - - writeHijackHeader(req, httpBuf) - - // Force a flush after the header is written. - if err := httpBuf.Flush(); err != nil { - return fmt.Errorf("error flushing HTTP hijack header: %w", err) - } - - defer func() { - hijackWriteErrorAndClose(deferredErr, ctr.ID(), isTerminal, httpCon, httpBuf) - }() - - logrus.Debugf("Hijack for container %s attach session done, ready to stream", ctr.ID()) - - // TODO: This is gross. Really, really gross. - // I want to say we should read all the logs into an array before - // calling this, in container_api.go, but that could take a lot of - // memory... - // On the whole, we need to figure out a better way of doing this, - // though. - logSize := 0 - if streamLogs { - logrus.Debugf("Will stream logs for container %s attach session", ctr.ID()) - - // Get all logs for the container - logChan := make(chan *logs.LogLine) - logOpts := new(logs.LogOptions) - logOpts.Tail = -1 - logOpts.WaitGroup = new(sync.WaitGroup) - errChan := make(chan error) - go func() { - var err error - // In non-terminal mode we need to prepend with the - // stream header. - logrus.Debugf("Writing logs for container %s to HTTP attach", ctr.ID()) - for logLine := range logChan { - if !isTerminal { - device := logLine.Device - var header []byte - headerLen := uint32(len(logLine.Msg)) - logSize += len(logLine.Msg) - switch strings.ToLower(device) { - case "stdin": - header = makeHTTPAttachHeader(0, headerLen) - case "stdout": - header = makeHTTPAttachHeader(1, headerLen) - case "stderr": - header = makeHTTPAttachHeader(2, headerLen) - default: - logrus.Errorf("Unknown device for log line: %s", device) - header = makeHTTPAttachHeader(1, headerLen) - } - _, err = httpBuf.Write(header) - if err != nil { - break - } - } - _, err = httpBuf.Write([]byte(logLine.Msg)) - if err != nil { - break - } - if !logLine.Partial() { - _, err = httpBuf.Write([]byte("\n")) - if err != nil { - break - } + for _, m := range mounts { + if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { + continue } - err = httpBuf.Flush() - if err != nil { - break + err = unix.Unmount(m.Mountpoint, 0) + if err != nil && !os.IsNotExist(err) { + return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) } } - errChan <- err - }() - if err := ctr.ReadLog(context.Background(), logOpts, logChan, 0); err != nil { - return err - } - go func() { - logOpts.WaitGroup.Wait() - close(logChan) + return r.createOCIContainer(ctr, restoreOptions) }() - logrus.Debugf("Done reading logs for container %s, %d bytes", ctr.ID(), logSize) - if err := <-errChan; err != nil { - return err + ch <- result{ + restoreDuration: restoreDuration, + err: err, } - } - if !streamAttach { - logrus.Debugf("Done streaming logs for container %s attach, exiting as attach streaming not requested", ctr.ID()) - return nil - } - - logrus.Debugf("Forwarding attach output for container %s", ctr.ID()) - - stdoutChan := make(chan error) - stdinChan := make(chan error) - - // Handle STDOUT/STDERR - go func() { - var err error - if isTerminal { - // Hack: return immediately if attachStdout not set to - // emulate Docker. - // Basically, when terminal is set, STDERR goes nowhere. - // Everything does over STDOUT. - // Therefore, if not attaching STDOUT - we'll never copy - // anything from here. - logrus.Debugf("Performing terminal HTTP attach for container %s", ctr.ID()) - if attachStdout { - err = httpAttachTerminalCopy(conn, httpBuf, ctr.ID()) - } - } else { - logrus.Debugf("Performing non-terminal HTTP attach for container %s", ctr.ID()) - err = httpAttachNonTerminalCopy(conn, httpBuf, ctr.ID(), attachStdin, attachStdout, attachStderr) - } - stdoutChan <- err - logrus.Debugf("STDOUT/ERR copy completed") }() - // Next, STDIN. Avoid entirely if attachStdin unset. - if attachStdin { - go func() { - _, err := cutil.CopyDetachable(conn, httpBuf, detach) - logrus.Debugf("STDIN copy completed") - stdinChan <- err - }() - } - - for { - select { - case err := <-stdoutChan: - if err != nil { - return err - } - - return nil - case err := <-stdinChan: - if err != nil { - return err - } - // copy stdin is done, close it - if connErr := conn.CloseWrite(); connErr != nil { - logrus.Errorf("Unable to close conn: %v", connErr) - } - case <-cancel: - return nil - } - } -} - -// isRetryable returns whether the error was caused by a blocked syscall or the -// specified operation on a non blocking file descriptor wasn't ready for completion. -func isRetryable(err error) bool { - var errno syscall.Errno - if errors.As(err, &errno) { - return errno == syscall.EINTR || errno == syscall.EAGAIN - } - return false -} - -// openControlFile opens the terminal control file. -func openControlFile(ctr *Container, parentDir string) (*os.File, error) { - controlPath := filepath.Join(parentDir, "ctl") - for i := 0; i < 600; i++ { - controlFile, err := os.OpenFile(controlPath, unix.O_WRONLY|unix.O_NONBLOCK, 0) - if err == nil { - return controlFile, nil - } - if !isRetryable(err) { - return nil, fmt.Errorf("could not open ctl file for terminal resize for container %s: %w", ctr.ID(), err) - } - time.Sleep(time.Second / 10) - } - return nil, fmt.Errorf("timeout waiting for %q", controlPath) + res := <-ch + return res.restoreDuration, res.err } -// AttachResize resizes the terminal used by the given container. -func (r *ConmonOCIRuntime) AttachResize(ctr *Container, newSize resize.TerminalSize) error { - controlFile, err := openControlFile(ctr, ctr.bundlePath()) - if err != nil { - return err - } - defer controlFile.Close() - - logrus.Debugf("Received a resize event for container %s: %+v", ctr.ID(), newSize) - if _, err = fmt.Fprintf(controlFile, "%d %d %d\n", 1, newSize.Height, newSize.Width); err != nil { - return fmt.Errorf("failed to write to ctl file to resize terminal: %w", err) - } - - return nil -} - -// CheckpointContainer checkpoints the given container. -func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) (int64, error) { - // imagePath is used by CRIU to store the actual checkpoint files - imagePath := ctr.CheckpointPath() - if options.PreCheckPoint { - imagePath = ctr.PreCheckPointPath() - } - // workPath will be used to store dump.log and stats-dump - workPath := ctr.bundlePath() - logrus.Debugf("Writing checkpoint to %s", imagePath) - logrus.Debugf("Writing checkpoint logs to %s", workPath) - logrus.Debugf("Pre-dump the container %t", options.PreCheckPoint) - args := []string{} - args = append(args, r.runtimeFlags...) - args = append(args, "checkpoint") - args = append(args, "--image-path") - args = append(args, imagePath) - args = append(args, "--work-path") - args = append(args, workPath) - if options.KeepRunning { - args = append(args, "--leave-running") - } - if options.TCPEstablished { - args = append(args, "--tcp-established") - } - if options.FileLocks { - args = append(args, "--file-locks") - } - if !options.PreCheckPoint && options.KeepRunning { - args = append(args, "--leave-running") - } - if options.PreCheckPoint { - args = append(args, "--pre-dump") - } - if !options.PreCheckPoint && options.WithPrevious { - args = append( - args, - "--parent-path", - filepath.Join("..", preCheckpointDir), - ) - } - - args = append(args, ctr.ID()) - logrus.Debugf("the args to checkpoint: %s %s", r.path, strings.Join(args, " ")) - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return 0, err - } - env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} - if path, ok := os.LookupEnv("PATH"); ok { - env = append(env, fmt.Sprintf("PATH=%s", path)) - } - +// Run the closure with the container's socket label set +func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func() error) error { runtime.LockOSThread() if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil { - return 0, err + return err } - - runtimeCheckpointStarted := time.Now() - err = utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...) + err := closure() // Ignore error returned from SetSocketLabel("") call, // can't recover. if labelErr := label.SetSocketLabel(""); labelErr == nil { @@ -834,577 +97,7 @@ func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options Container } else { logrus.Errorf("Unable to reset socket label: %q", labelErr) } - - runtimeCheckpointDuration := func() int64 { - if options.PrintStats { - return time.Since(runtimeCheckpointStarted).Microseconds() - } - return 0 - }() - - return runtimeCheckpointDuration, err -} - -func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) { - if ctr.state.ConmonPID == 0 { - // If the container is running or paused, assume Conmon is - // running. We didn't record Conmon PID on some old versions, so - // that is likely what's going on... - // Unusual enough that we should print a warning message though. - if ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) { - logrus.Warnf("Conmon PID is not set, but container is running!") - return true, nil - } - // Container's not running, so conmon PID being unset is - // expected. Conmon is not running. - return false, nil - } - - // We have a conmon PID. Ping it with signal 0. - if err := unix.Kill(ctr.state.ConmonPID, 0); err != nil { - if err == unix.ESRCH { - return false, nil - } - return false, fmt.Errorf("error pinging container %s conmon with signal 0: %w", ctr.ID(), err) - } - return true, nil -} - -// SupportsCheckpoint checks if the OCI runtime supports checkpointing -// containers. -func (r *ConmonOCIRuntime) SupportsCheckpoint() bool { - return crutils.CRRuntimeSupportsCheckpointRestore(r.path) -} - -// SupportsJSONErrors checks if the OCI runtime supports JSON-formatted error -// messages. -func (r *ConmonOCIRuntime) SupportsJSONErrors() bool { - return r.supportsJSON -} - -// SupportsNoCgroups checks if the OCI runtime supports running containers -// without cgroups (the --cgroup-manager=disabled flag). -func (r *ConmonOCIRuntime) SupportsNoCgroups() bool { - return r.supportsNoCgroups -} - -// SupportsKVM checks if the OCI runtime supports running containers -// without KVM separation -func (r *ConmonOCIRuntime) SupportsKVM() bool { - return r.supportsKVM -} - -// AttachSocketPath is the path to a single container's attach socket. -func (r *ConmonOCIRuntime) AttachSocketPath(ctr *Container) (string, error) { - if ctr == nil { - return "", fmt.Errorf("must provide a valid container to get attach socket path: %w", define.ErrInvalidArg) - } - - return filepath.Join(ctr.bundlePath(), "attach"), nil -} - -// ExitFilePath is the path to a container's exit file. -func (r *ConmonOCIRuntime) ExitFilePath(ctr *Container) (string, error) { - if ctr == nil { - return "", fmt.Errorf("must provide a valid container to get exit file path: %w", define.ErrInvalidArg) - } - return filepath.Join(r.exitsDir, ctr.ID()), nil -} - -// RuntimeInfo provides information on the runtime. -func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntimeInfo, error) { - runtimePackage := packageVersion(r.path) - conmonPackage := packageVersion(r.conmonPath) - runtimeVersion, err := r.getOCIRuntimeVersion() - if err != nil { - return nil, nil, fmt.Errorf("error getting version of OCI runtime %s: %w", r.name, err) - } - conmonVersion, err := r.getConmonVersion() - if err != nil { - return nil, nil, fmt.Errorf("error getting conmon version: %w", err) - } - - conmon := define.ConmonInfo{ - Package: conmonPackage, - Path: r.conmonPath, - Version: conmonVersion, - } - ocirt := define.OCIRuntimeInfo{ - Name: r.name, - Path: r.path, - Package: runtimePackage, - Version: runtimeVersion, - } - return &conmon, &ocirt, nil -} - -// makeAccessible changes the path permission and each parent directory to have --x--x--x -func makeAccessible(path string, uid, gid int) error { - for ; path != "/"; path = filepath.Dir(path) { - st, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - return nil - } - return err - } - if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { - continue - } - if st.Mode()&0111 != 0111 { - if err := os.Chmod(path, st.Mode()|0111); err != nil { - return err - } - } - } - return nil -} - -// Wait for a container which has been sent a signal to stop -func waitContainerStop(ctr *Container, timeout time.Duration) error { - return waitPidStop(ctr.state.PID, timeout) -} - -// Wait for a given PID to stop -func waitPidStop(pid int, timeout time.Duration) error { - done := make(chan struct{}) - chControl := make(chan struct{}) - go func() { - for { - select { - case <-chControl: - return - default: - if err := unix.Kill(pid, 0); err != nil { - if err == unix.ESRCH { - close(done) - return - } - logrus.Errorf("Pinging PID %d with signal 0: %v", pid, err) - } - time.Sleep(100 * time.Millisecond) - } - } - }() - select { - case <-done: - return nil - case <-time.After(timeout): - close(chControl) - return fmt.Errorf("given PIDs did not die within timeout") - } -} - -func (r *ConmonOCIRuntime) getLogTag(ctr *Container) (string, error) { - logTag := ctr.LogTag() - if logTag == "" { - return "", nil - } - data, err := ctr.inspectLocked(false) - if err != nil { - // FIXME: this error should probably be returned - return "", nil //nolint: nilerr - } - tmpl, err := template.New("container").Parse(logTag) - if err != nil { - return "", fmt.Errorf("template parsing error %s: %w", logTag, err) - } - var b bytes.Buffer - err = tmpl.Execute(&b, data) - if err != nil { - return "", err - } - return b.String(), nil -} - -// createOCIContainer generates this container's main conmon instance and prepares it for starting -func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { - var stderrBuf bytes.Buffer - - runtimeDir, err := util.GetRuntimeDir() - if err != nil { - return 0, err - } - - parentSyncPipe, childSyncPipe, err := newPipe() - if err != nil { - return 0, fmt.Errorf("error creating socket pair: %w", err) - } - defer errorhandling.CloseQuiet(parentSyncPipe) - - childStartPipe, parentStartPipe, err := newPipe() - if err != nil { - return 0, fmt.Errorf("error creating socket pair for start pipe: %w", err) - } - - defer errorhandling.CloseQuiet(parentStartPipe) - - var ociLog string - if logrus.GetLevel() != logrus.DebugLevel && r.supportsJSON { - ociLog = filepath.Join(ctr.state.RunDir, "oci-log") - } - - logTag, err := r.getLogTag(ctr) - if err != nil { - return 0, err - } - - if ctr.config.CgroupsMode == cgroupSplit { - if err := utils.MoveUnderCgroupSubtree("runtime"); err != nil { - return 0, err - } - } - - pidfile := ctr.config.PidFile - if pidfile == "" { - pidfile = filepath.Join(ctr.state.RunDir, "pidfile") - } - - args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) - - if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.notifySocket != "" { - args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.notifySocket)) - } - - if ctr.config.Spec.Process.Terminal { - args = append(args, "-t") - } else if ctr.config.Stdin { - args = append(args, "-i") - } - - if ctr.config.Timeout > 0 { - args = append(args, fmt.Sprintf("--timeout=%d", ctr.config.Timeout)) - } - - if !r.enableKeyring { - args = append(args, "--no-new-keyring") - } - if ctr.config.ConmonPidFile != "" { - args = append(args, "--conmon-pidfile", ctr.config.ConmonPidFile) - } - - if r.noPivot { - args = append(args, "--no-pivot") - } - - exitCommand, err := specgenutil.CreateExitCommandArgs(ctr.runtime.storageConfig, ctr.runtime.config, logrus.IsLevelEnabled(logrus.DebugLevel), ctr.AutoRemove(), false) - if err != nil { - return 0, err - } - exitCommand = append(exitCommand, ctr.config.ID) - - args = append(args, "--exit-command", exitCommand[0]) - for _, arg := range exitCommand[1:] { - args = append(args, []string{"--exit-command-arg", arg}...) - } - - // Pass down the LISTEN_* environment (see #10443). - preserveFDs := ctr.config.PreserveFDs - if val := os.Getenv("LISTEN_FDS"); val != "" { - if ctr.config.PreserveFDs > 0 { - logrus.Warnf("Ignoring LISTEN_FDS to preserve custom user-specified FDs") - } else { - fds, err := strconv.Atoi(val) - if err != nil { - return 0, fmt.Errorf("converting LISTEN_FDS=%s: %w", val, err) - } - preserveFDs = uint(fds) - } - } - - if preserveFDs > 0 { - args = append(args, formatRuntimeOpts("--preserve-fds", fmt.Sprintf("%d", preserveFDs))...) - } - - if restoreOptions != nil { - args = append(args, "--restore", ctr.CheckpointPath()) - if restoreOptions.TCPEstablished { - args = append(args, "--runtime-opt", "--tcp-established") - } - if restoreOptions.FileLocks { - args = append(args, "--runtime-opt", "--file-locks") - } - if restoreOptions.Pod != "" { - mountLabel := ctr.config.MountLabel - processLabel := ctr.config.ProcessLabel - if mountLabel != "" { - args = append( - args, - "--runtime-opt", - fmt.Sprintf( - "--lsm-mount-context=%s", - mountLabel, - ), - ) - } - if processLabel != "" { - args = append( - args, - "--runtime-opt", - fmt.Sprintf( - "--lsm-profile=selinux:%s", - processLabel, - ), - ) - } - } - } - - logrus.WithFields(logrus.Fields{ - "args": args, - }).Debugf("running conmon: %s", r.conmonPath) - - cmd := exec.Command(r.conmonPath, args...) - cmd.SysProcAttr = &syscall.SysProcAttr{ - Setpgid: true, - } - // TODO this is probably a really bad idea for some uses - // Make this configurable - cmd.Stdin = os.Stdin - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if ctr.config.Spec.Process.Terminal { - cmd.Stderr = &stderrBuf - } - - // 0, 1 and 2 are stdin, stdout and stderr - conmonEnv := r.configureConmonEnv(runtimeDir) - - var filesToClose []*os.File - if preserveFDs > 0 { - for fd := 3; fd < int(3+preserveFDs); fd++ { - f := os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)) - filesToClose = append(filesToClose, f) - cmd.ExtraFiles = append(cmd.ExtraFiles, f) - } - } - - cmd.Env = r.conmonEnv - // we don't want to step on users fds they asked to preserve - // Since 0-2 are used for stdio, start the fds we pass in at preserveFDs+3 - cmd.Env = append(cmd.Env, fmt.Sprintf("_OCI_SYNCPIPE=%d", preserveFDs+3), fmt.Sprintf("_OCI_STARTPIPE=%d", preserveFDs+4)) - cmd.Env = append(cmd.Env, conmonEnv...) - cmd.ExtraFiles = append(cmd.ExtraFiles, childSyncPipe, childStartPipe) - - if r.reservePorts && !rootless.IsRootless() && !ctr.config.NetMode.IsSlirp4netns() { - ports, err := bindPorts(ctr.convertPortMappings()) - if err != nil { - return 0, err - } - filesToClose = append(filesToClose, ports...) - - // Leak the port we bound in the conmon process. These fd's won't be used - // by the container and conmon will keep the ports busy so that another - // process cannot use them. - cmd.ExtraFiles = append(cmd.ExtraFiles, ports...) - } - - if ctr.config.NetMode.IsSlirp4netns() || rootless.IsRootless() { - if ctr.config.PostConfigureNetNS { - havePortMapping := len(ctr.config.PortMappings) > 0 - if havePortMapping { - ctr.rootlessPortSyncR, ctr.rootlessPortSyncW, err = os.Pipe() - if err != nil { - return 0, fmt.Errorf("failed to create rootless port sync pipe: %w", err) - } - } - ctr.rootlessSlirpSyncR, ctr.rootlessSlirpSyncW, err = os.Pipe() - if err != nil { - return 0, fmt.Errorf("failed to create rootless network sync pipe: %w", err) - } - } else { - if ctr.rootlessSlirpSyncR != nil { - defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncR) - } - if ctr.rootlessSlirpSyncW != nil { - defer errorhandling.CloseQuiet(ctr.rootlessSlirpSyncW) - } - } - // Leak one end in conmon, the other one will be leaked into slirp4netns - cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessSlirpSyncW) - - if ctr.rootlessPortSyncW != nil { - defer errorhandling.CloseQuiet(ctr.rootlessPortSyncW) - // Leak one end in conmon, the other one will be leaked into rootlessport - cmd.ExtraFiles = append(cmd.ExtraFiles, ctr.rootlessPortSyncW) - } - } - var runtimeRestoreStarted time.Time - if restoreOptions != nil { - runtimeRestoreStarted = time.Now() - } - err = startCommand(cmd, ctr) - - // regardless of whether we errored or not, we no longer need the children pipes - childSyncPipe.Close() - childStartPipe.Close() - if err != nil { - return 0, err - } - if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe); err != nil { - return 0, err - } - /* Wait for initial setup and fork, and reap child */ - err = cmd.Wait() - if err != nil { - return 0, err - } - - pid, err := readConmonPipeData(r.name, parentSyncPipe, ociLog) - if err != nil { - if err2 := r.DeleteContainer(ctr); err2 != nil { - logrus.Errorf("Removing container %s from runtime after creation failed", ctr.ID()) - } - return 0, err - } - ctr.state.PID = pid - - conmonPID, err := readConmonPidFile(ctr.config.ConmonPidFile) - if err != nil { - logrus.Warnf("Error reading conmon pid file for container %s: %v", ctr.ID(), err) - } else if conmonPID > 0 { - // conmon not having a pid file is a valid state, so don't set it if we don't have it - logrus.Infof("Got Conmon PID as %d", conmonPID) - ctr.state.ConmonPID = conmonPID - } - - runtimeRestoreDuration := func() int64 { - if restoreOptions != nil && restoreOptions.PrintStats { - return time.Since(runtimeRestoreStarted).Microseconds() - } - return 0 - }() - - // These fds were passed down to the runtime. Close them - // and not interfere - for _, f := range filesToClose { - errorhandling.CloseQuiet(f) - } - - return runtimeRestoreDuration, nil -} - -// configureConmonEnv gets the environment values to add to conmon's exec struct -// TODO this may want to be less hardcoded/more configurable in the future -func (r *ConmonOCIRuntime) configureConmonEnv(runtimeDir string) []string { - var env []string - for _, e := range os.Environ() { - if strings.HasPrefix(e, "LC_") { - env = append(env, e) - } - } - conf, ok := os.LookupEnv("CONTAINERS_CONF") - if ok { - env = append(env, fmt.Sprintf("CONTAINERS_CONF=%s", conf)) - } - env = append(env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) - env = append(env, fmt.Sprintf("_CONTAINERS_USERNS_CONFIGURED=%s", os.Getenv("_CONTAINERS_USERNS_CONFIGURED"))) - env = append(env, fmt.Sprintf("_CONTAINERS_ROOTLESS_UID=%s", os.Getenv("_CONTAINERS_ROOTLESS_UID"))) - home := homedir.Get() - if home != "" { - env = append(env, fmt.Sprintf("HOME=%s", home)) - } - - return env -} - -// sharedConmonArgs takes common arguments for exec and create/restore and formats them for the conmon CLI -func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, pidPath, logPath, exitDir, ociLogPath, logDriver, logTag string) []string { - // set the conmon API version to be able to use the correct sync struct keys - args := []string{ - "--api-version", "1", - "-c", ctr.ID(), - "-u", cuuid, - "-r", r.path, - "-b", bundlePath, - "-p", pidPath, - "-n", ctr.Name(), - "--exit-dir", exitDir, - "--full-attach", - } - if len(r.runtimeFlags) > 0 { - rFlags := []string{} - for _, arg := range r.runtimeFlags { - rFlags = append(rFlags, "--runtime-arg", arg) - } - args = append(args, rFlags...) - } - - if ctr.CgroupManager() == config.SystemdCgroupsManager && !ctr.config.NoCgroups && ctr.config.CgroupsMode != cgroupSplit { - args = append(args, "-s") - } - - var logDriverArg string - switch logDriver { - case define.JournaldLogging: - logDriverArg = define.JournaldLogging - case define.NoLogging: - logDriverArg = define.NoLogging - case define.PassthroughLogging: - logDriverArg = define.PassthroughLogging - //lint:ignore ST1015 the default case has to be here - default: //nolint:stylecheck,gocritic - // No case here should happen except JSONLogging, but keep this here in case the options are extended - logrus.Errorf("%s logging specified but not supported. Choosing k8s-file logging instead", ctr.LogDriver()) - fallthrough - case "": - // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod - // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough - fallthrough - case define.JSONLogging: - fallthrough - case define.KubernetesLogging: - logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath) - } - - args = append(args, "-l", logDriverArg) - logLevel := logrus.GetLevel() - args = append(args, "--log-level", logLevel.String()) - - if logLevel == logrus.DebugLevel { - logrus.Debugf("%s messages will be logged to syslog", r.conmonPath) - args = append(args, "--syslog") - } - - size := r.logSizeMax - if ctr.config.LogSize > 0 { - size = ctr.config.LogSize - } - if size > 0 { - args = append(args, "--log-size-max", fmt.Sprintf("%v", size)) - } - - if ociLogPath != "" { - args = append(args, "--runtime-arg", "--log-format=json", "--runtime-arg", "--log", fmt.Sprintf("--runtime-arg=%s", ociLogPath)) - } - if logTag != "" { - args = append(args, "--log-tag", logTag) - } - if ctr.config.NoCgroups { - logrus.Debugf("Running with no Cgroups") - args = append(args, "--runtime-arg", "--cgroup-manager", "--runtime-arg", "disabled") - } - return args -} - -func startCommand(cmd *exec.Cmd, ctr *Container) error { - // Make sure to unset the NOTIFY_SOCKET and reset it afterwards if needed. - switch ctr.config.SdNotifyMode { - case define.SdNotifyModeContainer, define.SdNotifyModeIgnore: - if ctr.notifySocket != "" { - if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil { - logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err) - } - - defer func() { - if err := os.Setenv("NOTIFY_SOCKET", ctr.notifySocket); err != nil { - logrus.Errorf("Resetting NOTIFY_SOCKET=%s", ctr.notifySocket) - } - }() - } - } - - return cmd.Start() + return err } // moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup @@ -1476,271 +169,6 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec return nil } -// newPipe creates a unix socket pair for communication. -// Returns two files - first is parent, second is child. -func newPipe() (*os.File, *os.File, error) { - fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) - if err != nil { - return nil, nil, err - } - return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil -} - -// readConmonPidFile attempts to read conmon's pid from its pid file -func readConmonPidFile(pidFile string) (int, error) { - // Let's try reading the Conmon pid at the same time. - if pidFile != "" { - contents, err := ioutil.ReadFile(pidFile) - if err != nil { - return -1, err - } - // Convert it to an int - conmonPID, err := strconv.Atoi(string(contents)) - if err != nil { - return -1, err - } - return conmonPID, nil - } - return 0, nil -} - -// readConmonPipeData attempts to read a syncInfo struct from the pipe -func readConmonPipeData(runtimeName string, pipe *os.File, ociLog string) (int, error) { - // syncInfo is used to return data from monitor process to daemon - type syncInfo struct { - Data int `json:"data"` - Message string `json:"message,omitempty"` - } - - // Wait to get container pid from conmon - type syncStruct struct { - si *syncInfo - err error - } - ch := make(chan syncStruct) - go func() { - var si *syncInfo - rdr := bufio.NewReader(pipe) - b, err := rdr.ReadBytes('\n') - // ignore EOF here, error is returned even when data was read - // if it is no valid json unmarshal will fail below - if err != nil && !errors.Is(err, io.EOF) { - ch <- syncStruct{err: err} - } - if err := json.Unmarshal(b, &si); err != nil { - ch <- syncStruct{err: fmt.Errorf("conmon bytes %q: %w", string(b), err)} - return - } - ch <- syncStruct{si: si} - }() - - data := -1 //nolint: wastedassign - select { - case ss := <-ch: - if ss.err != nil { - if ociLog != "" { - ociLogData, err := ioutil.ReadFile(ociLog) - if err == nil { - var ociErr ociError - if err := json.Unmarshal(ociLogData, &ociErr); err == nil { - return -1, getOCIRuntimeError(runtimeName, ociErr.Msg) - } - } - } - return -1, fmt.Errorf("container create failed (no logs from conmon): %w", ss.err) - } - logrus.Debugf("Received: %d", ss.si.Data) - if ss.si.Data < 0 { - if ociLog != "" { - ociLogData, err := ioutil.ReadFile(ociLog) - if err == nil { - var ociErr ociError - if err := json.Unmarshal(ociLogData, &ociErr); err == nil { - return ss.si.Data, getOCIRuntimeError(runtimeName, ociErr.Msg) - } - } - } - // If we failed to parse the JSON errors, then print the output as it is - if ss.si.Message != "" { - return ss.si.Data, getOCIRuntimeError(runtimeName, ss.si.Message) - } - return ss.si.Data, fmt.Errorf("container create failed: %w", define.ErrInternal) - } - data = ss.si.Data - case <-time.After(define.ContainerCreateTimeout): - return -1, fmt.Errorf("container creation timeout: %w", define.ErrInternal) - } - return data, nil -} - -// writeConmonPipeData writes nonce data to a pipe -func writeConmonPipeData(pipe *os.File) error { - someData := []byte{0} - _, err := pipe.Write(someData) - return err -} - -// formatRuntimeOpts prepends opts passed to it with --runtime-opt for passing to conmon -func formatRuntimeOpts(opts ...string) []string { - args := make([]string, 0, len(opts)*2) - for _, o := range opts { - args = append(args, "--runtime-opt", o) - } - return args -} - -// getConmonVersion returns a string representation of the conmon version. -func (r *ConmonOCIRuntime) getConmonVersion() (string, error) { - output, err := utils.ExecCmd(r.conmonPath, "--version") - if err != nil { - return "", err - } - return strings.TrimSuffix(strings.Replace(output, "\n", ", ", 1), "\n"), nil -} - -// getOCIRuntimeVersion returns a string representation of the OCI runtime's -// version. -func (r *ConmonOCIRuntime) getOCIRuntimeVersion() (string, error) { - output, err := utils.ExecCmd(r.path, "--version") - if err != nil { - return "", err - } - return strings.TrimSuffix(output, "\n"), nil -} - -// Copy data from container to HTTP connection, for terminal attach. -// Container is the container's attach socket connection, http is a buffer for -// the HTTP connection. cid is the ID of the container the attach session is -// running for (used solely for error messages). -func httpAttachTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string) error { - buf := make([]byte, bufferSize) - for { - numR, err := container.Read(buf) - logrus.Debugf("Read fd(%d) %d/%d bytes for container %s", int(buf[0]), numR, len(buf), cid) - - if numR > 0 { - switch buf[0] { - case AttachPipeStdout: - // Do nothing - default: - logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) - continue - } - - numW, err2 := http.Write(buf[1:numR]) - if err2 != nil { - if err != nil { - logrus.Errorf("Reading container %s STDOUT: %v", cid, err) - } - return err2 - } else if numW+1 != numR { - return io.ErrShortWrite - } - // We need to force the buffer to write immediately, so - // there isn't a delay on the terminal side. - if err2 := http.Flush(); err2 != nil { - if err != nil { - logrus.Errorf("Reading container %s STDOUT: %v", cid, err) - } - return err2 - } - } - if err != nil { - if err == io.EOF { - return nil - } - return err - } - } -} - -// Copy data from a container to an HTTP connection, for non-terminal attach. -// Appends a header to multiplex input. -func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, cid string, stdin, stdout, stderr bool) error { - buf := make([]byte, bufferSize) - for { - numR, err := container.Read(buf) - if numR > 0 { - var headerBuf []byte - - // Subtract 1 because we strip the first byte (used for - // multiplexing by Conmon). - headerLen := uint32(numR - 1) - // Practically speaking, we could make this buf[0] - 1, - // but we need to validate it anyway. - switch buf[0] { - case AttachPipeStdin: - headerBuf = makeHTTPAttachHeader(0, headerLen) - if !stdin { - continue - } - case AttachPipeStdout: - if !stdout { - continue - } - headerBuf = makeHTTPAttachHeader(1, headerLen) - case AttachPipeStderr: - if !stderr { - continue - } - headerBuf = makeHTTPAttachHeader(2, headerLen) - default: - logrus.Errorf("Received unexpected attach type %+d, discarding %d bytes", buf[0], numR) - continue - } - - numH, err2 := http.Write(headerBuf) - if err2 != nil { - if err != nil { - logrus.Errorf("Reading container %s standard streams: %v", cid, err) - } - - return err2 - } - // Hardcoding header length is pretty gross, but - // fast. Should be safe, as this is a fixed part - // of the protocol. - if numH != 8 { - if err != nil { - logrus.Errorf("Reading container %s standard streams: %v", cid, err) - } - - return io.ErrShortWrite - } - - numW, err2 := http.Write(buf[1:numR]) - if err2 != nil { - if err != nil { - logrus.Errorf("Reading container %s standard streams: %v", cid, err) - } - - return err2 - } else if numW+1 != numR { - if err != nil { - logrus.Errorf("Reading container %s standard streams: %v", cid, err) - } - - return io.ErrShortWrite - } - // We need to force the buffer to write immediately, so - // there isn't a delay on the terminal side. - if err2 := http.Flush(); err2 != nil { - if err != nil { - logrus.Errorf("Reading container %s STDOUT: %v", cid, err) - } - return err2 - } - } - if err != nil { - if err == io.EOF { - return nil - } - - return err - } - } -} - // GetLimits converts spec resource limits to cgroup consumable limits func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) { if resource == nil { diff --git a/libpod/oci_conmon_unsupported.go b/libpod/oci_conmon_unsupported.go new file mode 100644 index 000000000..cc6d68e89 --- /dev/null +++ b/libpod/oci_conmon_unsupported.go @@ -0,0 +1,24 @@ +//go:build !linux && !freebsd +// +build !linux,!freebsd + +package libpod + +import ( + "errors" + + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/resize" +) + +// Make a new Conmon-based OCI runtime with the given options. +// Conmon will wrap the given OCI runtime, which can be `runc`, `crun`, or +// any runtime with a runc-compatible CLI. +// The first path that points to a valid executable will be used. +// Deliberately private. Someone should not be able to construct this outside of +// libpod. +func newConmonOCIRuntime(name string, paths []string, conmonPath string, runtimeFlags []string, runtimeCfg *config.Config) (OCIRuntime, error) { + return nil, errors.New("newConmonOCIRuntime not supported on this platform") +} + +func registerResizeFunc(r <-chan resize.TerminalSize, bundlePath string) { +} diff --git a/libpod/oci_missing.go b/libpod/oci_missing.go index 2ab2b4577..bbf2957ff 100644 --- a/libpod/oci_missing.go +++ b/libpod/oci_missing.go @@ -8,6 +8,7 @@ import ( "github.com/containers/common/pkg/resize" "github.com/containers/podman/v4/libpod/define" + spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" ) @@ -80,6 +81,11 @@ func (r *MissingRuntime) StartContainer(ctr *Container) error { return r.printError() } +// UpdateContainer is not available as the runtime is missing +func (r *MissingRuntime) UpdateContainer(ctr *Container, resources *spec.LinuxResources) error { + return r.printError() +} + // KillContainer is not available as the runtime is missing // TODO: We could attempt to unix.Kill() the PID as recorded in the state if we // really want to smooth things out? Won't be perfect, but if the container has diff --git a/libpod/options.go b/libpod/options.go index b31cb4ab2..71ad3d11e 100644 --- a/libpod/options.go +++ b/libpod/options.go @@ -6,14 +6,12 @@ import ( "net" "os" "path/filepath" - "strings" "syscall" "github.com/containers/buildah/pkg/parse" nettypes "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/config" "github.com/containers/common/pkg/secrets" - cutil "github.com/containers/common/pkg/util" "github.com/containers/image/v5/manifest" "github.com/containers/image/v5/types" "github.com/containers/podman/v4/libpod/define" @@ -29,12 +27,6 @@ import ( "github.com/sirupsen/logrus" ) -// Runtime Creation Options -var ( - // SdNotifyModeValues describes the only values that SdNotifyMode can be - SdNotifyModeValues = []string{define.SdNotifyModeContainer, define.SdNotifyModeConmon, define.SdNotifyModeIgnore} -) - // WithStorageConfig uses the given configuration to set up container storage. // If this is not specified, the system default configuration will be used // instead. @@ -613,6 +605,17 @@ func WithSystemd() CtrCreateOption { } } +// WithSdNotifySocket sets the sd-notify of the container +func WithSdNotifySocket(socketPath string) CtrCreateOption { + return func(ctr *Container) error { + if ctr.valid { + return define.ErrCtrFinalized + } + ctr.config.SdNotifySocket = socketPath + return nil + } +} + // WithSdNotifyMode sets the sd-notify method func WithSdNotifyMode(mode string) CtrCreateOption { return func(ctr *Container) error { @@ -620,9 +623,8 @@ func WithSdNotifyMode(mode string) CtrCreateOption { return define.ErrCtrFinalized } - // verify values - if len(mode) > 0 && !cutil.StringInSlice(strings.ToLower(mode), SdNotifyModeValues) { - return fmt.Errorf("--sdnotify values must be one of %q: %w", strings.Join(SdNotifyModeValues, ", "), define.ErrInvalidArg) + if err := define.ValidateSdNotifyMode(mode); err != nil { + return err } ctr.config.SdNotifyMode = mode @@ -1411,9 +1413,10 @@ func WithNamedVolumes(volumes []*ContainerNamedVolume) CtrCreateOption { } ctr.config.NamedVolumes = append(ctr.config.NamedVolumes, &ContainerNamedVolume{ - Name: vol.Name, - Dest: vol.Dest, - Options: mountOpts, + Name: vol.Name, + Dest: vol.Dest, + Options: mountOpts, + IsAnonymous: vol.IsAnonymous, }) } @@ -1470,6 +1473,17 @@ func WithHealthCheck(healthCheck *manifest.Schema2HealthConfig) CtrCreateOption } } +// WithHealthCheckOnFailureAction adds an on-failure action to health-check config +func WithHealthCheckOnFailureAction(action define.HealthCheckOnFailureAction) CtrCreateOption { + return func(ctr *Container) error { + if ctr.valid { + return define.ErrCtrFinalized + } + ctr.config.HealthCheckOnFailureAction = action + return nil + } +} + // WithPreserveFDs forwards from the process running Libpod into the container // the given number of extra FDs (starting after the standard streams) to the created container func WithPreserveFDs(fd uint) CtrCreateOption { @@ -1693,14 +1707,22 @@ func withSetAnon() VolumeCreateOption { } } -// WithVolumeDriverTimeout sets the volume creation timeout period -func WithVolumeDriverTimeout(timeout int) VolumeCreateOption { +// WithVolumeDriverTimeout sets the volume creation timeout period. +// Only usable if a non-local volume driver is in use. +func WithVolumeDriverTimeout(timeout uint) VolumeCreateOption { return func(volume *Volume) error { if volume.valid { return define.ErrVolumeFinalized } - volume.config.Timeout = timeout + if volume.config.Driver == "" || volume.config.Driver == define.VolumeDriverLocal { + return fmt.Errorf("Volume driver timeout can only be used with non-local volume drivers: %w", define.ErrInvalidArg) + } + + tm := timeout + + volume.config.Timeout = &tm + return nil } } diff --git a/libpod/plugin/volume_api.go b/libpod/plugin/volume_api.go index 0a5eaae53..522895798 100644 --- a/libpod/plugin/volume_api.go +++ b/libpod/plugin/volume_api.go @@ -3,6 +3,7 @@ package plugin import ( "bytes" "context" + "errors" "fmt" "io/ioutil" "net" @@ -13,8 +14,7 @@ import ( "sync" "time" - "errors" - + "github.com/containers/common/pkg/config" "github.com/containers/podman/v4/libpod/define" "github.com/docker/go-plugins-helpers/sdk" "github.com/docker/go-plugins-helpers/volume" @@ -40,7 +40,6 @@ var ( ) const ( - defaultTimeout = 5 * time.Second volumePluginType = "VolumeDriver" ) @@ -77,7 +76,7 @@ func validatePlugin(newPlugin *VolumePlugin) error { // Hit the Activate endpoint to find out if it is, and if so what kind req, err := http.NewRequest("POST", "http://plugin"+activatePath, nil) if err != nil { - return fmt.Errorf("error making request to volume plugin %s activation endpoint: %w", newPlugin.Name, err) + return fmt.Errorf("making request to volume plugin %s activation endpoint: %w", newPlugin.Name, err) } req.Header.Set("Host", newPlugin.getURI()) @@ -85,7 +84,7 @@ func validatePlugin(newPlugin *VolumePlugin) error { resp, err := newPlugin.Client.Do(req) if err != nil { - return fmt.Errorf("error sending request to plugin %s activation endpoint: %w", newPlugin.Name, err) + return fmt.Errorf("sending request to plugin %s activation endpoint: %w", newPlugin.Name, err) } defer resp.Body.Close() @@ -98,12 +97,12 @@ func validatePlugin(newPlugin *VolumePlugin) error { // Read and decode the body so we can tell if this is a volume plugin. respBytes, err := ioutil.ReadAll(resp.Body) if err != nil { - return fmt.Errorf("error reading activation response body from plugin %s: %w", newPlugin.Name, err) + return fmt.Errorf("reading activation response body from plugin %s: %w", newPlugin.Name, err) } respStruct := new(activateResponse) if err := json.Unmarshal(respBytes, respStruct); err != nil { - return fmt.Errorf("error unmarshalling plugin %s activation response: %w", newPlugin.Name, err) + return fmt.Errorf("unmarshalling plugin %s activation response: %w", newPlugin.Name, err) } foundVolume := false @@ -129,7 +128,7 @@ func validatePlugin(newPlugin *VolumePlugin) error { // GetVolumePlugin gets a single volume plugin, with the given name, at the // given path. -func GetVolumePlugin(name string, path string, timeout int) (*VolumePlugin, error) { +func GetVolumePlugin(name string, path string, timeout *uint, cfg *config.Config) (*VolumePlugin, error) { pluginsLock.Lock() defer pluginsLock.Unlock() @@ -152,13 +151,11 @@ func GetVolumePlugin(name string, path string, timeout int) (*VolumePlugin, erro // Need an HTTP client to force a Unix connection. // And since we can reuse it, might as well cache it. client := new(http.Client) - client.Timeout = defaultTimeout - // if the user specified a non-zero timeout, use their value. Else, keep the default. - if timeout != 0 { - if time.Duration(timeout)*time.Second < defaultTimeout { - logrus.Warnf("the default timeout for volume creation is %d seconds, setting a time less than that may break this feature.", defaultTimeout) - } - client.Timeout = time.Duration(timeout) * time.Second + client.Timeout = 5 * time.Second + if timeout != nil { + client.Timeout = time.Duration(*timeout) * time.Second + } else if cfg != nil { + client.Timeout = time.Duration(cfg.Engine.VolumePluginTimeout) * time.Second } // This bit borrowed from pkg/bindings/connection.go client.Transport = &http.Transport{ @@ -199,7 +196,7 @@ func (p *VolumePlugin) verifyReachable() error { return fmt.Errorf("%s: %w", p.Name, ErrPluginRemoved) } - return fmt.Errorf("error accessing plugin %s: %w", p.Name, err) + return fmt.Errorf("accessing plugin %s: %w", p.Name, err) } return nil } @@ -215,13 +212,13 @@ func (p *VolumePlugin) sendRequest(toJSON interface{}, endpoint string) (*http.R if toJSON != nil { reqJSON, err = json.Marshal(toJSON) if err != nil { - return nil, fmt.Errorf("error marshalling request JSON for volume plugin %s endpoint %s: %w", p.Name, endpoint, err) + return nil, fmt.Errorf("marshalling request JSON for volume plugin %s endpoint %s: %w", p.Name, endpoint, err) } } req, err := http.NewRequest("POST", "http://plugin"+endpoint, bytes.NewReader(reqJSON)) if err != nil { - return nil, fmt.Errorf("error making request to volume plugin %s endpoint %s: %w", p.Name, endpoint, err) + return nil, fmt.Errorf("making request to volume plugin %s endpoint %s: %w", p.Name, endpoint, err) } req.Header.Set("Host", p.getURI()) @@ -229,7 +226,7 @@ func (p *VolumePlugin) sendRequest(toJSON interface{}, endpoint string) (*http.R resp, err := p.Client.Do(req) if err != nil { - return nil, fmt.Errorf("error sending request to volume plugin %s endpoint %s: %w", p.Name, endpoint, err) + return nil, fmt.Errorf("sending request to volume plugin %s endpoint %s: %w", p.Name, endpoint, err) } // We are *deliberately not closing* response here. It is the // responsibility of the caller to do so after reading the response. @@ -243,9 +240,9 @@ func (p *VolumePlugin) makeErrorResponse(err, endpoint, volName string) error { err = "empty error from plugin" } if volName != "" { - return fmt.Errorf("error on %s on volume %s in volume plugin %s: %w", endpoint, volName, p.Name, errors.New(err)) + return fmt.Errorf("on %s on volume %s in volume plugin %s: %w", endpoint, volName, p.Name, errors.New(err)) } - return fmt.Errorf("error on %s in volume plugin %s: %w", endpoint, p.Name, errors.New(err)) + return fmt.Errorf("on %s in volume plugin %s: %w", endpoint, p.Name, errors.New(err)) } // Handle error responses from plugin @@ -257,12 +254,12 @@ func (p *VolumePlugin) handleErrorResponse(resp *http.Response, endpoint, volNam if resp.StatusCode != 200 { errResp, err := ioutil.ReadAll(resp.Body) if err != nil { - return fmt.Errorf("error reading response body from volume plugin %s: %w", p.Name, err) + return fmt.Errorf("reading response body from volume plugin %s: %w", p.Name, err) } errStruct := new(volume.ErrorResponse) if err := json.Unmarshal(errResp, errStruct); err != nil { - return fmt.Errorf("error unmarshalling JSON response from volume plugin %s: %w", p.Name, err) + return fmt.Errorf("unmarshalling JSON response from volume plugin %s: %w", p.Name, err) } return p.makeErrorResponse(errStruct.Err, endpoint, volName) @@ -312,12 +309,12 @@ func (p *VolumePlugin) ListVolumes() ([]*volume.Volume, error) { volumeRespBytes, err := ioutil.ReadAll(resp.Body) if err != nil { - return nil, fmt.Errorf("error reading response body from volume plugin %s: %w", p.Name, err) + return nil, fmt.Errorf("reading response body from volume plugin %s: %w", p.Name, err) } volumeResp := new(volume.ListResponse) if err := json.Unmarshal(volumeRespBytes, volumeResp); err != nil { - return nil, fmt.Errorf("error unmarshalling volume plugin %s list response: %w", p.Name, err) + return nil, fmt.Errorf("unmarshalling volume plugin %s list response: %w", p.Name, err) } return volumeResp.Volumes, nil @@ -347,12 +344,12 @@ func (p *VolumePlugin) GetVolume(req *volume.GetRequest) (*volume.Volume, error) getRespBytes, err := ioutil.ReadAll(resp.Body) if err != nil { - return nil, fmt.Errorf("error reading response body from volume plugin %s: %w", p.Name, err) + return nil, fmt.Errorf("reading response body from volume plugin %s: %w", p.Name, err) } getResp := new(volume.GetResponse) if err := json.Unmarshal(getRespBytes, getResp); err != nil { - return nil, fmt.Errorf("error unmarshalling volume plugin %s get response: %w", p.Name, err) + return nil, fmt.Errorf("unmarshalling volume plugin %s get response: %w", p.Name, err) } return getResp.Volume, nil @@ -403,12 +400,12 @@ func (p *VolumePlugin) GetVolumePath(req *volume.PathRequest) (string, error) { pathRespBytes, err := ioutil.ReadAll(resp.Body) if err != nil { - return "", fmt.Errorf("error reading response body from volume plugin %s: %w", p.Name, err) + return "", fmt.Errorf("reading response body from volume plugin %s: %w", p.Name, err) } pathResp := new(volume.PathResponse) if err := json.Unmarshal(pathRespBytes, pathResp); err != nil { - return "", fmt.Errorf("error unmarshalling volume plugin %s path response: %w", p.Name, err) + return "", fmt.Errorf("unmarshalling volume plugin %s path response: %w", p.Name, err) } return pathResp.Mountpoint, nil @@ -440,12 +437,12 @@ func (p *VolumePlugin) MountVolume(req *volume.MountRequest) (string, error) { mountRespBytes, err := ioutil.ReadAll(resp.Body) if err != nil { - return "", fmt.Errorf("error reading response body from volume plugin %s: %w", p.Name, err) + return "", fmt.Errorf("reading response body from volume plugin %s: %w", p.Name, err) } mountResp := new(volume.MountResponse) if err := json.Unmarshal(mountRespBytes, mountResp); err != nil { - return "", fmt.Errorf("error unmarshalling volume plugin %s path response: %w", p.Name, err) + return "", fmt.Errorf("unmarshalling volume plugin %s path response: %w", p.Name, err) } return mountResp.Mountpoint, nil diff --git a/libpod/pod_api.go b/libpod/pod_api.go index 29964ae95..924d43436 100644 --- a/libpod/pod_api.go +++ b/libpod/pod_api.go @@ -40,7 +40,7 @@ func (p *Pod) startInitContainers(ctx context.Context) error { icLock := initCon.lock icLock.Lock() var time *uint - if err := p.runtime.removeContainer(ctx, initCon, false, false, true, time); err != nil { + if err := p.runtime.removeContainer(ctx, initCon, false, false, true, false, time); err != nil { icLock.Unlock() return fmt.Errorf("failed to remove once init container %s: %w", initCon.ID(), err) } @@ -92,7 +92,7 @@ func (p *Pod) Start(ctx context.Context) (map[string]error, error) { // Build a dependency graph of containers in the pod graph, err := BuildContainerGraph(allCtrs) if err != nil { - return nil, fmt.Errorf("error generating dependency graph for pod %s: %w", p.ID(), err) + return nil, fmt.Errorf("generating dependency graph for pod %s: %w", p.ID(), err) } // If there are no containers without dependencies, we can't start // Error out @@ -109,7 +109,7 @@ func (p *Pod) Start(ctx context.Context) (map[string]error, error) { } if len(ctrErrors) > 0 { - return ctrErrors, fmt.Errorf("error starting some containers: %w", define.ErrPodPartialFail) + return ctrErrors, fmt.Errorf("starting some containers: %w", define.ErrPodPartialFail) } defer p.newPodEvent(events.Start) return nil, nil @@ -201,7 +201,7 @@ func (p *Pod) stopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m } if len(ctrErrors) > 0 { - return ctrErrors, fmt.Errorf("error stopping some containers: %w", define.ErrPodPartialFail) + return ctrErrors, fmt.Errorf("stopping some containers: %w", define.ErrPodPartialFail) } if err := p.maybeStopServiceContainer(); err != nil { @@ -305,7 +305,7 @@ func (p *Pod) Cleanup(ctx context.Context) (map[string]error, error) { } if len(ctrErrors) > 0 { - return ctrErrors, fmt.Errorf("error cleaning up some containers: %w", define.ErrPodPartialFail) + return ctrErrors, fmt.Errorf("cleaning up some containers: %w", define.ErrPodPartialFail) } if err := p.maybeStopServiceContainer(); err != nil { @@ -376,7 +376,7 @@ func (p *Pod) Pause(ctx context.Context) (map[string]error, error) { } if len(ctrErrors) > 0 { - return ctrErrors, fmt.Errorf("error pausing some containers: %w", define.ErrPodPartialFail) + return ctrErrors, fmt.Errorf("pausing some containers: %w", define.ErrPodPartialFail) } return nil, nil } @@ -432,7 +432,7 @@ func (p *Pod) Unpause(ctx context.Context) (map[string]error, error) { } if len(ctrErrors) > 0 { - return ctrErrors, fmt.Errorf("error unpausing some containers: %w", define.ErrPodPartialFail) + return ctrErrors, fmt.Errorf("unpausing some containers: %w", define.ErrPodPartialFail) } return nil, nil } @@ -470,7 +470,7 @@ func (p *Pod) Restart(ctx context.Context) (map[string]error, error) { // Build a dependency graph of containers in the pod graph, err := BuildContainerGraph(allCtrs) if err != nil { - return nil, fmt.Errorf("error generating dependency graph for pod %s: %w", p.ID(), err) + return nil, fmt.Errorf("generating dependency graph for pod %s: %w", p.ID(), err) } ctrErrors := make(map[string]error) @@ -488,7 +488,7 @@ func (p *Pod) Restart(ctx context.Context) (map[string]error, error) { } if len(ctrErrors) > 0 { - return ctrErrors, fmt.Errorf("error stopping some containers: %w", define.ErrPodPartialFail) + return ctrErrors, fmt.Errorf("stopping some containers: %w", define.ErrPodPartialFail) } p.newPodEvent(events.Stop) p.newPodEvent(events.Start) @@ -547,7 +547,7 @@ func (p *Pod) Kill(ctx context.Context, signal uint) (map[string]error, error) { } if len(ctrErrors) > 0 { - return ctrErrors, fmt.Errorf("error killing some containers: %w", define.ErrPodPartialFail) + return ctrErrors, fmt.Errorf("killing some containers: %w", define.ErrPodPartialFail) } if err := p.maybeStopServiceContainer(); err != nil { diff --git a/libpod/pod_internal.go b/libpod/pod_internal.go index a86cd6d21..d63bff2c9 100644 --- a/libpod/pod_internal.go +++ b/libpod/pod_internal.go @@ -16,7 +16,7 @@ import ( func newPod(runtime *Runtime) *Pod { pod := new(Pod) pod.config = new(PodConfig) - pod.config.ID = stringid.GenerateNonCryptoID() + pod.config.ID = stringid.GenerateRandomID() pod.config.Labels = make(map[string]string) pod.config.CreatedTime = time.Now() // pod.config.InfraContainer = new(ContainerConfig) @@ -38,7 +38,7 @@ func (p *Pod) updatePod() error { // Save pod state to database func (p *Pod) save() error { if err := p.runtime.state.SavePod(p); err != nil { - return fmt.Errorf("error saving pod %s state: %w", p.ID(), err) + return fmt.Errorf("saving pod %s state: %w", p.ID(), err) } return nil @@ -60,7 +60,7 @@ func (p *Pod) refresh() error { // Retrieve the pod's lock lock, err := p.runtime.lockManager.AllocateAndRetrieveLock(p.config.LockID) if err != nil { - return fmt.Errorf("error retrieving lock %d for pod %s: %w", p.config.LockID, p.ID(), err) + return fmt.Errorf("retrieving lock %d for pod %s: %w", p.config.LockID, p.ID(), err) } p.lock = lock diff --git a/libpod/pod_top_unsupported.go b/libpod/pod_top_unsupported.go new file mode 100644 index 000000000..92323043a --- /dev/null +++ b/libpod/pod_top_unsupported.go @@ -0,0 +1,20 @@ +//go:build !linux +// +build !linux + +package libpod + +import ( + "errors" +) + +// GetPodPidInformation returns process-related data of all processes in +// the pod. The output data can be controlled via the `descriptors` +// argument which expects format descriptors and supports all AIXformat +// descriptors of ps (1) plus some additional ones to for instance inspect the +// set of effective capabilities. Each element in the returned string slice +// is a tab-separated string. +// +// For more details, please refer to github.com/containers/psgo. +func (p *Pod) GetPodPidInformation(descriptors []string) ([]string, error) { + return nil, errors.New("not implemented (*Pod) GetPodPidInformation") +} diff --git a/libpod/runtime.go b/libpod/runtime.go index ea4b34954..83c9f53e2 100644 --- a/libpod/runtime.go +++ b/libpod/runtime.go @@ -2,15 +2,12 @@ package libpod import ( "bufio" - "bytes" "context" "errors" "fmt" + "math/rand" "os" - "os/exec" "path/filepath" - "regexp" - "strconv" "strings" "sync" "syscall" @@ -44,17 +41,6 @@ import ( "github.com/sirupsen/logrus" ) -const ( - // conmonMinMajorVersion is the major version required for conmon. - conmonMinMajorVersion = 2 - - // conmonMinMinorVersion is the minor version required for conmon. - conmonMinMinorVersion = 0 - - // conmonMinPatchVersion is the sub-minor version required for conmon. - conmonMinPatchVersion = 24 -) - // A RuntimeOption is a functional option which alters the Runtime created by // NewRuntime type RuntimeOption func(*Runtime) error @@ -127,6 +113,13 @@ type Runtime struct { secretsManager *secrets.SecretsManager } +func init() { + // generateName calls namesgenerator.GetRandomName which the + // global RNG from math/rand. Seed it here to make sure we + // don't get the same name every time. + rand.Seed(time.Now().UnixNano()) +} + // SetXdgDirs ensures the XDG_RUNTIME_DIR env and XDG_CONFIG_HOME variables are set. // containers/image uses XDG_RUNTIME_DIR to locate the auth file, XDG_CONFIG_HOME is // use for the containers.conf configuration file. @@ -214,7 +207,7 @@ func newRuntimeFromConfig(conf *config.Config, options ...RuntimeOption) (*Runti // Overwrite config with user-given configuration options for _, opt := range options { if err := opt(runtime); err != nil { - return nil, fmt.Errorf("error configuring runtime: %w", err) + return nil, fmt.Errorf("configuring runtime: %w", err) } } @@ -230,7 +223,7 @@ func newRuntimeFromConfig(conf *config.Config, options ...RuntimeOption) (*Runti } if err := shutdown.Start(); err != nil { - return nil, fmt.Errorf("error starting shutdown signal handler: %w", err) + return nil, fmt.Errorf("starting shutdown signal handler: %w", err) } if err := makeRuntime(runtime); err != nil { @@ -287,7 +280,7 @@ func getLockManager(runtime *Runtime) (lock.Manager, error) { // Since we're renumbering, this is not fatal. // Remove the earlier set of locks and recreate. if err := os.Remove(filepath.Join("/dev/shm", lockPath)); err != nil { - return nil, fmt.Errorf("error removing libpod locks file %s: %w", lockPath, err) + return nil, fmt.Errorf("removing libpod locks file %s: %w", lockPath, err) } manager, err = lock.NewSHMLockManager(lockPath, runtime.config.Engine.NumLocks) @@ -308,7 +301,7 @@ func getLockManager(runtime *Runtime) (lock.Manager, error) { // Sets up containers/storage, state store, OCI runtime func makeRuntime(runtime *Runtime) (retErr error) { // Find a working conmon binary - cPath, err := findConmon(runtime.config.Engine.ConmonPath) + cPath, err := runtime.config.FindConmon() if err != nil { return err } @@ -325,7 +318,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { if err := os.MkdirAll(runtime.config.Engine.StaticDir, 0700); err != nil { // The directory is allowed to exist if !errors.Is(err, os.ErrExist) { - return fmt.Errorf("error creating runtime static files directory: %w", err) + return fmt.Errorf("creating runtime static files directory: %w", err) } } @@ -369,7 +362,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { } } - return fmt.Errorf("error retrieving runtime configuration from database: %w", err) + return fmt.Errorf("retrieving runtime configuration from database: %w", err) } runtime.mergeDBConfig(dbConfig) @@ -412,7 +405,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { } if err := runtime.state.SetNamespace(runtime.config.Engine.Namespace); err != nil { - return fmt.Errorf("error setting libpod namespace in state: %w", err) + return fmt.Errorf("setting libpod namespace in state: %w", err) } logrus.Debugf("Set libpod namespace to %q", runtime.config.Engine.Namespace) @@ -469,15 +462,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { if err := os.MkdirAll(runtime.config.Engine.TmpDir, 0751); err != nil { // The directory is allowed to exist if !errors.Is(err, os.ErrExist) { - return fmt.Errorf("error creating tmpdir: %w", err) - } - } - - // Create events log dir - if err := os.MkdirAll(filepath.Dir(runtime.config.Engine.EventsLogFilePath), 0700); err != nil { - // The directory is allowed to exist - if !errors.Is(err, os.ErrExist) { - return fmt.Errorf("error creating events dirs: %w", err) + return fmt.Errorf("creating tmpdir: %w", err) } } @@ -535,7 +520,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { if err := os.MkdirAll(runtime.config.Engine.TmpDir, 0755); err != nil { // The directory is allowed to exist if !errors.Is(err, os.ErrExist) { - return fmt.Errorf("error creating runtime temporary files directory: %w", err) + return fmt.Errorf("creating runtime temporary files directory: %w", err) } } @@ -556,7 +541,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { runtimeAliveFile := filepath.Join(runtime.config.Engine.TmpDir, "alive") aliveLock, err := storage.GetLockfile(runtimeAliveLock) if err != nil { - return fmt.Errorf("error acquiring runtime init lock: %w", err) + return fmt.Errorf("acquiring runtime init lock: %w", err) } // Acquire the lock and hold it until we return // This ensures that no two processes will be in runtime.refresh at once @@ -610,7 +595,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { if errors.Is(err, os.ErrNotExist) { doRefresh = true } else { - return fmt.Errorf("error reading runtime status file %s: %w", runtimeAliveFile, err) + return fmt.Errorf("reading runtime status file %s: %w", runtimeAliveFile, err) } } @@ -670,102 +655,6 @@ func makeRuntime(runtime *Runtime) (retErr error) { return nil } -// findConmon iterates over conmonPaths and returns the path -// to the first conmon binary with a new enough version. If none is found, -// we try to do a path lookup of "conmon". -func findConmon(conmonPaths []string) (string, error) { - foundOutdatedConmon := false - for _, path := range conmonPaths { - stat, err := os.Stat(path) - if err != nil { - continue - } - if stat.IsDir() { - continue - } - if err := probeConmon(path); err != nil { - logrus.Warnf("Conmon at %s invalid: %v", path, err) - foundOutdatedConmon = true - continue - } - logrus.Debugf("Using conmon: %q", path) - return path, nil - } - - // Search the $PATH as last fallback - if path, err := exec.LookPath("conmon"); err == nil { - if err := probeConmon(path); err != nil { - logrus.Warnf("Conmon at %s is invalid: %v", path, err) - foundOutdatedConmon = true - } else { - logrus.Debugf("Using conmon from $PATH: %q", path) - return path, nil - } - } - - if foundOutdatedConmon { - return "", fmt.Errorf( - "please update to v%d.%d.%d or later: %w", - conmonMinMajorVersion, conmonMinMinorVersion, conmonMinPatchVersion, define.ErrConmonOutdated) - } - - return "", fmt.Errorf( - "could not find a working conmon binary (configured options: %v): %w", - conmonPaths, define.ErrInvalidArg) -} - -// probeConmon calls conmon --version and verifies it is a new enough version for -// the runtime expectations the container engine currently has. -func probeConmon(conmonBinary string) error { - cmd := exec.Command(conmonBinary, "--version") - var out bytes.Buffer - cmd.Stdout = &out - err := cmd.Run() - if err != nil { - return err - } - r := regexp.MustCompile(`^conmon version (?P<Major>\d+).(?P<Minor>\d+).(?P<Patch>\d+)`) - - matches := r.FindStringSubmatch(out.String()) - if len(matches) != 4 { - return fmt.Errorf("%v: %w", define.ErrConmonVersionFormat, err) - } - major, err := strconv.Atoi(matches[1]) - if err != nil { - return fmt.Errorf("%v: %w", define.ErrConmonVersionFormat, err) - } - if major < conmonMinMajorVersion { - return define.ErrConmonOutdated - } - if major > conmonMinMajorVersion { - return nil - } - - minor, err := strconv.Atoi(matches[2]) - if err != nil { - return fmt.Errorf("%v: %w", define.ErrConmonVersionFormat, err) - } - if minor < conmonMinMinorVersion { - return define.ErrConmonOutdated - } - if minor > conmonMinMinorVersion { - return nil - } - - patch, err := strconv.Atoi(matches[3]) - if err != nil { - return fmt.Errorf("%v: %w", define.ErrConmonVersionFormat, err) - } - if patch < conmonMinPatchVersion { - return define.ErrConmonOutdated - } - if patch > conmonMinPatchVersion { - return nil - } - - return nil -} - // TmpDir gets the current Libpod temporary files directory. func (r *Runtime) TmpDir() (string, error) { if !r.valid { @@ -798,7 +687,7 @@ func (r *Runtime) GetConfig() (*config.Config, error) { // Copy so the caller won't be able to modify the actual config if err := JSONDeepCopy(rtConfig, config); err != nil { - return nil, fmt.Errorf("error copying config: %w", err) + return nil, fmt.Errorf("copying config: %w", err) } return config, nil @@ -909,7 +798,7 @@ func (r *Runtime) Shutdown(force bool) error { // Note that the libimage runtime shuts down the store. if err := r.libimageRuntime.Shutdown(force); err != nil { - lastError = fmt.Errorf("error shutting down container storage: %w", err) + lastError = fmt.Errorf("shutting down container storage: %w", err) } } if err := r.state.Close(); err != nil { @@ -941,15 +830,15 @@ func (r *Runtime) refresh(alivePath string) error { // Containers, pods, and volumes must also reacquire their locks. ctrs, err := r.state.AllContainers() if err != nil { - return fmt.Errorf("error retrieving all containers from state: %w", err) + return fmt.Errorf("retrieving all containers from state: %w", err) } pods, err := r.state.AllPods() if err != nil { - return fmt.Errorf("error retrieving all pods from state: %w", err) + return fmt.Errorf("retrieving all pods from state: %w", err) } vols, err := r.state.AllVolumes() if err != nil { - return fmt.Errorf("error retrieving all volumes from state: %w", err) + return fmt.Errorf("retrieving all volumes from state: %w", err) } // No locks are taken during pod, volume, and container refresh. // Furthermore, the pod/volume/container refresh() functions are not @@ -977,7 +866,7 @@ func (r *Runtime) refresh(alivePath string) error { // Create a file indicating the runtime is alive and ready file, err := os.OpenFile(alivePath, os.O_RDONLY|os.O_CREATE, 0644) if err != nil { - return fmt.Errorf("error creating runtime status file: %w", err) + return fmt.Errorf("creating runtime status file: %w", err) } defer file.Close() @@ -1141,9 +1030,6 @@ func (r *Runtime) mergeDBConfig(dbConfig *DBConfig) { logrus.Debugf("Overriding tmp dir %q with %q from database", c.TmpDir, dbConfig.LibpodTmp) } c.TmpDir = dbConfig.LibpodTmp - if c.EventsLogFilePath == "" { - c.EventsLogFilePath = filepath.Join(dbConfig.LibpodTmp, "events", "events.log") - } } if !r.storageSet.VolumePathSet && dbConfig.VolumePath != "" { @@ -1208,7 +1094,7 @@ func (r *Runtime) getVolumePlugin(volConfig *VolumeConfig) (*plugin.VolumePlugin return nil, fmt.Errorf("no volume plugin with name %s available: %w", name, define.ErrMissingPlugin) } - return plugin.GetVolumePlugin(name, pluginPath, timeout) + return plugin.GetVolumePlugin(name, pluginPath, timeout, r.config) } // GetSecretsStorageDir returns the directory that the secrets manager should take diff --git a/libpod/runtime_cstorage.go b/libpod/runtime_cstorage.go index 047375628..372434b49 100644 --- a/libpod/runtime_cstorage.go +++ b/libpod/runtime_cstorage.go @@ -39,7 +39,7 @@ func (r *Runtime) ListStorageContainers() ([]*StorageContainer, error) { // Look up if container is in state hasCtr, err := r.state.HasContainer(ctr.ID) if err != nil { - return nil, fmt.Errorf("error looking up container %s in state: %w", ctr.ID, err) + return nil, fmt.Errorf("looking up container %s in state: %w", ctr.ID, err) } storageCtr.PresentInLibpod = hasCtr @@ -64,7 +64,7 @@ func (r *Runtime) RemoveStorageContainer(idOrName string, force bool) error { if errors.Is(err, storage.ErrLayerUnknown) { return fmt.Errorf("no container with ID or name %q found: %w", idOrName, define.ErrNoSuchCtr) } - return fmt.Errorf("error looking up container %q: %w", idOrName, err) + return fmt.Errorf("looking up container %q: %w", idOrName, err) } // Lookup returns an ID but it's not guaranteed to be a container ID. @@ -74,7 +74,7 @@ func (r *Runtime) RemoveStorageContainer(idOrName string, force bool) error { if errors.Is(err, storage.ErrContainerUnknown) { return fmt.Errorf("%q does not refer to a container: %w", idOrName, define.ErrNoSuchCtr) } - return fmt.Errorf("error retrieving container %q: %w", idOrName, err) + return fmt.Errorf("retrieving container %q: %w", idOrName, err) } // Error out if the container exists in libpod @@ -115,7 +115,7 @@ func (r *Runtime) RemoveStorageContainer(idOrName string, force bool) error { logrus.Infof("Storage for container %s already removed", ctr.ID) return nil } - return fmt.Errorf("error removing storage for container %q: %w", idOrName, err) + return fmt.Errorf("removing storage for container %q: %w", idOrName, err) } return nil diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index ce0fd869d..7b3cbadfa 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -86,7 +86,7 @@ func (r *Runtime) RestoreContainer(ctx context.Context, rSpec *spec.Spec, config ctr, err := r.initContainerVariables(rSpec, config) if err != nil { - return nil, fmt.Errorf("error initializing container variables: %w", err) + return nil, fmt.Errorf("initializing container variables: %w", err) } // For an imported checkpoint no one has ever set the StartedTime. Set it now. ctr.state.StartedTime = time.Now() @@ -126,7 +126,7 @@ func (r *Runtime) RenameContainer(ctx context.Context, ctr *Container, newName s // the config was re-written. newConf, err := r.state.GetContainerConfig(ctr.ID()) if err != nil { - return nil, fmt.Errorf("error retrieving container %s configuration from DB to remove: %w", ctr.ID(), err) + return nil, fmt.Errorf("retrieving container %s configuration from DB to remove: %w", ctr.ID(), err) } ctr.config = newConf @@ -143,7 +143,7 @@ func (r *Runtime) RenameContainer(ctx context.Context, ctr *Container, newName s // Set config back to the old name so reflect what is actually // present in the DB. ctr.config.Name = oldName - return nil, fmt.Errorf("error renaming container %s: %w", ctr.ID(), err) + return nil, fmt.Errorf("renaming container %s: %w", ctr.ID(), err) } // Step 3: rename the container in c/storage. @@ -169,14 +169,19 @@ func (r *Runtime) initContainerVariables(rSpec *spec.Spec, config *ContainerConf ctr.state = new(ContainerState) if config == nil { - ctr.config.ID = stringid.GenerateNonCryptoID() + ctr.config.ID = stringid.GenerateRandomID() size, err := units.FromHumanSize(r.config.Containers.ShmSize) - if err != nil { - return nil, fmt.Errorf("converting containers.conf ShmSize %s to an int: %w", r.config.Containers.ShmSize, err) + if useDevShm { + if err != nil { + return nil, fmt.Errorf("converting containers.conf ShmSize %s to an int: %w", r.config.Containers.ShmSize, err) + } + ctr.config.ShmSize = size + ctr.config.NoShm = false + ctr.config.NoShmShare = false + } else { + ctr.config.NoShm = true + ctr.config.NoShmShare = true } - ctr.config.ShmSize = size - ctr.config.NoShm = false - ctr.config.NoShmShare = false ctr.config.StopSignal = 15 ctr.config.StopTimeout = r.config.Engine.StopTimeout @@ -184,11 +189,11 @@ func (r *Runtime) initContainerVariables(rSpec *spec.Spec, config *ContainerConf // This is a restore from an imported checkpoint ctr.restoreFromCheckpoint = true if err := JSONDeepCopy(config, ctr.config); err != nil { - return nil, fmt.Errorf("error copying container config for restore: %w", err) + return nil, fmt.Errorf("copying container config for restore: %w", err) } // If the ID is empty a new name for the restored container was requested if ctr.config.ID == "" { - ctr.config.ID = stringid.GenerateNonCryptoID() + ctr.config.ID = stringid.GenerateRandomID() } // Reset the log path to point to the default ctr.config.LogPath = "" @@ -224,12 +229,12 @@ func (r *Runtime) newContainer(ctx context.Context, rSpec *spec.Spec, options .. ctr, err = r.initContainerVariables(rSpec, nil) if err != nil { - return nil, fmt.Errorf("error initializing container variables: %w", err) + return nil, fmt.Errorf("initializing container variables: %w", err) } for _, option := range options { if err := option(ctr); err != nil { - return nil, fmt.Errorf("error running container create option: %w", err) + return nil, fmt.Errorf("running container create option: %w", err) } } @@ -296,7 +301,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai // Allocate a lock for the container lock, err := r.lockManager.AllocateLock() if err != nil { - return nil, fmt.Errorf("error allocating lock for new container: %w", err) + return nil, fmt.Errorf("allocating lock for new container: %w", err) } ctr.lock = lock ctr.config.LockID = ctr.lock.ID() @@ -350,7 +355,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai if pod != nil && pod.config.UsePodCgroup && !ctr.IsInfra() { podCgroup, err := pod.CgroupPath() if err != nil { - return nil, fmt.Errorf("error retrieving pod %s cgroup: %w", pod.ID(), err) + return nil, fmt.Errorf("retrieving pod %s cgroup: %w", pod.ID(), err) } expectPodCgroup, err := ctr.expectPodCgroup() if err != nil { @@ -375,7 +380,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai case pod != nil && pod.config.UsePodCgroup && !ctr.IsInfra(): podCgroup, err := pod.CgroupPath() if err != nil { - return nil, fmt.Errorf("error retrieving pod %s cgroup: %w", pod.ID(), err) + return nil, fmt.Errorf("retrieving pod %s cgroup: %w", pod.ID(), err) } ctr.config.CgroupParent = podCgroup case rootless.IsRootless() && ctr.config.CgroupsMode != cgroupSplit: @@ -427,7 +432,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai defer func() { if retErr != nil { if err := ctr.teardownStorage(); err != nil { - logrus.Errorf("Removing partially-created container root filesystem: %s", err) + logrus.Errorf("Removing partially-created container root filesystem: %v", err) } } }() @@ -461,7 +466,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai if vol.Name == "" { // Anonymous volume. We'll need to create it. // It needs a name first. - vol.Name = stringid.GenerateNonCryptoID() + vol.Name = stringid.GenerateRandomID() isAnonymous = true } else { // Check if it exists already @@ -471,9 +476,14 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai // The volume exists, we're good continue } else if !errors.Is(err, define.ErrNoSuchVolume) { - return nil, fmt.Errorf("error retrieving named volume %s for new container: %w", vol.Name, err) + return nil, fmt.Errorf("retrieving named volume %s for new container: %w", vol.Name, err) } } + if vol.IsAnonymous { + // If SetAnonymous is true, make this an anonymous volume + // this is needed for emptyDir volumes from kube yamls + isAnonymous = true + } logrus.Debugf("Creating new volume %s for container", vol.Name) @@ -504,7 +514,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai } newVol, err := r.newVolume(false, volOptions...) if err != nil { - return nil, fmt.Errorf("error creating named volume %q: %w", vol.Name, err) + return nil, fmt.Errorf("creating named volume %q: %w", vol.Name, err) } ctrNamedVolumes = append(ctrNamedVolumes, newVol) @@ -523,7 +533,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai } } - if !MountExists(ctr.config.Spec.Mounts, "/dev/shm") && ctr.config.ShmDir == "" && !ctr.config.NoShm { + if useDevShm && !MountExists(ctr.config.Spec.Mounts, "/dev/shm") && ctr.config.ShmDir == "" && !ctr.config.NoShm { ctr.config.ShmDir = filepath.Join(ctr.bundlePath(), "shm") if err := os.MkdirAll(ctr.config.ShmDir, 0700); err != nil { if !os.IsExist(err) { @@ -571,7 +581,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai // be removed also if and only if the container is the sole user // Otherwise, RemoveContainer will return an error if the container is running func (r *Runtime) RemoveContainer(ctx context.Context, c *Container, force bool, removeVolume bool, timeout *uint) error { - return r.removeContainer(ctx, c, force, removeVolume, false, timeout) + return r.removeContainer(ctx, c, force, removeVolume, false, false, timeout) } // Internal function to remove a container. @@ -579,7 +589,9 @@ func (r *Runtime) RemoveContainer(ctx context.Context, c *Container, force bool, // removePod is used only when removing pods. It instructs Podman to ignore // infra container protections, and *not* remove from the database (as pod // remove will handle that). -func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, removeVolume, removePod bool, timeout *uint) error { +// ignoreDeps is *DANGEROUS* and should not be used outside of a very specific +// context (alternate pod removal code, where graph traversal is not possible). +func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, removeVolume, removePod, ignoreDeps bool, timeout *uint) error { if !c.valid { if ok, _ := r.state.HasContainer(c.ID()); !ok { // Container probably already removed @@ -596,7 +608,7 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo // exist once we're done. newConf, err := r.state.GetContainerConfig(c.ID()) if err != nil { - return fmt.Errorf("error retrieving container %s configuration from DB to remove: %w", c.ID(), err) + return fmt.Errorf("retrieving container %s configuration from DB to remove: %w", c.ID(), err) } c.config = newConf @@ -608,25 +620,27 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo // pod. var pod *Pod runtime := c.runtime - if c.config.Pod != "" && !removePod { + if c.config.Pod != "" { pod, err = r.state.Pod(c.config.Pod) if err != nil { return fmt.Errorf("container %s is in pod %s, but pod cannot be retrieved: %w", c.ID(), pod.ID(), err) } - // Lock the pod while we're removing container - if pod.config.LockID == c.config.LockID { - return fmt.Errorf("container %s and pod %s share lock ID %d: %w", c.ID(), pod.ID(), c.config.LockID, define.ErrWillDeadlock) - } - pod.lock.Lock() - defer pod.lock.Unlock() - if err := pod.updatePod(); err != nil { - return err - } + if !removePod { + // Lock the pod while we're removing container + if pod.config.LockID == c.config.LockID { + return fmt.Errorf("container %s and pod %s share lock ID %d: %w", c.ID(), pod.ID(), c.config.LockID, define.ErrWillDeadlock) + } + pod.lock.Lock() + defer pod.lock.Unlock() + if err := pod.updatePod(); err != nil { + return err + } - infraID := pod.state.InfraContainerID - if c.ID() == infraID { - return fmt.Errorf("container %s is the infra container of pod %s and cannot be removed without removing the pod", c.ID(), pod.ID()) + infraID := pod.state.InfraContainerID + if c.ID() == infraID { + return fmt.Errorf("container %s is the infra container of pod %s and cannot be removed without removing the pod", c.ID(), pod.ID()) + } } } @@ -686,7 +700,7 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo // Check that no other containers depend on the container. // Only used if not removing a pod - pods guarantee that all // deps will be evicted at the same time. - if !removePod { + if !ignoreDeps { deps, err := r.state.ContainerInUse(c) if err != nil { return err @@ -717,7 +731,7 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo if ok, _ := r.state.HasContainer(c.ID()); !ok { // When the container has already been removed, the OCI runtime directory remain. if err := c.cleanupRuntime(ctx); err != nil { - return fmt.Errorf("error cleaning up container %s from OCI runtime: %w", c.ID(), err) + return fmt.Errorf("cleaning up container %s from OCI runtime: %w", c.ID(), err) } return nil } @@ -729,7 +743,7 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo // Do this before we set ContainerStateRemoving, to ensure that we can // actually remove from the OCI runtime. if err := c.cleanup(ctx); err != nil { - cleanupErr = fmt.Errorf("error cleaning up container %s: %w", c.ID(), err) + cleanupErr = fmt.Errorf("cleaning up container %s: %w", c.ID(), err) } // Set ContainerStateRemoving @@ -767,13 +781,11 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo if c.config.Pod != "" { // If we're removing the pod, the container will be evicted // from the state elsewhere - if !removePod { - if err := r.state.RemoveContainerFromPod(pod, c); err != nil { - if cleanupErr == nil { - cleanupErr = err - } else { - logrus.Errorf("Removing container %s from database: %v", c.ID(), err) - } + if err := r.state.RemoveContainerFromPod(pod, c); err != nil { + if cleanupErr == nil { + cleanupErr = err + } else { + logrus.Errorf("Removing container %s from database: %v", c.ID(), err) } } } else { @@ -788,8 +800,8 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo // Deallocate the container's lock if err := c.lock.Free(); err != nil { - if cleanupErr == nil { - cleanupErr = fmt.Errorf("error freeing lock for container %s: %w", c.ID(), err) + if cleanupErr == nil && !os.IsNotExist(err) { + cleanupErr = fmt.Errorf("freeing lock for container %s: %w", c.ID(), err) } else { logrus.Errorf("Free container lock: %v", err) } @@ -814,11 +826,11 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo // Ignore error, since podman will report original error volumesFrom, _ := c.volumesFrom() if len(volumesFrom) > 0 { - logrus.Debugf("Cleaning up volume not possible since volume is in use (%s)", v) + logrus.Debugf("Cleaning up volume not possible since volume is in use (%s)", v.Name) continue } } - logrus.Errorf("Cleaning up volume (%s): %v", v, err) + logrus.Errorf("Cleaning up volume (%s): %v", v.Name, err) } } } @@ -862,7 +874,7 @@ func (r *Runtime) evictContainer(ctx context.Context, idOrName string, removeVol if err == nil { logrus.Infof("Container %s successfully retrieved from state, attempting normal removal", id) // Assume force = true for the evict case - err = r.removeContainer(ctx, tmpCtr, true, removeVolume, false, timeout) + err = r.removeContainer(ctx, tmpCtr, true, removeVolume, false, false, timeout) if !tmpCtr.valid { // If the container is marked invalid, remove succeeded // in kicking it out of the state - no need to continue. @@ -968,7 +980,7 @@ func (r *Runtime) evictContainer(ctx context.Context, idOrName string, removeVol continue } if err := r.removeVolume(ctx, volume, false, timeout, false); err != nil && err != define.ErrNoSuchVolume && err != define.ErrVolumeBeingUsed { - logrus.Errorf("Cleaning up volume (%s): %v", v, err) + logrus.Errorf("Cleaning up volume (%s): %v", v.Name, err) } } } @@ -995,7 +1007,7 @@ func (r *Runtime) RemoveDepend(ctx context.Context, rmCtr *Container, force bool return nil, err } for _, cID := range podContainerIDS { - rmReports = append(rmReports, &reports.RmReport{Id: cID}) + rmReports = append(rmReports, &reports.RmReport{Id: cID, RawInput: cID}) } return rmReports, nil } @@ -1023,8 +1035,8 @@ func (r *Runtime) RemoveDepend(ctx context.Context, rmCtr *Container, force bool rmReports = append(rmReports, reports...) } - report := reports.RmReport{Id: rmCtr.ID()} - report.Err = r.removeContainer(ctx, rmCtr, force, removeVolume, false, timeout) + report := reports.RmReport{Id: rmCtr.ID(), RawInput: rmCtr.ID()} + report.Err = r.removeContainer(ctx, rmCtr, force, removeVolume, false, false, timeout) return append(rmReports, &report), nil } @@ -1217,7 +1229,7 @@ func (r *Runtime) MountStorageContainer(id string) (string, error) { } mountPoint, err := r.store.Mount(container.ID, "") if err != nil { - return "", fmt.Errorf("error mounting storage for container %s: %w", id, err) + return "", fmt.Errorf("mounting storage for container %s: %w", id, err) } return mountPoint, nil } @@ -1265,7 +1277,7 @@ func (r *Runtime) StorageContainers() ([]storage.Container, error) { storeContainers, err := r.store.Containers() if err != nil { - return nil, fmt.Errorf("error reading list of all storage containers: %w", err) + return nil, fmt.Errorf("reading list of all storage containers: %w", err) } retCtrs := []storage.Container{} for _, container := range storeContainers { diff --git a/libpod/runtime_ctr_freebsd.go b/libpod/runtime_ctr_freebsd.go new file mode 100644 index 000000000..a8870a38c --- /dev/null +++ b/libpod/runtime_ctr_freebsd.go @@ -0,0 +1,5 @@ +package libpod + +const ( + useDevShm = false +) diff --git a/libpod/runtime_ctr_linux.go b/libpod/runtime_ctr_linux.go new file mode 100644 index 000000000..7812d8238 --- /dev/null +++ b/libpod/runtime_ctr_linux.go @@ -0,0 +1,5 @@ +package libpod + +const ( + useDevShm = true +) diff --git a/libpod/runtime_img.go b/libpod/runtime_img.go index d04607d2e..dacbd752f 100644 --- a/libpod/runtime_img.go +++ b/libpod/runtime_img.go @@ -47,7 +47,7 @@ func (r *Runtime) RemoveContainersForImageCallback(ctx context.Context) libimage return fmt.Errorf("removing image %s: container %s using image could not be removed: %w", imageID, ctr.ID(), err) } } else { - if err := r.removeContainer(ctx, ctr, true, false, false, timeout); err != nil { + if err := r.removeContainer(ctx, ctr, true, false, false, false, timeout); err != nil { return fmt.Errorf("removing image %s: container %s using image could not be removed: %w", imageID, ctr.ID(), err) } } @@ -107,7 +107,7 @@ func (r *Runtime) Build(ctx context.Context, options buildahDefine.BuildOptions, func DownloadFromFile(reader *os.File) (string, error) { outFile, err := ioutil.TempFile(util.Tmpdir(), "import") if err != nil { - return "", fmt.Errorf("error creating file: %w", err) + return "", fmt.Errorf("creating file: %w", err) } defer outFile.Close() @@ -115,7 +115,7 @@ func DownloadFromFile(reader *os.File) (string, error) { _, err = io.Copy(outFile, reader) if err != nil { - return "", fmt.Errorf("error saving %s to %s: %w", reader.Name(), outFile.Name(), err) + return "", fmt.Errorf("saving %s to %s: %w", reader.Name(), outFile.Name(), err) } return outFile.Name(), nil diff --git a/libpod/runtime_migrate.go b/libpod/runtime_migrate.go index 139638a6b..36901d4d0 100644 --- a/libpod/runtime_migrate.go +++ b/libpod/runtime_migrate.go @@ -92,7 +92,7 @@ func (r *Runtime) migrate() error { if needsWrite { if err := r.state.RewriteContainerConfig(ctr, ctr.config); err != nil { - return fmt.Errorf("error rewriting config for container %s: %w", ctr.ID(), err) + return fmt.Errorf("rewriting config for container %s: %w", ctr.ID(), err) } } } diff --git a/libpod/runtime_migrate_unsupported.go b/libpod/runtime_migrate_unsupported.go new file mode 100644 index 000000000..77c2737a9 --- /dev/null +++ b/libpod/runtime_migrate_unsupported.go @@ -0,0 +1,16 @@ +//go:build !linux +// +build !linux + +package libpod + +import ( + "errors" +) + +func (r *Runtime) stopPauseProcess() error { + return errors.New("not implemented (*Runtime) stopPauseProcess") +} + +func (r *Runtime) migrate() error { + return errors.New("not implemented (*Runtime) migrate") +} diff --git a/libpod/runtime_pod_linux.go b/libpod/runtime_pod_linux.go index 57c0b5c48..24e9f3da7 100644 --- a/libpod/runtime_pod_linux.go +++ b/libpod/runtime_pod_linux.go @@ -17,6 +17,7 @@ import ( "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/specgen" + "github.com/hashicorp/go-multierror" "github.com/sirupsen/logrus" ) @@ -36,14 +37,14 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option for _, option := range options { if err := option(pod); err != nil { - return nil, fmt.Errorf("error running pod create option: %w", err) + return nil, fmt.Errorf("running pod create option: %w", err) } } // Allocate a lock for the pod lock, err := r.lockManager.AllocateLock() if err != nil { - return nil, fmt.Errorf("error allocating lock for new pod: %w", err) + return nil, fmt.Errorf("allocating lock for new pod: %w", err) } pod.lock = lock pod.config.LockID = pod.lock.ID() @@ -160,7 +161,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option } } if addPodErr != nil { - return nil, fmt.Errorf("error adding pod to state: %w", addPodErr) + return nil, fmt.Errorf("adding pod to state: %w", addPodErr) } return pod, nil @@ -191,29 +192,9 @@ func (r *Runtime) SavePod(pod *Pod) error { return nil } -func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, timeout *uint) error { - if err := p.updatePod(); err != nil { - return err - } - - ctrs, err := r.state.PodContainers(p) - if err != nil { - return err - } - numCtrs := len(ctrs) - - // If the only running container in the pod is the pause container, remove the pod and container unconditionally. - pauseCtrID := p.state.InfraContainerID - if numCtrs == 1 && ctrs[0].ID() == pauseCtrID { - removeCtrs = true - force = true - } - if !removeCtrs && numCtrs > 0 { - return fmt.Errorf("pod %s contains containers and cannot be removed: %w", p.ID(), define.ErrCtrExists) - } - - ctrNamedVolumes := make(map[string]*ContainerNamedVolume) - +// DO NOT USE THIS FUNCTION DIRECTLY. Use removePod(), below. It will call +// removeMalformedPod() if necessary. +func (r *Runtime) removeMalformedPod(ctx context.Context, p *Pod, ctrs []*Container, force bool, timeout *uint, ctrNamedVolumes map[string]*ContainerNamedVolume) error { var removalErr error for _, ctr := range ctrs { err := func() error { @@ -231,7 +212,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, ctrNamedVolumes[vol.Name] = vol } - return r.removeContainer(ctx, ctr, force, false, true, timeout) + return r.removeContainer(ctx, ctr, force, false, true, true, timeout) }() if removalErr == nil { @@ -261,6 +242,69 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, return err } + return nil +} + +func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, timeout *uint) error { + if err := p.updatePod(); err != nil { + return err + } + + ctrs, err := r.state.PodContainers(p) + if err != nil { + return err + } + numCtrs := len(ctrs) + + // If the only running container in the pod is the pause container, remove the pod and container unconditionally. + pauseCtrID := p.state.InfraContainerID + if numCtrs == 1 && ctrs[0].ID() == pauseCtrID { + removeCtrs = true + force = true + } + if !removeCtrs && numCtrs > 0 { + return fmt.Errorf("pod %s contains containers and cannot be removed: %w", p.ID(), define.ErrCtrExists) + } + + var removalErr error + ctrNamedVolumes := make(map[string]*ContainerNamedVolume) + + // Build a graph of all containers in the pod. + graph, err := BuildContainerGraph(ctrs) + if err != nil { + // We have to allow the pod to be removed. + // But let's only do it if force is set. + if !force { + return fmt.Errorf("cannot create container graph for pod %s: %w", p.ID(), err) + } + + removalErr = fmt.Errorf("creating container graph for pod %s failed, fell back to loop removal: %w", p.ID(), err) + + if err := r.removeMalformedPod(ctx, p, ctrs, force, timeout, ctrNamedVolumes); err != nil { + logrus.Errorf("Error creating container graph for pod %s: %v. Falling back to loop removal.", p.ID(), err) + return err + } + } else { + ctrErrors := make(map[string]error) + ctrsVisited := make(map[string]bool) + + for _, node := range graph.notDependedOnNodes { + removeNode(ctx, node, p, force, timeout, false, ctrErrors, ctrsVisited, ctrNamedVolumes) + } + + // This is gross, but I don't want to change the signature on + // removePod - especially since any change here eventually has + // to map down to one error unless we want to make a breaking + // API change. + if len(ctrErrors) > 0 { + var allErrs error + for id, err := range ctrErrors { + allErrs = multierror.Append(allErrs, fmt.Errorf("removing container %s from pod %s: %w", id, p.ID(), err)) + } + return allErrs + } + } + for volName := range ctrNamedVolumes { volume, err := r.state.Volume(volName) if err != nil && !errors.Is(err, define.ErrNoSuchVolume) { @@ -286,7 +330,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, case config.SystemdCgroupsManager: if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil { if removalErr == nil { - removalErr = fmt.Errorf("error removing pod %s cgroup: %w", p.ID(), err) + removalErr = fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err) } else { logrus.Errorf("Deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) } @@ -300,7 +344,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, conmonCgroup, err := cgroups.Load(conmonCgroupPath) if err != nil && err != cgroups.ErrCgroupDeleted && err != cgroups.ErrCgroupV1Rootless { if removalErr == nil { - removalErr = fmt.Errorf("error retrieving pod %s conmon cgroup: %w", p.ID(), err) + removalErr = fmt.Errorf("retrieving pod %s conmon cgroup: %w", p.ID(), err) } else { logrus.Debugf("Error retrieving pod %s conmon cgroup %s: %v", p.ID(), conmonCgroupPath, err) } @@ -308,7 +352,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, if err == nil { if err = conmonCgroup.Delete(); err != nil { if removalErr == nil { - removalErr = fmt.Errorf("error removing pod %s conmon cgroup: %w", p.ID(), err) + removalErr = fmt.Errorf("removing pod %s conmon cgroup: %w", p.ID(), err) } else { logrus.Errorf("Deleting pod %s conmon cgroup %s: %v", p.ID(), conmonCgroupPath, err) } @@ -317,7 +361,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, cgroup, err := cgroups.Load(p.state.CgroupPath) if err != nil && err != cgroups.ErrCgroupDeleted && err != cgroups.ErrCgroupV1Rootless { if removalErr == nil { - removalErr = fmt.Errorf("error retrieving pod %s cgroup: %w", p.ID(), err) + removalErr = fmt.Errorf("retrieving pod %s cgroup: %w", p.ID(), err) } else { logrus.Errorf("Retrieving pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) } @@ -325,7 +369,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, if err == nil { if err := cgroup.Delete(); err != nil { if removalErr == nil { - removalErr = fmt.Errorf("error removing pod %s cgroup: %w", p.ID(), err) + removalErr = fmt.Errorf("removing pod %s cgroup: %w", p.ID(), err) } else { logrus.Errorf("Deleting pod %s cgroup %s: %v", p.ID(), p.state.CgroupPath, err) } @@ -362,7 +406,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, // Deallocate the pod lock if err := p.lock.Free(); err != nil { if removalErr == nil { - removalErr = fmt.Errorf("error freeing pod %s lock: %w", p.ID(), err) + removalErr = fmt.Errorf("freeing pod %s lock: %w", p.ID(), err) } else { logrus.Errorf("Freeing pod %s lock: %v", p.ID(), err) } diff --git a/libpod/runtime_pod_unsupported.go b/libpod/runtime_pod_unsupported.go new file mode 100644 index 000000000..0c7ff8655 --- /dev/null +++ b/libpod/runtime_pod_unsupported.go @@ -0,0 +1,30 @@ +//go:build !linux +// +build !linux + +package libpod + +import ( + "context" + "errors" + + "github.com/containers/podman/v4/pkg/specgen" +) + +// NewPod makes a new, empty pod +func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, options ...PodCreateOption) (_ *Pod, deferredErr error) { + return nil, errors.New("not implemented (*Runtime) NewPod") +} + +// AddInfra adds the created infra container to the pod state +func (r *Runtime) AddInfra(ctx context.Context, pod *Pod, infraCtr *Container) (*Pod, error) { + return nil, errors.New("not implemented (*Runtime) AddInfra") +} + +// SavePod is a helper function to save the pod state from outside of libpod +func (r *Runtime) SavePod(pod *Pod) error { + return errors.New("not implemented (*Runtime) SavePod") +} + +func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, timeout *uint) error { + return errors.New("not implemented (*Runtime) removePod") +} diff --git a/libpod/runtime_renumber.go b/libpod/runtime_renumber.go index 9149dd72f..ff70081d8 100644 --- a/libpod/runtime_renumber.go +++ b/libpod/runtime_renumber.go @@ -27,7 +27,7 @@ func (r *Runtime) renumberLocks() error { for _, ctr := range allCtrs { lock, err := r.lockManager.AllocateLock() if err != nil { - return fmt.Errorf("error allocating lock for container %s: %w", ctr.ID(), err) + return fmt.Errorf("allocating lock for container %s: %w", ctr.ID(), err) } ctr.config.LockID = lock.ID() @@ -44,7 +44,7 @@ func (r *Runtime) renumberLocks() error { for _, pod := range allPods { lock, err := r.lockManager.AllocateLock() if err != nil { - return fmt.Errorf("error allocating lock for pod %s: %w", pod.ID(), err) + return fmt.Errorf("allocating lock for pod %s: %w", pod.ID(), err) } pod.config.LockID = lock.ID() @@ -61,7 +61,7 @@ func (r *Runtime) renumberLocks() error { for _, vol := range allVols { lock, err := r.lockManager.AllocateLock() if err != nil { - return fmt.Errorf("error allocating lock for volume %s: %w", vol.Name(), err) + return fmt.Errorf("allocating lock for volume %s: %w", vol.Name(), err) } vol.config.LockID = lock.ID() diff --git a/libpod/runtime_test.go b/libpod/runtime_test.go new file mode 100644 index 000000000..2e16c7fcd --- /dev/null +++ b/libpod/runtime_test.go @@ -0,0 +1,28 @@ +package libpod + +import ( + "math/rand" + "os" + "testing" + + "github.com/stretchr/testify/assert" +) + +func Test_generateName(t *testing.T) { + state, path, _, err := getEmptyBoltState() + assert.NoError(t, err) + defer os.RemoveAll(path) + defer state.Close() + + r := &Runtime{ + state: state, + } + + // Test that (*Runtime).generateName returns different names + // if called twice, even if the global RNG has the default + // seed. + n1, _ := r.generateName() + rand.Seed(1) + n2, _ := r.generateName() + assert.NotEqual(t, n1, n2) +} diff --git a/libpod/runtime_volume_linux.go b/libpod/runtime_volume_linux.go index 1f354e41b..08fdbf977 100644 --- a/libpod/runtime_volume_linux.go +++ b/libpod/runtime_volume_linux.go @@ -42,7 +42,7 @@ func (r *Runtime) newVolume(noCreatePluginVolume bool, options ...VolumeCreateOp } if volume.config.Name == "" { - volume.config.Name = stringid.GenerateNonCryptoID() + volume.config.Name = stringid.GenerateRandomID() } if volume.config.Driver == "" { volume.config.Driver = define.VolumeDriverLocal @@ -184,7 +184,7 @@ func (r *Runtime) UpdateVolumePlugins(ctx context.Context) *define.VolumeReload ) for driverName, socket := range r.config.Engine.VolumePlugins { - driver, err := volplugin.GetVolumePlugin(driverName, socket, 0) + driver, err := volplugin.GetVolumePlugin(driverName, socket, nil, r.config) if err != nil { errs = append(errs, err) continue @@ -324,7 +324,7 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeo logrus.Debugf("Removing container %s (depends on volume %q)", ctr.ID(), v.Name()) - if err := r.removeContainer(ctx, ctr, force, false, false, timeout); err != nil { + if err := r.removeContainer(ctx, ctr, force, false, false, false, timeout); err != nil { return fmt.Errorf("removing container %s that depends on volume %s: %w", ctr.ID(), v.Name(), err) } } diff --git a/libpod/runtime_volume_unsupported.go b/libpod/runtime_volume_unsupported.go new file mode 100644 index 000000000..c2816b817 --- /dev/null +++ b/libpod/runtime_volume_unsupported.go @@ -0,0 +1,42 @@ +//go:build !linux +// +build !linux + +package libpod + +import ( + "context" + "errors" + + "github.com/containers/podman/v4/libpod/define" +) + +// NewVolume creates a new empty volume +func (r *Runtime) NewVolume(ctx context.Context, options ...VolumeCreateOption) (*Volume, error) { + if !r.valid { + return nil, define.ErrRuntimeStopped + } + return r.newVolume(false, options...) +} + +// NewVolume creates a new empty volume +func (r *Runtime) newVolume(noCreatePluginVolume bool, options ...VolumeCreateOption) (*Volume, error) { + return nil, errors.New("not implemented (*Runtime) newVolume") +} + +// UpdateVolumePlugins reads all volumes from all configured volume plugins and +// imports them into the libpod db. It also checks if existing libpod volumes +// are removed in the plugin, in this case we try to remove it from libpod. +// On errors we continue and try to do as much as possible. all errors are +// returned as array in the returned struct. +// This function has many race conditions, it is best effort but cannot guarantee +// a perfect state since plugins can be modified from the outside at any time. +func (r *Runtime) UpdateVolumePlugins(ctx context.Context) *define.VolumeReload { + return nil +} + +// removeVolume removes the specified volume from state as well tears down its mountpoint and storage. +// ignoreVolumePlugin is used to only remove the volume from the db and not the plugin, +// this is required when the volume was already removed from the plugin, i.e. in UpdateVolumePlugins(). +func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeout *uint, ignoreVolumePlugin bool) error { + return errors.New("not implemented (*Runtime) removeVolume") +} diff --git a/libpod/stats_common.go b/libpod/stats_common.go new file mode 100644 index 000000000..122160bda --- /dev/null +++ b/libpod/stats_common.go @@ -0,0 +1,49 @@ +//go:build linux || freebsd +// +build linux freebsd + +package libpod + +import ( + "fmt" + + "github.com/containers/podman/v4/libpod/define" +) + +// GetContainerStats gets the running stats for a given container. +// The previousStats is used to correctly calculate cpu percentages. You +// should pass nil if there is no previous stat for this container. +func (c *Container) GetContainerStats(previousStats *define.ContainerStats) (*define.ContainerStats, error) { + stats := new(define.ContainerStats) + stats.ContainerID = c.ID() + stats.Name = c.Name() + + if c.config.NoCgroups { + return nil, fmt.Errorf("cannot run top on container %s as it did not create a cgroup: %w", c.ID(), define.ErrNoCgroups) + } + + if !c.batched { + c.lock.Lock() + defer c.lock.Unlock() + if err := c.syncContainer(); err != nil { + return stats, err + } + } + + // returns stats with the fields' default values respective of their type + if c.state.State != define.ContainerStateRunning && c.state.State != define.ContainerStatePaused { + return stats, nil + } + + if previousStats == nil { + previousStats = &define.ContainerStats{ + // if we have no prev stats use the container start time as prev time + // otherwise we cannot correctly calculate the CPU percentage + SystemNano: uint64(c.state.StartedTime.UnixNano()), + } + } + + if err := c.getPlatformContainerStats(stats, previousStats); err != nil { + return nil, err + } + return stats, nil +} diff --git a/libpod/stats_freebsd.go b/libpod/stats_freebsd.go new file mode 100644 index 000000000..53bc3f19a --- /dev/null +++ b/libpod/stats_freebsd.go @@ -0,0 +1,153 @@ +package libpod + +import ( + "fmt" + "math" + "strings" + "time" + + "github.com/containers/common/pkg/cgroups" + "github.com/containers/podman/v4/libpod/define" + "github.com/containers/podman/v4/pkg/rctl" + "github.com/containers/storage/pkg/system" + "github.com/sirupsen/logrus" +) + +// getPlatformContainerStats gets the platform-specific running stats +// for a given container. The previousStats is used to correctly +// calculate cpu percentages. You should pass nil if there is no +// previous stat for this container. +func (c *Container) getPlatformContainerStats(stats *define.ContainerStats, previousStats *define.ContainerStats) error { + now := uint64(time.Now().UnixNano()) + + jailName := c.ID() + if c.state.NetNS != nil { + jailName = c.state.NetNS.Name + "." + jailName + } + entries, err := rctl.GetRacct("jail:" + jailName) + if err != nil { + return fmt.Errorf("unable to read accounting for %s: %w", jailName, err) + } + + // If the current total usage is less than what was previously + // recorded then it means the container was restarted and runs + // in a new jail + if dur, ok := entries["wallclock"]; ok { + if previousStats.Duration > dur*1000000000 { + previousStats = &define.ContainerStats{} + } + } + + for key, val := range entries { + switch key { + case "cputime": // CPU time, in seconds + stats.CPUNano = val * 1000000000 + stats.AvgCPU = calculateCPUPercent(stats.CPUNano, 0, now, uint64(c.state.StartedTime.UnixNano())) + case "datasize": // data size, in bytes + case "stacksize": // stack size, in bytes + case "coredumpsize": // core dump size, in bytes + case "memoryuse": // resident set size, in bytes + stats.MemUsage = val + case "memorylocked": // locked memory, in bytes + case "maxproc": // number of processes + stats.PIDs = val + case "openfiles": // file descriptor table size + case "vmemoryuse": // address space limit, in bytes + case "pseudoterminals": // number of PTYs + case "swapuse": // swap space that may be reserved or used, in bytes + case "nthr": // number of threads + case "msgqqueued": // number of queued SysV messages + case "msgqsize": // SysV message queue size, in bytes + case "nmsgq": // number of SysV message queues + case "nsem": // number of SysV semaphores + case "nsemop": // number of SysV semaphores modified in a single semop(2) call + case "nshm": // number of SysV shared memory segments + case "shmsize": // SysV shared memory size, in bytes + case "wallclock": // wallclock time, in seconds + stats.Duration = val * 1000000000 + stats.UpTime = time.Duration(stats.Duration) + case "pcpu": // %CPU, in percents of a single CPU core + stats.CPU = float64(val) + case "readbps": // filesystem reads, in bytes per second + stats.BlockInput = val + case "writebps": // filesystem writes, in bytes per second + stats.BlockOutput = val + case "readiops": // filesystem reads, in operations per second + case "writeiops": // filesystem writes, in operations per second + } + } + stats.MemLimit = c.getMemLimit() + stats.SystemNano = now + + netStats, err := getContainerNetIO(c) + if err != nil { + return err + } + + // Handle case where the container is not in a network namespace + if netStats != nil { + stats.NetInput = netStats.TxBytes + stats.NetOutput = netStats.RxBytes + } else { + stats.NetInput = 0 + stats.NetOutput = 0 + } + + return nil +} + +// getMemory limit returns the memory limit for a container +func (c *Container) getMemLimit() uint64 { + memLimit := uint64(math.MaxUint64) + + if c.config.Spec.Linux != nil && c.config.Spec.Linux.Resources != nil && + c.config.Spec.Linux.Resources.Memory != nil && c.config.Spec.Linux.Resources.Memory.Limit != nil { + memLimit = uint64(*c.config.Spec.Linux.Resources.Memory.Limit) + } + + mi, err := system.ReadMemInfo() + if err != nil { + logrus.Errorf("ReadMemInfo error: %v", err) + return 0 + } + + //nolint:unconvert + physicalLimit := uint64(mi.MemTotal) + + if memLimit <= 0 || memLimit > physicalLimit { + return physicalLimit + } + + return memLimit +} + +// calculateCPUPercent calculates the cpu usage using the latest measurement in stats. +// previousCPU is the last value of stats.CPU.Usage.Total measured at the time previousSystem. +// +// (now - previousSystem) is the time delta in nanoseconds, between the measurement in previousCPU +// +// and the updated value in stats. +func calculateCPUPercent(currentCPU, previousCPU, now, previousSystem uint64) float64 { + var ( + cpuPercent = 0.0 + cpuDelta = float64(currentCPU - previousCPU) + systemDelta = float64(now - previousSystem) + ) + if systemDelta > 0.0 && cpuDelta > 0.0 { + // gets a ratio of container cpu usage total, and multiplies that by 100 to get a percentage + cpuPercent = (cpuDelta / systemDelta) * 100 + } + return cpuPercent +} + +func calculateBlockIO(stats *cgroups.Metrics) (read uint64, write uint64) { + for _, blkIOEntry := range stats.Blkio.IoServiceBytesRecursive { + switch strings.ToLower(blkIOEntry.Op) { + case "read": + read += blkIOEntry.Value + case "write": + write += blkIOEntry.Value + } + } + return +} diff --git a/libpod/stats.go b/libpod/stats_linux.go index c7e9e5128..ad8f33c91 100644 --- a/libpod/stats.go +++ b/libpod/stats_linux.go @@ -16,57 +16,33 @@ import ( "github.com/containers/podman/v4/libpod/define" ) -// GetContainerStats gets the running stats for a given container. -// The previousStats is used to correctly calculate cpu percentages. You -// should pass nil if there is no previous stat for this container. -func (c *Container) GetContainerStats(previousStats *define.ContainerStats) (*define.ContainerStats, error) { - stats := new(define.ContainerStats) - stats.ContainerID = c.ID() - stats.Name = c.Name() - +// getPlatformContainerStats gets the platform-specific running stats +// for a given container. The previousStats is used to correctly +// calculate cpu percentages. You should pass nil if there is no +// previous stat for this container. +func (c *Container) getPlatformContainerStats(stats *define.ContainerStats, previousStats *define.ContainerStats) error { if c.config.NoCgroups { - return nil, fmt.Errorf("cannot run top on container %s as it did not create a cgroup: %w", c.ID(), define.ErrNoCgroups) - } - - if !c.batched { - c.lock.Lock() - defer c.lock.Unlock() - if err := c.syncContainer(); err != nil { - return stats, err - } - } - - // returns stats with the fields' default values respective of their type - if c.state.State != define.ContainerStateRunning && c.state.State != define.ContainerStatePaused { - return stats, nil - } - - if previousStats == nil { - previousStats = &define.ContainerStats{ - // if we have no prev stats use the container start time as prev time - // otherwise we cannot correctly calculate the CPU percentage - SystemNano: uint64(c.state.StartedTime.UnixNano()), - } + return fmt.Errorf("cannot run top on container %s as it did not create a cgroup: %w", c.ID(), define.ErrNoCgroups) } cgroupPath, err := c.cGroupPath() if err != nil { - return nil, err + return err } cgroup, err := cgroups.Load(cgroupPath) if err != nil { - return stats, fmt.Errorf("unable to load cgroup at %s: %w", cgroupPath, err) + return fmt.Errorf("unable to load cgroup at %s: %w", cgroupPath, err) } // Ubuntu does not have swap memory in cgroups because swap is often not enabled. cgroupStats, err := cgroup.Stat() if err != nil { - return stats, fmt.Errorf("unable to obtain cgroup stats: %w", err) + return fmt.Errorf("unable to obtain cgroup stats: %w", err) } conState := c.state.State netStats, err := getContainerNetIO(c) if err != nil { - return nil, err + return err } // If the current total usage in the cgroup is less than what was previously @@ -103,7 +79,7 @@ func (c *Container) GetContainerStats(previousStats *define.ContainerStats) (*de stats.NetOutput = 0 } - return stats, nil + return nil } // getMemory limit returns the memory limit for a container @@ -133,7 +109,9 @@ func (c *Container) getMemLimit() uint64 { // calculateCPUPercent calculates the cpu usage using the latest measurement in stats. // previousCPU is the last value of stats.CPU.Usage.Total measured at the time previousSystem. -// (now - previousSystem) is the time delta in nanoseconds, between the measurement in previousCPU +// +// (now - previousSystem) is the time delta in nanoseconds, between the measurement in previousCPU +// // and the updated value in stats. func calculateCPUPercent(stats *runccgroup.Stats, previousCPU, now, previousSystem uint64) float64 { var ( diff --git a/libpod/stats_unsupported.go b/libpod/stats_unsupported.go new file mode 100644 index 000000000..3094e2eaa --- /dev/null +++ b/libpod/stats_unsupported.go @@ -0,0 +1,17 @@ +//go:build !linux && !freebsd +// +build !linux,!freebsd + +package libpod + +import ( + "errors" + + "github.com/containers/podman/v4/libpod/define" +) + +// GetContainerStats gets the running stats for a given container. +// The previousStats is used to correctly calculate cpu percentages. You +// should pass nil if there is no previous stat for this container. +func (c *Container) GetContainerStats(previousStats *define.ContainerStats) (*define.ContainerStats, error) { + return nil, errors.New("not implemented (*Container) GetContainerStats") +} diff --git a/libpod/util.go b/libpod/util.go index a6e6a4f3e..c5a2b81bd 100644 --- a/libpod/util.go +++ b/libpod/util.go @@ -231,7 +231,7 @@ func DefaultSeccompPath() (string, error) { func checkDependencyContainer(depCtr, ctr *Container) error { state, err := depCtr.State() if err != nil { - return fmt.Errorf("error accessing dependency container %s state: %w", depCtr.ID(), err) + return fmt.Errorf("accessing dependency container %s state: %w", depCtr.ID(), err) } if state == define.ContainerStateRemoving { return fmt.Errorf("cannot use container %s as a dependency as it is being removed: %w", depCtr.ID(), define.ErrCtrStateInvalid) diff --git a/libpod/util_linux.go b/libpod/util_linux.go index 7c79e6ce4..efc11710f 100644 --- a/libpod/util_linux.go +++ b/libpod/util_linux.go @@ -29,7 +29,7 @@ func systemdSliceFromPath(parent, name string, resources *spec.LinuxResources) ( logrus.Debugf("Created cgroup path %s for parent %s and name %s", cgroupPath, parent, name) if err := makeSystemdCgroup(cgroupPath, resources); err != nil { - return "", fmt.Errorf("error creating cgroup %s: %w", cgroupPath, err) + return "", fmt.Errorf("creating cgroup %s: %w", cgroupPath, err) } logrus.Debugf("Created cgroup %s", cgroupPath) @@ -112,17 +112,17 @@ var lvpReleaseLabel = label.ReleaseLabel func LabelVolumePath(path string) error { _, mountLabel, err := lvpInitLabels([]string{}) if err != nil { - return fmt.Errorf("error getting default mountlabels: %w", err) + return fmt.Errorf("getting default mountlabels: %w", err) } if err := lvpReleaseLabel(mountLabel); err != nil { - return fmt.Errorf("error releasing label %q: %w", mountLabel, err) + return fmt.Errorf("releasing label %q: %w", mountLabel, err) } if err := lvpRelabel(path, mountLabel, true); err != nil { if err == syscall.ENOTSUP { logrus.Debugf("Labeling not supported on %q", path) } else { - return fmt.Errorf("error setting selinux label for %s to %q as shared: %w", path, mountLabel, err) + return fmt.Errorf("setting selinux label for %s to %q as shared: %w", path, mountLabel, err) } } return nil diff --git a/libpod/util_unsupported.go b/libpod/util_unsupported.go new file mode 100644 index 000000000..d2ec3ae7b --- /dev/null +++ b/libpod/util_unsupported.go @@ -0,0 +1,27 @@ +//go:build !linux +// +build !linux + +package libpod + +import ( + "errors" + + spec "github.com/opencontainers/runtime-spec/specs-go" +) + +// systemdSliceFromPath makes a new systemd slice under the given parent with +// the given name. +// The parent must be a slice. The name must NOT include ".slice" +func systemdSliceFromPath(parent, name string, resources *spec.LinuxResources) (string, error) { + return "", errors.New("not implemented systemdSliceFromPath") +} + +// Unmount umounts a target directory +func Unmount(mount string) { +} + +// LabelVolumePath takes a mount path for a volume and gives it an +// selinux label of either shared or not +func LabelVolumePath(path string) error { + return errors.New("not implemented LabelVolumePath") +} diff --git a/libpod/volume.go b/libpod/volume.go index 2e8cd77a5..a054e4032 100644 --- a/libpod/volume.go +++ b/libpod/volume.go @@ -56,7 +56,7 @@ type VolumeConfig struct { // quota tracking. DisableQuota bool `json:"disableQuota,omitempty"` // Timeout allows users to override the default driver timeout of 5 seconds - Timeout int + Timeout *uint `json:"timeout,omitempty"` } // VolumeState holds the volume's mutable state. diff --git a/libpod/volume_inspect.go b/libpod/volume_inspect.go index dd2f3fd01..73441576b 100644 --- a/libpod/volume_inspect.go +++ b/libpod/volume_inspect.go @@ -39,7 +39,7 @@ func (v *Volume) Inspect() (*define.InspectVolumeData, error) { req.Name = v.Name() resp, err := v.plugin.GetVolume(req) if err != nil { - return nil, fmt.Errorf("error retrieving volume %s information from plugin %s: %w", v.Name(), v.Driver(), err) + return nil, fmt.Errorf("retrieving volume %s information from plugin %s: %w", v.Name(), v.Driver(), err) } if resp != nil { data.Status = resp.Status @@ -64,7 +64,12 @@ func (v *Volume) Inspect() (*define.InspectVolumeData, error) { data.MountCount = v.state.MountCount data.NeedsCopyUp = v.state.NeedsCopyUp data.NeedsChown = v.state.NeedsChown - data.Timeout = v.config.Timeout + + if v.config.Timeout != nil { + data.Timeout = *v.config.Timeout + } else if v.UsesVolumeDriver() { + data.Timeout = v.runtime.config.Engine.VolumePluginTimeout + } return data, nil } diff --git a/libpod/volume_internal_unsupported.go b/libpod/volume_internal_unsupported.go new file mode 100644 index 000000000..50515e692 --- /dev/null +++ b/libpod/volume_internal_unsupported.go @@ -0,0 +1,32 @@ +//go:build !linux +// +build !linux + +package libpod + +import ( + "errors" +) + +// mount mounts the volume if necessary. +// A mount is necessary if a volume has any options set. +// If a mount is necessary, v.state.MountCount will be incremented. +// If it was 0 when the increment occurred, the volume will be mounted on the +// host. Otherwise, we assume it is already mounted. +// Must be done while the volume is locked. +// Is a no-op on volumes that do not require a mount (as defined by +// volumeNeedsMount()). +func (v *Volume) mount() error { + return errors.New("not implemented (*Volume) mount") +} + +// unmount unmounts the volume if necessary. +// Unmounting a volume that is not mounted is a no-op. +// Unmounting a volume that does not require a mount is a no-op. +// The volume must be locked for this to occur. +// The mount counter will be decremented if non-zero. If the counter reaches 0, +// the volume will really be unmounted, as no further containers are using the +// volume. +// If force is set, the volume will be unmounted regardless of mount counter. +func (v *Volume) unmount(force bool) error { + return errors.New("not implemented (*Volume) unmount") +} |