Diffstat (limited to 'libpod'): 40 files changed, 1189 insertions, 451 deletions
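Most of the churn in boltdb_state.go below adds two new buckets that persist container exit codes and the time they were recorded, closing the race from #14559 where a wait could lose the exit file to auto-removal. The following is a rough sketch of how the new state methods are meant to be used together; the method names match the diff, but the small interface and the polling caller are illustrative stand-ins for libpod's real wiring (which additionally checks whether conmon is still alive), and the error sentinel stands in for define.ErrNoSuchExitCode:

// Sketch only: exitCodeState mirrors the methods this change adds to BoltState.
package statesketch

import (
	"errors"
	"fmt"
	"time"
)

var errNoSuchExitCode = errors.New("no such exit code")

type exitCodeState interface {
	AddContainerExitCode(id string, exitCode int32) error
	GetContainerExitCode(id string) (int32, error)
	GetContainerExitCodeTimeStamp(id string) (*time.Time, error)
	PruneContainerExitCodes() error
}

// waitForExitCode shows the idea behind the new Container.WaitForExit:
// poll the database for the exit code instead of watching the exit file,
// so a concurrent auto-remove can no longer make the code unreadable.
func waitForExitCode(st exitCodeState, ctrID string, poll, timeout time.Duration) (int32, error) {
	deadline := time.Now().Add(timeout)
	for {
		code, err := st.GetContainerExitCode(ctrID)
		if err == nil {
			return code, nil
		}
		if !errors.Is(err, errNoSuchExitCode) {
			return -1, err
		}
		if time.Now().After(deadline) {
			return -1, fmt.Errorf("timed out waiting for exit code of container %s", ctrID)
		}
		time.Sleep(poll)
	}
}

The real WaitForExit additionally arms a roughly 20-second timer once it notices conmon has died, so a killed conmon cannot leave the wait hanging forever, and PruneContainerExitCodes is later called from container cleanup to drop codes older than five minutes.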
diff --git a/libpod/boltdb_state.go b/libpod/boltdb_state.go index c3db6152a..471f64b84 100644 --- a/libpod/boltdb_state.go +++ b/libpod/boltdb_state.go @@ -5,8 +5,10 @@ import ( "fmt" "net" "os" + "strconv" "strings" "sync" + "time" "github.com/containers/common/libnetwork/types" "github.com/containers/podman/v4/libpod/define" @@ -63,6 +65,13 @@ type BoltState struct { // initially created the database. This must match for any further instances // that access the database, to ensure that state mismatches with // containers/storage do not occur. +// - exitCodeBucket/exitCodeTimeStampBucket: (#14559) exit codes must be part +// of the database to resolve a previous race condition when one process waits +// for the exit file to be written and another process removes it along with +// the container during auto-removal. The same race would happen trying to +// read the exit code from the containers bucket. Hence, exit codes go into +// their own bucket. To avoid the rather expensive JSON (un)marshaling, we +// have two buckets: one for the exit codes, the other for the timestamps. // NewBoltState creates a new bolt-backed state database func NewBoltState(path string, runtime *Runtime) (State, error) { @@ -98,6 +107,8 @@ func NewBoltState(path string, runtime *Runtime) (State, error) { allVolsBkt, execBkt, runtimeConfigBkt, + exitCodeBkt, + exitCodeTimeStampBkt, } // Does the DB need an update? @@ -192,6 +203,45 @@ func (s *BoltState) Refresh() error { return err } + exitCodeBucket, err := getExitCodeBucket(tx) + if err != nil { + return err + } + + timeStampBucket, err := getExitCodeTimeStampBucket(tx) + if err != nil { + return err + } + + // Clear all exec exit codes + toRemoveExitCodes := []string{} + err = exitCodeBucket.ForEach(func(id, _ []byte) error { + toRemoveExitCodes = append(toRemoveExitCodes, string(id)) + return nil + }) + if err != nil { + return errors.Wrapf(err, "error reading exit codes bucket") + } + for _, id := range toRemoveExitCodes { + if err := exitCodeBucket.Delete([]byte(id)); err != nil { + return errors.Wrapf(err, "error removing exit code for ID %s", id) + } + } + + toRemoveTimeStamps := []string{} + err = timeStampBucket.ForEach(func(id, _ []byte) error { + toRemoveTimeStamps = append(toRemoveTimeStamps, string(id)) + return nil + }) + if err != nil { + return errors.Wrapf(err, "reading timestamps bucket") + } + for _, id := range toRemoveTimeStamps { + if err := timeStampBucket.Delete([]byte(id)); err != nil { + return errors.Wrapf(err, "removing timestamp for ID %s", id) + } + } + // Iterate through all IDs. Check if they are containers. // If they are, unmarshal their state, and then clear // PID, mountpoint, and state for all of them @@ -1341,6 +1391,204 @@ func (s *BoltState) GetContainerConfig(id string) (*ContainerConfig, error) { return config, nil } +// AddContainerExitCode adds the exit code for the specified container to the database. 
+func (s *BoltState) AddContainerExitCode(id string, exitCode int32) error { + if len(id) == 0 { + return define.ErrEmptyID + } + + if !s.valid { + return define.ErrDBClosed + } + + db, err := s.getDBCon() + if err != nil { + return err + } + defer s.deferredCloseDBCon(db) + + rawID := []byte(id) + rawExitCode := []byte(strconv.Itoa(int(exitCode))) + rawTimeStamp, err := time.Now().MarshalText() + if err != nil { + return fmt.Errorf("marshaling exit-code time stamp: %w", err) + } + + return db.Update(func(tx *bolt.Tx) error { + exitCodeBucket, err := getExitCodeBucket(tx) + if err != nil { + return err + } + timeStampBucket, err := getExitCodeTimeStampBucket(tx) + if err != nil { + return err + } + + if err := exitCodeBucket.Put(rawID, rawExitCode); err != nil { + return fmt.Errorf("adding exit code of container %s to DB: %w", id, err) + } + if err := timeStampBucket.Put(rawID, rawTimeStamp); err != nil { + if rmErr := exitCodeBucket.Delete(rawID); rmErr != nil { + logrus.Errorf("Removing exit code of container %s from DB: %v", id, rmErr) + } + return fmt.Errorf("adding exit-code time stamp of container %s to DB: %w", id, err) + } + + return nil + }) +} + +// GetContainerExitCode returns the exit code for the specified container. +func (s *BoltState) GetContainerExitCode(id string) (int32, error) { + if len(id) == 0 { + return -1, define.ErrEmptyID + } + + if !s.valid { + return -1, define.ErrDBClosed + } + + db, err := s.getDBCon() + if err != nil { + return -1, err + } + defer s.deferredCloseDBCon(db) + + rawID := []byte(id) + result := int32(-1) + return result, db.View(func(tx *bolt.Tx) error { + exitCodeBucket, err := getExitCodeBucket(tx) + if err != nil { + return err + } + + rawExitCode := exitCodeBucket.Get(rawID) + if rawExitCode == nil { + return fmt.Errorf("getting exit code of container %s from DB: %w", id, define.ErrNoSuchExitCode) + } + + exitCode, err := strconv.Atoi(string(rawExitCode)) + if err != nil { + return fmt.Errorf("converting raw exit code %v of container %s: %w", rawExitCode, id, err) + } + + result = int32(exitCode) + return nil + }) +} + +// GetContainerExitCodeTimeStamp returns the time stamp when the exit code of +// the specified container was added to the database. +func (s *BoltState) GetContainerExitCodeTimeStamp(id string) (*time.Time, error) { + if len(id) == 0 { + return nil, define.ErrEmptyID + } + + if !s.valid { + return nil, define.ErrDBClosed + } + + db, err := s.getDBCon() + if err != nil { + return nil, err + } + defer s.deferredCloseDBCon(db) + + rawID := []byte(id) + var result time.Time + return &result, db.View(func(tx *bolt.Tx) error { + timeStampBucket, err := getExitCodeTimeStampBucket(tx) + if err != nil { + return err + } + + rawTimeStamp := timeStampBucket.Get(rawID) + if rawTimeStamp == nil { + return fmt.Errorf("getting exit-code time stamp of container %s from DB: %w", id, define.ErrNoSuchExitCode) + } + + if err := result.UnmarshalText(rawTimeStamp); err != nil { + return fmt.Errorf("converting raw time stamp %v of container %s from DB: %w", rawTimeStamp, id, err) + } + + return nil + }) +} + +// PruneExitCodes removes exit codes older than 5 minutes. 
+func (s *BoltState) PruneContainerExitCodes() error { + if !s.valid { + return define.ErrDBClosed + } + + db, err := s.getDBCon() + if err != nil { + return err + } + defer s.deferredCloseDBCon(db) + + toRemoveIDs := []string{} + + threshold := time.Minute * 5 + err = db.View(func(tx *bolt.Tx) error { + timeStampBucket, err := getExitCodeTimeStampBucket(tx) + if err != nil { + return err + } + + return timeStampBucket.ForEach(func(rawID, rawTimeStamp []byte) error { + var timeStamp time.Time + if err := timeStamp.UnmarshalText(rawTimeStamp); err != nil { + return fmt.Errorf("converting raw time stamp %v of container %s from DB: %w", rawTimeStamp, string(rawID), err) + } + if time.Since(timeStamp) > threshold { + toRemoveIDs = append(toRemoveIDs, string(rawID)) + } + return nil + }) + }) + if err != nil { + return errors.Wrapf(err, "reading exit codes to prune") + } + + if len(toRemoveIDs) > 0 { + err = db.Update(func(tx *bolt.Tx) error { + exitCodeBucket, err := getExitCodeBucket(tx) + if err != nil { + return err + } + timeStampBucket, err := getExitCodeTimeStampBucket(tx) + if err != nil { + return err + } + + var finalErr error + for _, id := range toRemoveIDs { + rawID := []byte(id) + if err := exitCodeBucket.Delete(rawID); err != nil { + if finalErr != nil { + logrus.Error(finalErr) + } + finalErr = fmt.Errorf("removing exit code of container %s from DB: %w", id, err) + } + if err := timeStampBucket.Delete(rawID); err != nil { + if finalErr != nil { + logrus.Error(finalErr) + } + finalErr = fmt.Errorf("removing exit code timestamp of container %s from DB: %w", id, err) + } + } + + return finalErr + }) + if err != nil { + return errors.Wrapf(err, "pruning exit codes") + } + } + + return nil +} + // AddExecSession adds an exec session to the state. 
func (s *BoltState) AddExecSession(ctr *Container, session *ExecSession) error { if !s.valid { diff --git a/libpod/boltdb_state_internal.go b/libpod/boltdb_state_internal.go index d6f035af9..edba78d6d 100644 --- a/libpod/boltdb_state_internal.go +++ b/libpod/boltdb_state_internal.go @@ -29,6 +29,9 @@ const ( aliasesName = "aliases" runtimeConfigName = "runtime-config" + exitCodeName = "exit-code" + exitCodeTimeStampName = "exit-code-time-stamp" + configName = "config" stateName = "state" dependenciesName = "dependencies" @@ -65,6 +68,9 @@ var ( volDependenciesBkt = []byte(volCtrDependencies) networksBkt = []byte(networksName) + exitCodeBkt = []byte(exitCodeName) + exitCodeTimeStampBkt = []byte(exitCodeTimeStampName) + configKey = []byte(configName) stateKey = []byte(stateName) netNSKey = []byte(netNSName) @@ -362,6 +368,22 @@ func getRuntimeConfigBucket(tx *bolt.Tx) (*bolt.Bucket, error) { return bkt, nil } +func getExitCodeBucket(tx *bolt.Tx) (*bolt.Bucket, error) { + bkt := tx.Bucket(exitCodeBkt) + if bkt == nil { + return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code container bucket not found in DB") + } + return bkt, nil +} + +func getExitCodeTimeStampBucket(tx *bolt.Tx) (*bolt.Bucket, error) { + bkt := tx.Bucket(exitCodeTimeStampBkt) + if bkt == nil { + return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code time stamp bucket not found in DB") + } + return bkt, nil +} + func (s *BoltState) getContainerConfigFromDB(id []byte, config *ContainerConfig, ctrsBkt *bolt.Bucket) error { ctrBkt := ctrsBkt.Bucket(id) if ctrBkt == nil { diff --git a/libpod/container.go b/libpod/container.go index 04a4ae64a..3a15cfbdb 100644 --- a/libpod/container.go +++ b/libpod/container.go @@ -518,7 +518,7 @@ func (c *Container) PortMappings() ([]types.PortMapping, error) { if len(c.config.NetNsCtr) > 0 { netNsCtr, err := c.runtime.GetContainer(c.config.NetNsCtr) if err != nil { - return nil, errors.Wrapf(err, "unable to lookup network namespace for container %s", c.ID()) + return nil, errors.Wrapf(err, "unable to look up network namespace for container %s", c.ID()) } return netNsCtr.PortMappings() } @@ -657,7 +657,7 @@ func (c *Container) Hostname() string { utsNsCtr, err := c.runtime.GetContainer(c.config.UTSNsCtr) if err != nil { // should we return an error here? - logrus.Errorf("unable to lookup uts namespace for container %s: %v", c.ID(), err) + logrus.Errorf("unable to look up uts namespace for container %s: %v", c.ID(), err) return "" } return utsNsCtr.Hostname() diff --git a/libpod/container_api.go b/libpod/container_api.go index b064d3528..c14fe95b0 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -2,6 +2,7 @@ package libpod import ( "context" + "fmt" "io" "io/ioutil" "net/http" @@ -490,41 +491,84 @@ func (c *Container) RemoveArtifact(name string) error { // Wait blocks until the container exits and returns its exit code. func (c *Container) Wait(ctx context.Context) (int32, error) { - return c.WaitWithInterval(ctx, DefaultWaitInterval) + return c.WaitForExit(ctx, DefaultWaitInterval) } -// WaitWithInterval blocks until the container to exit and returns its exit -// code. The argument is the interval at which checks the container's status. -func (c *Container) WaitWithInterval(ctx context.Context, waitTimeout time.Duration) (int32, error) { +// WaitForExit blocks until the container exits and returns its exit code. The +// argument is the interval at which checks the container's status. 
+func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) (int32, error) { if !c.valid { return -1, define.ErrCtrRemoved } - exitFile, err := c.exitFilePath() - if err != nil { - return -1, err - } - chWait := make(chan error, 1) + id := c.ID() + var conmonTimer time.Timer + conmonTimerSet := false - go func() { - <-ctx.Done() - chWait <- define.ErrCanceled - }() + getExitCode := func() (bool, int32, error) { + containerRemoved := false + if !c.batched { + c.lock.Lock() + defer c.lock.Unlock() + } - for { - // ignore errors here (with exception of cancellation), it is only used to avoid waiting - // too long. - _, e := WaitForFile(exitFile, chWait, waitTimeout) - if e == define.ErrCanceled { - return -1, define.ErrCanceled + if err := c.syncContainer(); err != nil { + if !errors.Is(err, define.ErrNoSuchCtr) { + return false, -1, err + } + containerRemoved = true + } + + // If conmon is not alive anymore set a timer to make sure + // we're returning even if conmon has forcefully been killed. + if !conmonTimerSet && !containerRemoved { + conmonAlive, err := c.ociRuntime.CheckConmonRunning(c) + switch { + case errors.Is(err, define.ErrNoSuchCtr): + containerRemoved = true + case err != nil: + return false, -1, err + case !conmonAlive: + timerDuration := time.Second * 20 + conmonTimer = *time.NewTimer(timerDuration) + conmonTimerSet = true + } + } + + if !containerRemoved { + // If conmon is dead for more than $timerDuration or if the + // container has exited properly, try to look up the exit code. + select { + case <-conmonTimer.C: + logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id) + default: + if !c.ensureState(define.ContainerStateExited, define.ContainerStateConfigured) { + return false, -1, nil + } + } } - stopped, code, err := c.isStopped() + exitCode, err := c.runtime.state.GetContainerExitCode(id) + if err != nil { + return true, -1, err + } + + return true, exitCode, nil + } + + for { + hasExited, exitCode, err := getExitCode() + if hasExited { + return exitCode, err + } if err != nil { return -1, err } - if stopped { - return code, nil + select { + case <-ctx.Done(): + return -1, fmt.Errorf("waiting for exit code of container %s canceled", id) + default: + time.Sleep(pollInterval) } } } @@ -551,11 +595,12 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou wantedStates := make(map[define.ContainerStatus]bool, len(conditions)) for _, condition := range conditions { - if condition == define.ContainerStateStopped || condition == define.ContainerStateExited { + switch condition { + case define.ContainerStateExited, define.ContainerStateStopped: waitForExit = true - continue + default: + wantedStates[condition] = true } - wantedStates[condition] = true } trySend := func(code int32, err error) { @@ -572,7 +617,7 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou go func() { defer wg.Done() - code, err := c.WaitWithInterval(ctx, waitTimeout) + code, err := c.WaitForExit(ctx, waitTimeout) trySend(code, err) }() } diff --git a/libpod/container_config.go b/libpod/container_config.go index 30b84adcf..544c45a8c 100644 --- a/libpod/container_config.go +++ b/libpod/container_config.go @@ -194,7 +194,7 @@ type ContainerSecurityConfig struct { // If not explicitly set, an unused random MLS label will be assigned by // containers/storage (but only if SELinux is enabled). 
MountLabel string `json:"MountLabel,omitempty"` - // LabelOpts are options passed in by the user to setup SELinux labels. + // LabelOpts are options passed in by the user to set up SELinux labels. // These are used by the containers/storage library. LabelOpts []string `json:"labelopts,omitempty"` // User and group to use in the container. Can be specified as only user @@ -386,7 +386,7 @@ type ContainerMiscConfig struct { IsService bool `json:"isService"` // SdNotifyMode tells libpod what to do with a NOTIFY_SOCKET if passed SdNotifyMode string `json:"sdnotifyMode,omitempty"` - // Systemd tells libpod to setup the container in systemd mode, a value of nil denotes false + // Systemd tells libpod to set up the container in systemd mode, a value of nil denotes false Systemd *bool `json:"systemd,omitempty"` // HealthCheckConfig has the health check command and related timings HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"` @@ -412,6 +412,9 @@ type ContainerMiscConfig struct { InitContainerType string `json:"init_container_type,omitempty"` // PasswdEntry specifies arbitrary data to append to a file. PasswdEntry string `json:"passwd_entry,omitempty"` + // MountAllDevices is an option to indicate whether a privileged container + // will mount all the host's devices + MountAllDevices bool `json:"mountAllDevices"` } // InfraInherit contains the compatible options inheritable from the infra container @@ -421,7 +424,6 @@ type InfraInherit struct { CapDrop []string `json:"cap_drop,omitempty"` HostDeviceList []spec.LinuxDevice `json:"host_device_list,omitempty"` ImageVolumes []*specgen.ImageVolume `json:"image_volumes,omitempty"` - InfraResources *spec.LinuxResources `json:"resource_limits,omitempty"` Mounts []spec.Mount `json:"mounts,omitempty"` NoNewPrivileges bool `json:"no_new_privileges,omitempty"` OverlayVolumes []*specgen.OverlayVolume `json:"overlay_volumes,omitempty"` @@ -429,4 +431,10 @@ type InfraInherit struct { SeccompProfilePath string `json:"seccomp_profile_path,omitempty"` SelinuxOpts []string `json:"selinux_opts,omitempty"` Volumes []*specgen.NamedVolume `json:"volumes,omitempty"` + ShmSize *int64 `json:"shm_size"` +} + +// IsDefaultShmSize determines if the user actually set the shm in the parent ctr or if it has been set to the default size +func (inherit *InfraInherit) IsDefaultShmSize() bool { + return inherit.ShmSize == nil || *inherit.ShmSize == 65536000 } diff --git a/libpod/container_copy_linux.go b/libpod/container_copy_linux.go index 9528cd06b..6835b2f1f 100644 --- a/libpod/container_copy_linux.go +++ b/libpod/container_copy_linux.go @@ -94,6 +94,7 @@ func (c *Container) copyFromArchive(path string, chown, noOverwriteDirNonDir boo ChownDirs: idPair, ChownFiles: idPair, NoOverwriteDirNonDir: noOverwriteDirNonDir, + NoOverwriteNonDirDir: noOverwriteDirNonDir, Rename: rename, } diff --git a/libpod/container_exec.go b/libpod/container_exec.go index 1e8fce4da..b112273d0 100644 --- a/libpod/container_exec.go +++ b/libpod/container_exec.go @@ -79,11 +79,11 @@ type ExecConfig struct { type ExecSession struct { // Id is the ID of the exec session. // Named somewhat strangely to not conflict with ID(). - // nolint:stylecheck,revive + //nolint:stylecheck,revive Id string `json:"id"` // ContainerId is the ID of the container this exec session belongs to. // Named somewhat strangely to not conflict with ContainerID(). - // nolint:stylecheck,revive + //nolint:stylecheck,revive ContainerId string `json:"containerId"` // State is the state of the exec session. 
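The new ShmSize field on InfraInherit above is a pointer precisely so that an unset value can be told apart from an explicit one; IsDefaultShmSize treats both nil and the 65536000 default as "nothing worth inheriting". A small self-contained illustration of that distinction (the method body and constant come from the diff, the surrounding helper is made up):

package shmsketch

// defaultShmSize is the value IsDefaultShmSize compares against in the diff.
const defaultShmSize int64 = 65536000

type infraInherit struct {
	ShmSize *int64 // nil means the user never set a shm size on the parent container
}

func (i *infraInherit) isDefaultShmSize() bool {
	return i.ShmSize == nil || *i.ShmSize == defaultShmSize
}

// inheritedShmSize only propagates a size the user actually chose; default or
// unset values fall back to whatever the child container would use anyway.
func inheritedShmSize(parent infraInherit) (int64, bool) {
	if parent.isDefaultShmSize() {
		return 0, false
	}
	return *parent.ShmSize, true
}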
@@ -277,9 +277,13 @@ func (c *Container) ExecStart(sessionID string) error { return c.save() } +func (c *Container) ExecStartAndAttach(sessionID string, streams *define.AttachStreams, newSize *define.TerminalSize) error { + return c.execStartAndAttach(sessionID, streams, newSize, false) +} + // ExecStartAndAttach starts and attaches to an exec session in a container. // newSize resizes the tty to this size before the process is started, must be nil if the exec session has no tty -func (c *Container) ExecStartAndAttach(sessionID string, streams *define.AttachStreams, newSize *define.TerminalSize) error { +func (c *Container) execStartAndAttach(sessionID string, streams *define.AttachStreams, newSize *define.TerminalSize, isHealthcheck bool) error { if !c.batched { c.lock.Lock() defer c.lock.Unlock() @@ -315,7 +319,12 @@ func (c *Container) ExecStartAndAttach(sessionID string, streams *define.AttachS return err } - c.newContainerEvent(events.Exec) + if isHealthcheck { + c.newContainerEvent(events.HealthStatus) + } else { + c.newContainerEvent(events.Exec) + } + logrus.Debugf("Successfully started exec session %s in container %s", session.ID(), c.ID()) var lastErr error @@ -743,10 +752,14 @@ func (c *Container) ExecResize(sessionID string, newSize define.TerminalSize) er return c.ociRuntime.ExecAttachResize(c, sessionID, newSize) } +func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan define.TerminalSize) (int, error) { + return c.exec(config, streams, resize, false) +} + // Exec emulates the old Libpod exec API, providing a single call to create, // run, and remove an exec session. Returns exit code and error. Exit code is // not guaranteed to be set sanely if error is not nil. -func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan define.TerminalSize) (int, error) { +func (c *Container) exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan define.TerminalSize, isHealthcheck bool) (int, error) { sessionID, err := c.ExecCreate(config) if err != nil { return -1, err @@ -780,7 +793,7 @@ func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resi }() } - if err := c.ExecStartAndAttach(sessionID, streams, size); err != nil { + if err := c.execStartAndAttach(sessionID, streams, size, isHealthcheck); err != nil { return -1, err } diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 7494eb3ec..64696cc27 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -17,9 +17,11 @@ import ( "github.com/containers/buildah/pkg/overlay" butil "github.com/containers/buildah/util" "github.com/containers/common/libnetwork/etchosts" + "github.com/containers/common/libnetwork/resolvconf" "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/chown" "github.com/containers/common/pkg/config" + cutil "github.com/containers/common/pkg/util" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/ctime" @@ -218,7 +220,7 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error { // Write an event for the container's death c.newContainerExitedEvent(c.state.ExitCode) - return nil + return c.runtime.state.AddContainerExitCode(c.ID(), c.state.ExitCode) } func (c *Container) shouldRestart() bool { @@ -289,7 +291,7 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err return false, err } - // setup slirp4netns again because 
slirp4netns will die when conmon exits + // set up slirp4netns again because slirp4netns will die when conmon exits if c.config.NetMode.IsSlirp4netns() { err := c.runtime.setupSlirp4netns(c, c.state.NetNS) if err != nil { @@ -297,7 +299,7 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err } } - // setup rootlesskit port forwarder again since it dies when conmon exits + // set up rootlesskit port forwarder again since it dies when conmon exits // we use rootlesskit port forwarder only as rootless and when bridge network is used if rootless.IsRootless() && c.config.NetMode.IsBridge() && len(c.config.PortMappings) > 0 { err := c.runtime.setupRootlessPortMappingViaRLK(c, c.state.NetNS.Path(), c.state.NetworkStatus) @@ -588,7 +590,7 @@ func (c *Container) teardownStorage() error { } if err := c.cleanupStorage(); err != nil { - return errors.Wrapf(err, "failed to cleanup container %s storage", c.ID()) + return errors.Wrapf(err, "failed to clean up container %s storage", c.ID()) } if err := c.runtime.storageService.DeleteContainer(c.ID()); err != nil { @@ -783,20 +785,6 @@ func (c *Container) getArtifactPath(name string) string { return filepath.Join(c.config.StaticDir, artifactsDir, name) } -// Used with Wait() to determine if a container has exited -func (c *Container) isStopped() (bool, int32, error) { - if !c.batched { - c.lock.Lock() - defer c.lock.Unlock() - } - err := c.syncContainer() - if err != nil { - return true, -1, err - } - - return !c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused, define.ContainerStateStopping), c.state.ExitCode, nil -} - // save container state to the database func (c *Container) save() error { if err := c.runtime.state.SaveContainer(c); err != nil { @@ -986,7 +974,7 @@ func (c *Container) checkDependenciesRunning() ([]string, error) { } func (c *Container) completeNetworkSetup() error { - var outResolvConf []string + var nameservers []string netDisabled, err := c.NetworkDisabled() if err != nil { return err @@ -1000,11 +988,14 @@ func (c *Container) completeNetworkSetup() error { if err := c.runtime.setupNetNS(c); err != nil { return err } + if err := c.save(); err != nil { + return err + } state := c.state // collect any dns servers that cni tells us to use (dnsname) for _, status := range c.getNetworkStatus() { for _, server := range status.DNSServerIPs { - outResolvConf = append(outResolvConf, fmt.Sprintf("nameserver %s", server)) + nameservers = append(nameservers, server.String()) } } // check if we have a bindmount for /etc/hosts @@ -1020,24 +1011,12 @@ func (c *Container) completeNetworkSetup() error { } // check if we have a bindmount for resolv.conf - resolvBindMount := state.BindMounts["/etc/resolv.conf"] - if len(outResolvConf) < 1 || resolvBindMount == "" || len(c.config.NetNsCtr) > 0 { + resolvBindMount := state.BindMounts[resolvconf.DefaultResolvConf] + if len(nameservers) < 1 || resolvBindMount == "" || len(c.config.NetNsCtr) > 0 { return nil } - // read the existing resolv.conf - b, err := ioutil.ReadFile(resolvBindMount) - if err != nil { - return err - } - for _, line := range strings.Split(string(b), "\n") { - // only keep things that don't start with nameserver from the old - // resolv.conf file - if !strings.HasPrefix(line, "nameserver") { - outResolvConf = append([]string{line}, outResolvConf...) 
- } - } // write and return - return ioutil.WriteFile(resolvBindMount, []byte(strings.Join(outResolvConf, "\n")), 0644) + return resolvconf.Add(resolvBindMount, nameservers) } // Initialize a container, creating it in the runtime @@ -1290,13 +1269,6 @@ func (c *Container) stop(timeout uint) error { } } - // Check if conmon is still alive. - // If it is not, we won't be getting an exit file. - conmonAlive, err := c.ociRuntime.CheckConmonRunning(c) - if err != nil { - return err - } - // Set the container state to "stopping" and unlock the container // before handing it over to conmon to unblock other commands. #8501 // demonstrates nicely that a high stop timeout will block even simple @@ -1349,21 +1321,18 @@ func (c *Container) stop(timeout uint) error { } c.newContainerEvent(events.Stop) - - c.state.PID = 0 - c.state.ConmonPID = 0 c.state.StoppedByUser = true + conmonAlive, err := c.ociRuntime.CheckConmonRunning(c) + if err != nil { + return err + } if !conmonAlive { - // Conmon is dead, so we can't expect an exit code. - c.state.ExitCode = -1 - c.state.FinishedTime = time.Now() - c.state.State = define.ContainerStateStopped - if err := c.save(); err != nil { - logrus.Errorf("Saving container %s status: %v", c.ID(), err) + if err := c.checkExitFile(); err != nil { + return err } - return errors.Wrapf(define.ErrConmonDead, "container %s conmon process missing, cannot retrieve exit code", c.ID()) + return c.save() } if err := c.save(); err != nil { @@ -1671,30 +1640,16 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) if err := vol.update(); err != nil { return nil, err } - if vol.state.NeedsCopyUp { + _, hasNoCopy := vol.config.Options["nocopy"] + if vol.state.NeedsCopyUp && !cutil.StringInSlice("nocopy", v.Options) && !hasNoCopy { logrus.Debugf("Copying up contents from container %s to volume %s", c.ID(), vol.Name()) - // If the volume is not empty, we should not copy up. - volMount := vol.mountPoint() - contents, err := ioutil.ReadDir(volMount) - if err != nil { - return nil, errors.Wrapf(err, "error listing contents of volume %s mountpoint when copying up from container %s", vol.Name(), c.ID()) - } - if len(contents) > 0 { - // The volume is not empty. It was likely modified - // outside of Podman. For safety, let's not copy up into - // it. Fixes CVE-2020-1726. - return vol, nil - } - srcDir, err := securejoin.SecureJoin(mountpoint, v.Dest) if err != nil { return nil, errors.Wrapf(err, "error calculating destination path to copy up container %s volume %s", c.ID(), vol.Name()) } // Do a manual stat on the source directory to verify existence. // Skip the rest if it exists. - // TODO: Should this be stat or lstat? I'm using lstat because I - // think copy-up doesn't happen when the source is a link. srcStat, err := os.Lstat(srcDir) if err != nil { if os.IsNotExist(err) { @@ -1720,6 +1675,19 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) return vol, nil } + // If the volume is not empty, we should not copy up. + volMount := vol.mountPoint() + contents, err := ioutil.ReadDir(volMount) + if err != nil { + return nil, errors.Wrapf(err, "error listing contents of volume %s mountpoint when copying up from container %s", vol.Name(), c.ID()) + } + if len(contents) > 0 { + // The volume is not empty. It was likely modified + // outside of Podman. For safety, let's not copy up into + // it. Fixes CVE-2020-1726. + return vol, nil + } + // Set NeedsCopyUp to false since we are about to do first copy // Do not copy second time. 
vol.state.NeedsCopyUp = false @@ -1792,7 +1760,7 @@ func (c *Container) cleanupStorage() error { overlayBasePath := filepath.Dir(c.state.Mountpoint) if err := overlay.Unmount(overlayBasePath); err != nil { if cleanupErr != nil { - logrus.Errorf("Failed to cleanup overlay mounts for %s: %v", c.ID(), err) + logrus.Errorf("Failed to clean up overlay mounts for %s: %v", c.ID(), err) } cleanupErr = err } @@ -1809,7 +1777,7 @@ func (c *Container) cleanupStorage() error { if err := c.cleanupOverlayMounts(); err != nil { // If the container can't remove content report the error - logrus.Errorf("Failed to cleanup overlay mounts for %s: %v", c.ID(), err) + logrus.Errorf("Failed to clean up overlay mounts for %s: %v", c.ID(), err) cleanupErr = err } @@ -1888,7 +1856,7 @@ func (c *Container) cleanup(ctx context.Context) error { // we cannot use the dependency container lock due ABBA deadlocks if lock, err := lockfile.GetLockfile(hoststFile); err == nil { lock.Lock() - // make sure to ignore ENOENT error in case the netns container was cleanup before this one + // make sure to ignore ENOENT error in case the netns container was cleaned up before this one if err := etchosts.Remove(hoststFile, getLocalhostHostEntry(c)); err != nil && !errors.Is(err, os.ErrNotExist) { // this error is not fatal we still want to do proper cleanup logrus.Errorf("failed to remove hosts entry from the netns containers /etc/hosts: %v", err) @@ -1947,6 +1915,18 @@ func (c *Container) cleanup(ctx context.Context) error { } } + // Prune the exit codes of other container during clean up. + // Since Podman is no daemon, we have to clean them up somewhere. + // Cleanup seems like a good place as it's not performance + // critical. + if err := c.runtime.state.PruneContainerExitCodes(); err != nil { + if lastError == nil { + lastError = err + } else { + logrus.Errorf("Pruning container exit codes: %v", err) + } + } + return lastError } diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index e19d75deb..0f4bf0f55 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -9,7 +9,6 @@ import ( "io" "io/ioutil" "math" - "net" "os" "os/user" "path" @@ -29,6 +28,7 @@ import ( "github.com/containers/buildah/pkg/overlay" butil "github.com/containers/buildah/util" "github.com/containers/common/libnetwork/etchosts" + "github.com/containers/common/libnetwork/resolvconf" "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/apparmor" "github.com/containers/common/pkg/cgroups" @@ -44,7 +44,6 @@ import ( "github.com/containers/podman/v4/pkg/checkpoint/crutils" "github.com/containers/podman/v4/pkg/criu" "github.com/containers/podman/v4/pkg/lookup" - "github.com/containers/podman/v4/pkg/resolvconf" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/util" "github.com/containers/podman/v4/utils" @@ -312,7 +311,7 @@ func (c *Container) cleanupNetwork() error { // Stop the container's network namespace (if it has one) if err := c.runtime.teardownNetNS(c); err != nil { - logrus.Errorf("Unable to cleanup network for container %s: %q", c.ID(), err) + logrus.Errorf("Unable to clean up network for container %s: %q", c.ID(), err) } c.state.NetNS = nil @@ -368,7 +367,7 @@ func (c *Container) getUserOverrides() *lookup.Overrides { func lookupHostUser(name string) (*runcuser.ExecUser, error) { var execUser runcuser.ExecUser - // Lookup User on host + // Look up User on host u, err := util.LookupUser(name) if err != nil { return 
&execUser, err @@ -388,6 +387,37 @@ func lookupHostUser(name string) (*runcuser.ExecUser, error) { return &execUser, nil } +// Internal only function which returns upper and work dir from +// overlay options. +func getOverlayUpperAndWorkDir(options []string) (string, string, error) { + upperDir := "" + workDir := "" + for _, o := range options { + if strings.HasPrefix(o, "upperdir") { + splitOpt := strings.SplitN(o, "=", 2) + if len(splitOpt) > 1 { + upperDir = splitOpt[1] + if upperDir == "" { + return "", "", errors.New("cannot accept empty value for upperdir") + } + } + } + if strings.HasPrefix(o, "workdir") { + splitOpt := strings.SplitN(o, "=", 2) + if len(splitOpt) > 1 { + workDir = splitOpt[1] + if workDir == "" { + return "", "", errors.New("cannot accept empty value for workdir") + } + } + } + } + if (upperDir != "" && workDir == "") || (upperDir == "" && workDir != "") { + return "", "", errors.New("must specify both upperdir and workdir") + } + return upperDir, workDir, nil +} + // Generate spec for a container // Accepts a map of the container's dependencies func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { @@ -407,6 +437,14 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { //nolint:staticcheck g := generate.NewFromSpec(c.config.Spec) + // If the flag to mount all devices is set for a privileged container, add + // all the devices from the host's machine into the container + if c.config.MountAllDevices { + if err := util.AddPrivilegedDevices(&g); err != nil { + return nil, err + } + } + // If network namespace was requested, add it now if c.config.CreateNetNS { if c.config.PostConfigureNetNS { @@ -460,23 +498,9 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { for _, o := range namedVol.Options { if o == "O" { overlayFlag = true - } - if overlayFlag && strings.Contains(o, "upperdir") { - splitOpt := strings.SplitN(o, "=", 2) - if len(splitOpt) > 1 { - upperDir = splitOpt[1] - if upperDir == "" { - return nil, errors.New("cannot accept empty value for upperdir") - } - } - } - if overlayFlag && strings.Contains(o, "workdir") { - splitOpt := strings.SplitN(o, "=", 2) - if len(splitOpt) > 1 { - workDir = splitOpt[1] - if workDir == "" { - return nil, errors.New("cannot accept empty value for workdir") - } + upperDir, workDir, err = getOverlayUpperAndWorkDir(namedVol.Options) + if err != nil { + return nil, err } } } @@ -489,10 +513,6 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { return nil, err } - if (upperDir != "" && workDir == "") || (upperDir == "" && workDir != "") { - return nil, errors.Wrapf(err, "must specify both upperdir and workdir") - } - overlayOpts = &overlay.Options{RootUID: c.RootUID(), RootGID: c.RootGID(), UpperDirOptionFragment: upperDir, @@ -585,11 +605,22 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { // Add overlay volumes for _, overlayVol := range c.config.OverlayVolumes { + upperDir, workDir, err := getOverlayUpperAndWorkDir(overlayVol.Options) + if err != nil { + return nil, err + } contentDir, err := overlay.TempDir(c.config.StaticDir, c.RootUID(), c.RootGID()) if err != nil { return nil, err } - overlayMount, err := overlay.Mount(contentDir, overlayVol.Source, overlayVol.Dest, c.RootUID(), c.RootGID(), c.runtime.store.GraphOptions()) + overlayOpts := &overlay.Options{RootUID: c.RootUID(), + RootGID: c.RootGID(), + UpperDirOptionFragment: upperDir, + WorkDirOptionFragment: workDir, + GraphOpts: 
c.runtime.store.GraphOptions(), + } + + overlayMount, err := overlay.MountWithOptions(contentDir, overlayVol.Source, overlayVol.Dest, overlayOpts) if err != nil { return nil, errors.Wrapf(err, "mounting overlay failed %q", overlayVol.Source) } @@ -839,6 +870,7 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { if err != nil { return nil, err } + g.SetLinuxCgroupsPath(cgroupPath) // Warning: CDI may alter g.Config in place. @@ -1110,7 +1142,7 @@ func (c *Container) addCheckpointImageMetadata(importBuilder *buildah.Builder) e return fmt.Errorf("getting host info: %v", err) } - criuVersion, err := criu.GetCriuVestion() + criuVersion, err := criu.GetCriuVersion() if err != nil { return fmt.Errorf("getting criu version: %v", err) } @@ -1179,7 +1211,7 @@ func (c *Container) createCheckpointImage(ctx context.Context, options Container if err != nil { return err } - // Clean-up buildah working container + // Clean up buildah working container defer func() { if err := importBuilder.Delete(); err != nil { logrus.Errorf("Image builder delete failed: %v", err) @@ -1473,7 +1505,7 @@ func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointO c.state.Restored = false c.state.RestoredTime = time.Time{} - // Cleanup Storage and Network + // Clean up Storage and Network if err := c.cleanup(ctx); err != nil { return nil, 0, err } @@ -2218,8 +2250,19 @@ func (c *Container) makeBindMounts() error { } } + _, hasRunContainerenv := c.state.BindMounts["/run/.containerenv"] + if !hasRunContainerenv { + // check in the spec mounts + for _, m := range c.config.Spec.Mounts { + if m.Destination == "/run/.containerenv" || m.Destination == "/run" { + hasRunContainerenv = true + break + } + } + } + // Make .containerenv if it does not exist - if _, ok := c.state.BindMounts["/run/.containerenv"]; !ok { + if !hasRunContainerenv { containerenv := c.runtime.graphRootMountedFlag(c.config.Spec.Mounts) isRootless := 0 if rootless.IsRootless() { @@ -2284,49 +2327,10 @@ rootless=%d // generateResolvConf generates a containers resolv.conf func (c *Container) generateResolvConf() error { var ( - nameservers []string networkNameServers []string networkSearchDomains []string ) - hostns := true - resolvConf := "/etc/resolv.conf" - for _, namespace := range c.config.Spec.Linux.Namespaces { - if namespace.Type == spec.NetworkNamespace { - hostns = false - if namespace.Path != "" && !strings.HasPrefix(namespace.Path, "/proc/") { - definedPath := filepath.Join("/etc/netns", filepath.Base(namespace.Path), "resolv.conf") - _, err := os.Stat(definedPath) - if err == nil { - resolvConf = definedPath - } else if !os.IsNotExist(err) { - return err - } - } - break - } - } - - contents, err := ioutil.ReadFile(resolvConf) - // resolv.conf doesn't have to exists - if err != nil && !os.IsNotExist(err) { - return err - } - - ns := resolvconf.GetNameservers(contents) - // check if systemd-resolved is used, assume it is used when 127.0.0.53 is the only nameserver - if !hostns && len(ns) == 1 && ns[0] == "127.0.0.53" { - // read the actual resolv.conf file for systemd-resolved - resolvedContents, err := ioutil.ReadFile("/run/systemd/resolve/resolv.conf") - if err != nil { - if !os.IsNotExist(err) { - return errors.Wrapf(err, "detected that systemd-resolved is in use, but could not locate real resolv.conf") - } - } else { - contents = resolvedContents - } - } - netStatus := c.getNetworkStatus() for _, status := range netStatus { if status.DNSServerIPs != nil { @@ -2346,34 +2350,18 @@ func (c *Container) 
generateResolvConf() error { return err } - // Ensure that the container's /etc/resolv.conf is compatible with its - // network configuration. - resolv, err := resolvconf.FilterResolvDNS(contents, ipv6, !hostns) - if err != nil { - return errors.Wrapf(err, "error parsing host resolv.conf") - } - - dns := make([]net.IP, 0, len(c.runtime.config.Containers.DNSServers)+len(c.config.DNSServer)) - for _, i := range c.runtime.config.Containers.DNSServers { - result := net.ParseIP(i) - if result == nil { - return errors.Wrapf(define.ErrInvalidArg, "invalid IP address %s", i) - } - dns = append(dns, result) + nameservers := make([]string, 0, len(c.runtime.config.Containers.DNSServers)+len(c.config.DNSServer)) + nameservers = append(nameservers, c.runtime.config.Containers.DNSServers...) + for _, ip := range c.config.DNSServer { + nameservers = append(nameservers, ip.String()) } - dns = append(dns, c.config.DNSServer...) // If the user provided dns, it trumps all; then dns masq; then resolv.conf var search []string - switch { - case len(dns) > 0: - // We store DNS servers as net.IP, so need to convert to string - for _, server := range dns { - nameservers = append(nameservers, server.String()) - } - default: - // Make a new resolv.conf + keepHostServers := false + if len(nameservers) == 0 { + keepHostServers = true // first add the nameservers from the networks status - nameservers = append(nameservers, networkNameServers...) + nameservers = networkNameServers // when we add network dns server we also have to add the search domains search = networkSearchDomains // slirp4netns has a built in DNS forwarder. @@ -2385,38 +2373,34 @@ func (c *Container) generateResolvConf() error { nameservers = append(nameservers, slirp4netnsDNS.String()) } } - nameservers = append(nameservers, resolvconf.GetNameservers(resolv.Content)...) } if len(c.config.DNSSearch) > 0 || len(c.runtime.config.Containers.DNSSearches) > 0 { - if !cutil.StringInSlice(".", c.config.DNSSearch) { - search = append(search, c.runtime.config.Containers.DNSSearches...) - search = append(search, c.config.DNSSearch...) - } - } else { - search = append(search, resolvconf.GetSearchDomains(resolv.Content)...) + customSearch := make([]string, 0, len(c.config.DNSSearch)+len(c.runtime.config.Containers.DNSSearches)) + customSearch = append(customSearch, c.runtime.config.Containers.DNSSearches...) + customSearch = append(customSearch, c.config.DNSSearch...) + search = customSearch } - var options []string - if len(c.config.DNSOption) > 0 || len(c.runtime.config.Containers.DNSOptions) > 0 { - options = c.runtime.config.Containers.DNSOptions - options = append(options, c.config.DNSOption...) - } else { - options = resolvconf.GetOptions(resolv.Content) - } + options := make([]string, 0, len(c.config.DNSOption)+len(c.runtime.config.Containers.DNSOptions)) + options = append(options, c.runtime.config.Containers.DNSOptions...) + options = append(options, c.config.DNSOption...) 
destPath := filepath.Join(c.state.RunDir, "resolv.conf") - if err := os.Remove(destPath); err != nil && !os.IsNotExist(err) { - return errors.Wrapf(err, "container %s", c.ID()) - } - - // Build resolv.conf - if _, err = resolvconf.Build(destPath, nameservers, search, options); err != nil { + if err := resolvconf.New(&resolvconf.Params{ + IPv6Enabled: ipv6, + KeepHostServers: keepHostServers, + Nameservers: nameservers, + Namespaces: c.config.Spec.Linux.Namespaces, + Options: options, + Path: destPath, + Searches: search, + }); err != nil { return errors.Wrapf(err, "error building resolv.conf for container %s", c.ID()) } - return c.bindMountRootFile(destPath, "/etc/resolv.conf") + return c.bindMountRootFile(destPath, resolvconf.DefaultResolvConf) } // Check if a container uses IPv6. @@ -2457,31 +2441,13 @@ func (c *Container) addNameserver(ips []string) error { } // Do we have a resolv.conf at all? - path, ok := c.state.BindMounts["/etc/resolv.conf"] + path, ok := c.state.BindMounts[resolvconf.DefaultResolvConf] if !ok { return nil } - // Read in full contents, parse out existing nameservers - contents, err := ioutil.ReadFile(path) - if err != nil { - return err - } - ns := resolvconf.GetNameservers(contents) - options := resolvconf.GetOptions(contents) - search := resolvconf.GetSearchDomains(contents) - - // We could verify that it doesn't already exist - // but extra nameservers shouldn't harm anything. - // Ensure we are the first entry in resolv.conf though, otherwise we - // might be after user-added servers. - ns = append(ips, ns...) - - // We're rewriting the container's resolv.conf as part of this, but we - // hold the container lock, so there should be no risk of parallel - // modification. - if _, err := resolvconf.Build(path, ns, search, options); err != nil { - return errors.Wrapf(err, "error adding new nameserver to container %s resolv.conf", c.ID()) + if err := resolvconf.Add(path, ips); err != nil { + return fmt.Errorf("adding new nameserver to container %s resolv.conf: %w", c.ID(), err) } return nil @@ -2496,34 +2462,13 @@ func (c *Container) removeNameserver(ips []string) error { } // Do we have a resolv.conf at all? - path, ok := c.state.BindMounts["/etc/resolv.conf"] + path, ok := c.state.BindMounts[resolvconf.DefaultResolvConf] if !ok { return nil } - // Read in full contents, parse out existing nameservers - contents, err := ioutil.ReadFile(path) - if err != nil { - return err - } - ns := resolvconf.GetNameservers(contents) - options := resolvconf.GetOptions(contents) - search := resolvconf.GetSearchDomains(contents) - - toRemove := make(map[string]bool) - for _, ip := range ips { - toRemove[ip] = true - } - - newNS := make([]string, 0, len(ns)) - for _, server := range ns { - if !toRemove[server] { - newNS = append(newNS, server) - } - } - - if _, err := resolvconf.Build(path, newNS, search, options); err != nil { - return errors.Wrapf(err, "error removing nameservers from container %s resolv.conf", c.ID()) + if err := resolvconf.Remove(path, ips); err != nil { + return fmt.Errorf("removing nameservers from container %s resolv.conf: %w", c.ID(), err) } return nil @@ -2656,13 +2601,13 @@ func (c *Container) generateCurrentUserGroupEntry() (string, int, error) { return "", 0, errors.Wrapf(err, "failed to get current group") } - // Lookup group name to see if it exists in the image. + // Look up group name to see if it exists in the image. 
_, err = lookup.GetGroup(c.state.Mountpoint, g.Name) if err != runcuser.ErrNoGroupEntries { return "", 0, err } - // Lookup GID to see if it exists in the image. + // Look up GID to see if it exists in the image. _, err = lookup.GetGroup(c.state.Mountpoint, g.Gid) if err != runcuser.ErrNoGroupEntries { return "", 0, err @@ -2699,7 +2644,7 @@ func (c *Container) generateUserGroupEntry(addedGID int) (string, error) { gid, err := strconv.ParseUint(group, 10, 32) if err != nil { - return "", nil // nolint: nilerr + return "", nil //nolint: nilerr } if addedGID != 0 && addedGID == int(gid) { @@ -2732,7 +2677,7 @@ func (c *Container) generatePasswdEntry() (string, error) { addedUID := 0 for _, userid := range c.config.HostUsers { - // Lookup User on host + // Look up User on host u, err := util.LookupUser(userid) if err != nil { return "", err @@ -2784,13 +2729,13 @@ func (c *Container) generateCurrentUserPasswdEntry() (string, int, int, error) { } func (c *Container) userPasswdEntry(u *user.User) (string, error) { - // Lookup the user to see if it exists in the container image. + // Look up the user to see if it exists in the container image. _, err := lookup.GetUser(c.state.Mountpoint, u.Username) if err != runcuser.ErrNoPasswdEntries { return "", err } - // Lookup the UID to see if it exists in the container image. + // Look up the UID to see if it exists in the container image. _, err = lookup.GetUser(c.state.Mountpoint, u.Uid) if err != runcuser.ErrNoPasswdEntries { return "", err @@ -2855,14 +2800,14 @@ func (c *Container) generateUserPasswdEntry(addedUID int) (string, error) { // If a non numeric User, then don't generate passwd uid, err := strconv.ParseUint(userspec, 10, 32) if err != nil { - return "", nil // nolint: nilerr + return "", nil //nolint: nilerr } if addedUID != 0 && int(uid) == addedUID { return "", nil } - // Lookup the user to see if it exists in the container image + // Look up the user to see if it exists in the container image _, err = lookup.GetUser(c.state.Mountpoint, userspec) if err != runcuser.ErrNoPasswdEntries { return "", err @@ -3280,7 +3225,7 @@ func (c *Container) fixVolumePermissions(v *ContainerNamedVolume) error { return err } stat := st.Sys().(*syscall.Stat_t) - atime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec)) // nolint: unconvert + atime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec)) //nolint: unconvert if err := os.Chtimes(mountPoint, atime, st.ModTime()); err != nil { return err } diff --git a/libpod/container_log.go b/libpod/container_log.go index 7a9eb2dbf..da6d51670 100644 --- a/libpod/container_log.go +++ b/libpod/container_log.go @@ -75,7 +75,6 @@ func (c *Container) readFromLogFile(ctx context.Context, options *logs.LogOption go func() { defer options.WaitGroup.Done() - var partial string for line := range t.Lines { select { case <-ctx.Done(): @@ -89,13 +88,6 @@ func (c *Container) readFromLogFile(ctx context.Context, options *logs.LogOption logrus.Errorf("Getting new log line: %v", err) continue } - if nll.Partial() { - partial += nll.Msg - continue - } else if !nll.Partial() && len(partial) > 0 { - nll.Msg = partial + nll.Msg - partial = "" - } nll.CID = c.ID() nll.CName = c.Name() nll.ColorID = colorID diff --git a/libpod/container_log_linux.go b/libpod/container_log_linux.go index deb726526..7f90332c7 100644 --- a/libpod/container_log_linux.go +++ b/libpod/container_log_linux.go @@ -292,11 +292,12 @@ func formatterPrefix(entry *sdjournal.JournalEntry) (string, error) { if !ok { return "", errors.Errorf("no PRIORITY 
field present in journal entry") } - if priority == journaldLogOut { + switch priority { + case journaldLogOut: output += "stdout " - } else if priority == journaldLogErr { + case journaldLogErr: output += "stderr " - } else { + default: return "", errors.Errorf("unexpected PRIORITY field in journal entry") } diff --git a/libpod/define/container_inspect.go b/libpod/define/container_inspect.go index e7b82d654..ccc4ae00f 100644 --- a/libpod/define/container_inspect.go +++ b/libpod/define/container_inspect.go @@ -259,9 +259,7 @@ type HealthCheckLog struct { // as possible from the spec and container config. // Some things cannot be inferred. These will be populated by spec annotations // (if available). -// Field names are fixed for compatibility and cannot be changed. -// As such, silence lint warnings about them. -//nolint +//nolint:revive,stylecheck // Field names are fixed for compatibility and cannot be changed. type InspectContainerHostConfig struct { // Binds contains an array of user-added mounts. // Both volume mounts and named volumes are included. diff --git a/libpod/define/errors.go b/libpod/define/errors.go index f5a7c73e5..9757a85b1 100644 --- a/libpod/define/errors.go +++ b/libpod/define/errors.go @@ -24,6 +24,10 @@ var ( // not exist. ErrNoSuchExecSession = errors.New("no such exec session") + // ErrNoSuchExitCode indicates that the requested container exit code + // does not exist. + ErrNoSuchExitCode = errors.New("no such exit code") + // ErrDepExists indicates that the current object has dependencies and // cannot be removed before them. ErrDepExists = errors.New("dependency exists") diff --git a/libpod/define/volume_inspect.go b/libpod/define/volume_inspect.go index fac179176..aaa23b4fc 100644 --- a/libpod/define/volume_inspect.go +++ b/libpod/define/volume_inspect.go @@ -57,3 +57,9 @@ type InspectVolumeData struct { // UID/GID. NeedsChown bool `json:"NeedsChown,omitempty"` } + +type VolumeReload struct { + Added []string + Removed []string + Errors []error +} diff --git a/libpod/events.go b/libpod/events.go index f09d8402a..bb50df92d 100644 --- a/libpod/events.go +++ b/libpod/events.go @@ -33,6 +33,16 @@ func (c *Container) newContainerEvent(status events.Status) { Attributes: c.Labels(), } + // if the current event is a HealthStatus event, we need to get the current + // status of the container to pass to the event + if status == events.HealthStatus { + containerHealthStatus, err := c.healthCheckStatus() + if err != nil { + e.HealthStatus = fmt.Sprintf("%v", err) + } + e.HealthStatus = containerHealthStatus + } + if err := c.runtime.eventer.Write(e); err != nil { logrus.Errorf("Unable to write pod event: %q", err) } @@ -151,6 +161,9 @@ func (r *Runtime) GetEvents(ctx context.Context, filters []string) ([]*events.Ev // GetLastContainerEvent takes a container name or ID and an event status and returns // the last occurrence of the container event func (r *Runtime) GetLastContainerEvent(ctx context.Context, nameOrID string, containerEvent events.Status) (*events.Event, error) { + // FIXME: events should be read in reverse order! 
+ // https://github.com/containers/podman/issues/14579 + // check to make sure the event.Status is valid if _, err := events.StringToStatus(containerEvent.String()); err != nil { return nil, err diff --git a/libpod/events/config.go b/libpod/events/config.go index 2e7016136..a678baa2d 100644 --- a/libpod/events/config.go +++ b/libpod/events/config.go @@ -40,6 +40,8 @@ type Event struct { Time time.Time // Type of event that occurred Type Type + // Health status of the current container + HealthStatus string `json:"health_status,omitempty"` Details } @@ -141,6 +143,8 @@ const ( Exited Status = "died" // Export ... Export Status = "export" + // HealthStatus ... + HealthStatus Status = "health_status" // History ... History Status = "history" // Import ... diff --git a/libpod/events/events.go b/libpod/events/events.go index a30e0f1ca..a8001ab95 100644 --- a/libpod/events/events.go +++ b/libpod/events/events.go @@ -76,7 +76,7 @@ func (e *Event) ToHumanReadable(truncate bool) string { } switch e.Type { case Container, Pod: - humanFormat = fmt.Sprintf("%s %s %s %s (image=%s, name=%s", e.Time, e.Type, e.Status, id, e.Image, e.Name) + humanFormat = fmt.Sprintf("%s %s %s %s (image=%s, name=%s, health_status=%s", e.Time, e.Type, e.Status, id, e.Image, e.Name, e.HealthStatus) // check if the container has labels and add it to the output if len(e.Attributes) > 0 { for k, v := range e.Attributes { @@ -168,6 +168,8 @@ func StringToStatus(name string) (Status, error) { return Exited, nil case Export.String(): return Export, nil + case HealthStatus.String(): + return HealthStatus, nil case History.String(): return History, nil case Import.String(): diff --git a/libpod/events/journal_linux.go b/libpod/events/journal_linux.go index 866042a4c..036638d34 100644 --- a/libpod/events/journal_linux.go +++ b/libpod/events/journal_linux.go @@ -58,13 +58,14 @@ func (e EventJournalD) Write(ee Event) error { } m["PODMAN_LABELS"] = string(b) } + m["PODMAN_HEALTH_STATUS"] = ee.HealthStatus case Network: m["PODMAN_ID"] = ee.ID m["PODMAN_NETWORK_NAME"] = ee.Network case Volume: m["PODMAN_NAME"] = ee.Name } - return journal.Send(string(ee.ToHumanReadable(false)), journal.PriInfo, m) + return journal.Send(ee.ToHumanReadable(false), journal.PriInfo, m) } // Read reads events from the journal and sends qualified events to the event channel @@ -167,10 +168,9 @@ func (e EventJournalD) Read(ctx context.Context, options ReadOptions) error { } } return nil - } -func newEventFromJournalEntry(entry *sdjournal.JournalEntry) (*Event, error) { //nolint +func newEventFromJournalEntry(entry *sdjournal.JournalEntry) (*Event, error) { newEvent := Event{} eventType, err := StringToType(entry.Fields["PODMAN_TYPE"]) if err != nil { @@ -214,6 +214,7 @@ func newEventFromJournalEntry(entry *sdjournal.JournalEntry) (*Event, error) { / newEvent.Details = Details{Attributes: labels} } } + newEvent.HealthStatus = entry.Fields["PODMAN_HEALTH_STATUS"] case Network: newEvent.ID = entry.Fields["PODMAN_ID"] newEvent.Network = entry.Fields["PODMAN_NETWORK_NAME"] diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 40af9aec3..95c70b60e 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -26,7 +26,7 @@ const ( func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { container, err := r.LookupContainer(name) if err != nil { - return define.HealthCheckContainerNotFound, errors.Wrapf(err, "unable to lookup %s to perform a health check", name) + return define.HealthCheckContainerNotFound, errors.Wrapf(err, "unable 
to look up %s to perform a health check", name) } hcStatus, err := checkHealthCheckCanBeRun(container) if err == nil { @@ -90,7 +90,7 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) { hcResult := define.HealthCheckSuccess config := new(ExecConfig) config.Command = newCommand - exitCode, hcErr := c.Exec(config, streams, nil) + exitCode, hcErr := c.exec(config, streams, nil, true) if hcErr != nil { errCause := errors.Cause(hcErr) hcResult = define.HealthCheckFailure @@ -232,18 +232,27 @@ func (c *Container) getHealthCheckLog() (define.HealthCheckResults, error) { // HealthCheckStatus returns the current state of a container with a healthcheck func (c *Container) HealthCheckStatus() (string, error) { + c.lock.Lock() + defer c.lock.Unlock() + return c.healthCheckStatus() +} + +// Internal function to return the current state of a container with a healthcheck. +// This function does not lock the container. +func (c *Container) healthCheckStatus() (string, error) { if !c.HasHealthCheck() { return "", errors.Errorf("container %s has no defined healthcheck", c.ID()) } - c.lock.Lock() - defer c.lock.Unlock() + if err := c.syncContainer(); err != nil { return "", err } + results, err := c.getHealthCheckLog() if err != nil { return "", errors.Wrapf(err, "unable to get healthcheck log for %s", c.ID()) } + return results.Status, nil } diff --git a/libpod/kube.go b/libpod/kube.go index 20c4612d1..bd4230d66 100644 --- a/libpod/kube.go +++ b/libpod/kube.go @@ -43,8 +43,8 @@ func GenerateForKube(ctx context.Context, ctrs []*Container) (*v1.Pod, error) { func (p *Pod) GenerateForKube(ctx context.Context) (*v1.Pod, []v1.ServicePort, error) { // Generate the v1.Pod yaml description var ( - ports []v1.ContainerPort //nolint - servicePorts []v1.ServicePort //nolint + ports []v1.ContainerPort + servicePorts []v1.ServicePort ) allContainers, err := p.allContainers() diff --git a/libpod/lock/file/file_lock.go b/libpod/lock/file/file_lock.go index 4685872b6..145aa6e26 100644 --- a/libpod/lock/file/file_lock.go +++ b/libpod/lock/file/file_lock.go @@ -14,7 +14,7 @@ import ( // FileLocks is a struct enabling POSIX lock locking in a shared memory // segment. -type FileLocks struct { // nolint +type FileLocks struct { //nolint:revive // struct name stutters lockPath string valid bool } diff --git a/libpod/lock/shm/shm_lock.go b/libpod/lock/shm/shm_lock.go index c7f4d1bc5..6eaf37e48 100644 --- a/libpod/lock/shm/shm_lock.go +++ b/libpod/lock/shm/shm_lock.go @@ -28,7 +28,7 @@ var ( // SHMLocks is a struct enabling POSIX semaphore locking in a shared memory // segment. 
-type SHMLocks struct { // nolint +type SHMLocks struct { lockStruct *C.shm_struct_t maxLocks uint32 valid bool diff --git a/libpod/networking_linux.go b/libpod/networking_linux.go index 37fa9b5f5..a83423c9f 100644 --- a/libpod/networking_linux.go +++ b/libpod/networking_linux.go @@ -21,6 +21,7 @@ import ( "github.com/containernetworking/plugins/pkg/ns" "github.com/containers/common/libnetwork/etchosts" + "github.com/containers/common/libnetwork/resolvconf" "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/config" "github.com/containers/common/pkg/machine" @@ -30,11 +31,10 @@ import ( "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/errorhandling" "github.com/containers/podman/v4/pkg/namespaces" - "github.com/containers/podman/v4/pkg/resolvconf" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/utils" "github.com/containers/storage/pkg/lockfile" - spec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -109,7 +109,7 @@ func (r *RootlessNetNS) getPath(path string) string { func (r *RootlessNetNS) Do(toRun func() error) error { err := r.ns.Do(func(_ ns.NetNS) error { // Before we can run the given function, - // we have to setup all mounts correctly. + // we have to set up all mounts correctly. // The order of the mounts is IMPORTANT. // The idea of the extra mount ns is to make /run and /var/lib/cni writeable @@ -291,7 +291,7 @@ func (r *RootlessNetNS) Do(toRun func() error) error { return err } -// Cleanup the rootless network namespace if needed. +// Clean up the rootless network namespace if needed. // It checks if we have running containers with the bridge network mode. 
// Cleanup() expects that r.Lock is locked func (r *RootlessNetNS) Cleanup(runtime *Runtime) error { @@ -419,7 +419,7 @@ func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { if err != nil { return nil, errors.Wrap(err, "error creating rootless network namespace") } - // setup slirp4netns here + // set up slirp4netns here path := r.config.Engine.NetworkCmdPath if path == "" { var err error @@ -526,23 +526,19 @@ func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { return nil, errors.Wrapf(err, "failed to determine slirp4netns DNS address from cidr: %s", cidr.String()) } } - conf, err := resolvconf.Get() - if err != nil { - return nil, err - } - conf, err = resolvconf.FilterResolvDNS(conf.Content, netOptions.enableIPv6, true) - if err != nil { - return nil, err - } - searchDomains := resolvconf.GetSearchDomains(conf.Content) - dnsOptions := resolvconf.GetOptions(conf.Content) - nameServers := resolvconf.GetNameservers(conf.Content) - _, err = resolvconf.Build(filepath.Join(rootlessNetNsDir, "resolv.conf"), append([]string{resolveIP.String()}, nameServers...), searchDomains, dnsOptions) - if err != nil { + if err := resolvconf.New(&resolvconf.Params{ + Path: filepath.Join(rootlessNetNsDir, "resolv.conf"), + // fake the netns since we want to filter localhost + Namespaces: []specs.LinuxNamespace{ + {Type: specs.NetworkNamespace}, + }, + IPv6Enabled: netOptions.enableIPv6, + KeepHostServers: true, + Nameservers: []string{resolveIP.String()}, + }); err != nil { return nil, errors.Wrap(err, "failed to create rootless netns resolv.conf") } - // create cni directories to store files // they will be bind mounted to the correct location in a extra mount ns err = os.MkdirAll(filepath.Join(rootlessNetNsDir, persistentCNIDir), 0700) @@ -660,9 +656,9 @@ func (r *Runtime) configureNetNS(ctr *Container, ctrNS ns.NetNS) (status map[str return nil, err } - // setup rootless port forwarder when rootless with ports and the network status is empty, + // set up rootless port forwarder when rootless with ports and the network status is empty, // if this is called from network reload the network status will not be empty and we should - // not setup port because they are still active + // not set up port because they are still active if rootless.IsRootless() && len(ctr.config.PortMappings) > 0 && ctr.getNetworkStatus() == nil { // set up port forwarder for rootless netns netnsPath := ctrNS.Path() @@ -787,7 +783,7 @@ func (r *Runtime) teardownNetwork(ns string, opts types.NetworkOptions) error { // execute the cni setup in the rootless net ns err = rootlessNetNS.Do(tearDownPod) if cerr := rootlessNetNS.Cleanup(r); cerr != nil { - logrus.WithError(err).Error("failed to cleanup rootless netns") + logrus.WithError(err).Error("failed to clean up rootless netns") } rootlessNetNS.Lock.Unlock() } else { @@ -1089,7 +1085,7 @@ func (c *Container) getContainerNetworkInfo() (*define.InspectNetworkSettings, e func (c *Container) joinedNetworkNSPath() string { for _, namespace := range c.config.Spec.Linux.Namespaces { - if namespace.Type == spec.NetworkNamespace { + if namespace.Type == specs.NetworkNamespace { return namespace.Path } } diff --git a/libpod/oci_conmon_attach_linux.go b/libpod/oci_conmon_attach_linux.go index 155a8fbc3..26f9ba083 100644 --- a/libpod/oci_conmon_attach_linux.go +++ b/libpod/oci_conmon_attach_linux.go @@ -120,7 +120,7 @@ func (r *ConmonOCIRuntime) Attach(c *Container, params *AttachOptions) error { // conmon will then send the exit code of the exec process, or an 
error in the exec session // startFd must be the input side of the fd. // newSize resizes the tty to this size before the process is started, must be nil if the exec session has no tty -// conmon will wait to start the exec session until the parent process has setup the console socket. +// conmon will wait to start the exec session until the parent process has set up the console socket. // Once attachToExec successfully attaches to the console socket, the child conmon process responsible for calling runtime exec // will read from the output side of start fd, thus learning to start the child process. // Thus, the order goes as follow: diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index 6aa7ce6dc..7a9ae7ee5 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -23,6 +23,9 @@ import ( "text/template" "time" + runcconfig "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/config" conmonConfig "github.com/containers/conmon/runner/config" @@ -264,11 +267,6 @@ func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *Conta // status, but will instead only check for the existence of the conmon exit file // and update state to stopped if it exists. func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { - exitFile, err := r.ExitFilePath(ctr) - if err != nil { - return err - } - runtimeDir, err := util.GetRuntimeDir() if err != nil { return err @@ -340,22 +338,10 @@ func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { // Only grab exit status if we were not already stopped // If we were, it should already be in the database if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { - var fi os.FileInfo - chWait := make(chan error) - defer close(chWait) - - _, err := WaitForFile(exitFile, chWait, time.Second*5) - if err == nil { - fi, err = os.Stat(exitFile) - } - if err != nil { - ctr.state.ExitCode = -1 - ctr.state.FinishedTime = time.Now() - logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err) - return nil + if _, err := ctr.Wait(context.Background()); err != nil { + logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err) } - - return ctr.handleExitFile(exitFile, fi) + return nil } // Handle ContainerStateStopping - keep it unless the container @@ -411,8 +397,8 @@ func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) if err2 := r.UpdateContainerStatus(ctr); err2 != nil { logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2) } - if ctr.state.State == define.ContainerStateExited { - return nil + if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { + return define.ErrCtrStateInvalid } return errors.Wrapf(err, "error sending signal to container %s", ctr.ID()) } @@ -1014,7 +1000,7 @@ func (r *ConmonOCIRuntime) getLogTag(ctr *Container) (string, error) { data, err := ctr.inspectLocked(false) if err != nil { // FIXME: this error should probably be returned - return "", nil // nolint: nilerr + return "", nil //nolint: nilerr } tmpl, err := template.New("container").Parse(logTag) if err != nil { @@ -1166,7 +1152,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co }).Debugf("running conmon: %s", r.conmonPath) cmd := exec.Command(r.conmonPath, args...) 
- cmd.Dir = ctr.bundlePath() cmd.SysProcAttr = &syscall.SysProcAttr{ Setpgid: true, } @@ -1354,8 +1339,6 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p logDriverArg = define.NoLogging case define.PassthroughLogging: logDriverArg = define.PassthroughLogging - case define.JSONLogging: - fallthrough //lint:ignore ST1015 the default case has to be here default: //nolint:stylecheck,gocritic // No case here should happen except JSONLogging, but keep this here in case the options are extended @@ -1365,6 +1348,8 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough fallthrough + case define.JSONLogging: + fallthrough case define.KubernetesLogging: logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath) } @@ -1435,7 +1420,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec } // $INVOCATION_ID is set by systemd when running as a service. - if os.Getenv("INVOCATION_ID") != "" { + if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" { mustCreateCgroup = false } @@ -1451,9 +1436,14 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec // TODO: This should be a switch - we are not guaranteed that // there are only 2 valid cgroup managers cgroupParent := ctr.CgroupParent() + cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") + Resource := ctr.Spec().Linux.Resources + cgroupResources, err := GetLimits(Resource) + if err != nil { + logrus.StandardLogger().Log(logLevel, "Could not get ctr resources") + } if ctr.CgroupManager() == config.SystemdCgroupsManager { unitName := createUnitName("libpod-conmon", ctr.ID()) - realCgroupParent := cgroupParent splitParent := strings.Split(cgroupParent, "/") if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { @@ -1465,8 +1455,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err) } } else { - cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") - control, err := cgroups.New(cgroupPath, &spec.LinuxResources{}) + control, err := cgroups.New(cgroupPath, &cgroupResources) if err != nil { logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) } else if err := control.AddPid(cmd.Process.Pid); err != nil { @@ -1748,3 +1737,191 @@ func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, } } } + +// GetLimits converts spec resource limits to cgroup consumable limits +func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) { + if resource == nil { + resource = &spec.LinuxResources{} + } + final := &runcconfig.Resources{} + devs := []*devices.Rule{} + + // Devices + for _, entry := range resource.Devices { + if entry.Major == nil || entry.Minor == nil { + continue + } + runeType := 'a' + switch entry.Type { + case "b": + runeType = 'b' + case "c": + runeType = 'c' + } + + devs = append(devs, &devices.Rule{ + Type: devices.Type(runeType), + Major: *entry.Major, + Minor: *entry.Minor, + Permissions: devices.Permissions(entry.Access), + Allow: entry.Allow, + }) + } + final.Devices = devs + + // HugepageLimits + pageLimits := []*runcconfig.HugepageLimit{} + for _, entry := 
range resource.HugepageLimits { + pageLimits = append(pageLimits, &runcconfig.HugepageLimit{ + Pagesize: entry.Pagesize, + Limit: entry.Limit, + }) + } + final.HugetlbLimit = pageLimits + + // Networking + netPriorities := []*runcconfig.IfPrioMap{} + if resource.Network != nil { + for _, entry := range resource.Network.Priorities { + netPriorities = append(netPriorities, &runcconfig.IfPrioMap{ + Interface: entry.Name, + Priority: int64(entry.Priority), + }) + } + } + final.NetPrioIfpriomap = netPriorities + rdma := make(map[string]runcconfig.LinuxRdma) + for name, entry := range resource.Rdma { + rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects} + } + final.Rdma = rdma + + // Memory + if resource.Memory != nil { + if resource.Memory.Limit != nil { + final.Memory = *resource.Memory.Limit + } + if resource.Memory.Reservation != nil { + final.MemoryReservation = *resource.Memory.Reservation + } + if resource.Memory.Swap != nil { + final.MemorySwap = *resource.Memory.Swap + } + if resource.Memory.Swappiness != nil { + final.MemorySwappiness = resource.Memory.Swappiness + } + } + + // CPU + if resource.CPU != nil { + if resource.CPU.Period != nil { + final.CpuPeriod = *resource.CPU.Period + } + if resource.CPU.Quota != nil { + final.CpuQuota = *resource.CPU.Quota + } + if resource.CPU.RealtimePeriod != nil { + final.CpuRtPeriod = *resource.CPU.RealtimePeriod + } + if resource.CPU.RealtimeRuntime != nil { + final.CpuRtRuntime = *resource.CPU.RealtimeRuntime + } + if resource.CPU.Shares != nil { + final.CpuShares = *resource.CPU.Shares + } + final.CpusetCpus = resource.CPU.Cpus + final.CpusetMems = resource.CPU.Mems + } + + // BlkIO + if resource.BlockIO != nil { + if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle) + } + } + if resource.BlockIO.LeafWeight != nil { + final.BlkioLeafWeight = *resource.BlockIO.LeafWeight + } + if resource.BlockIO.Weight != nil { + final.BlkioWeight = *resource.BlockIO.Weight + } + 
if len(resource.BlockIO.WeightDevice) > 0 { + for _, entry := range resource.BlockIO.WeightDevice { + weight := &runcconfig.WeightDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + if entry.Weight != nil { + weight.Weight = *entry.Weight + } + if entry.LeafWeight != nil { + weight.LeafWeight = *entry.LeafWeight + } + weight.BlockIODevice = *dev + final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight) + } + } + } + + // Pids + if resource.Pids != nil { + final.PidsLimit = resource.Pids.Limit + } + + // Networking + if resource.Network != nil { + if resource.Network.ClassID != nil { + final.NetClsClassid = *resource.Network.ClassID + } + } + + // Unified state + final.Unified = resource.Unified + + return *final, nil +} diff --git a/libpod/options.go b/libpod/options.go index a02c05537..9a29fb279 100644 --- a/libpod/options.go +++ b/libpod/options.go @@ -435,6 +435,21 @@ func WithDefaultInfraCommand(cmd string) RuntimeOption { } } +// WithReset instructs libpod to reset all storage to factory defaults. +// All containers, pods, volumes, images, and networks will be removed. +// All directories created by Libpod will be removed. +func WithReset() RuntimeOption { + return func(rt *Runtime) error { + if rt.valid { + return define.ErrRuntimeFinalized + } + + rt.doReset = true + + return nil + } +} + // WithRenumber instructs libpod to perform a lock renumbering while // initializing. This will handle migrations from early versions of libpod with // file locks to newer versions with SHM locking, as well as changes in the @@ -1797,7 +1812,7 @@ func WithHostDevice(dev []specs.LinuxDevice) CtrCreateOption { } } -// WithSelectedPasswordManagement makes it so that the container either does or does not setup /etc/passwd or /etc/group +// WithSelectedPasswordManagement makes it so that the container either does or does not set up /etc/passwd or /etc/group func WithSelectedPasswordManagement(passwd *bool) CtrCreateOption { return func(c *Container) error { if c.valid { @@ -2159,3 +2174,17 @@ func WithPasswdEntry(passwdEntry string) CtrCreateOption { return nil } } + +// WithMountAllDevices sets the option to mount all of a privileged container's +// host devices +func WithMountAllDevices() CtrCreateOption { + return func(ctr *Container) error { + if ctr.valid { + return define.ErrCtrFinalized + } + + ctr.config.MountAllDevices = true + + return nil + } +} diff --git a/libpod/plugin/volume_api.go b/libpod/plugin/volume_api.go index 2818e70c1..2de7db32c 100644 --- a/libpod/plugin/volume_api.go +++ b/libpod/plugin/volume_api.go @@ -35,8 +35,6 @@ var ( hostVirtualPath = "/VolumeDriver.Path" mountPath = "/VolumeDriver.Mount" unmountPath = "/VolumeDriver.Unmount" - // nolint - capabilitiesPath = "/VolumeDriver.Capabilities" ) const ( @@ -199,13 +197,13 @@ func (p *VolumePlugin) verifyReachable() error { // Send a request to the volume plugin for handling. // Callers *MUST* close the response when they are done. 
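The sendRequest() change just below drops the separate hasBody flag: whether a body is marshalled is now inferred from the payload being non-nil, so call sites pass one argument less. A minimal sketch of the same idea using only the standard library (the helper name is invented, it is not part of the plugin code):

    package sketch

    import (
        "bytes"
        "encoding/json"
        "io"
    )

    // requestBody derives the request body from the payload itself: nil means
    // no body, anything else is marshalled to JSON.
    func requestBody(payload interface{}) (io.Reader, error) {
        if payload == nil {
            return bytes.NewReader(nil), nil
        }
        b, err := json.Marshal(payload)
        if err != nil {
            return nil, err
        }
        return bytes.NewReader(b), nil
    }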
-func (p *VolumePlugin) sendRequest(toJSON interface{}, hasBody bool, endpoint string) (*http.Response, error) { +func (p *VolumePlugin) sendRequest(toJSON interface{}, endpoint string) (*http.Response, error) { var ( reqJSON []byte err error ) - if hasBody { + if toJSON != nil { reqJSON, err = json.Marshal(toJSON) if err != nil { return nil, errors.Wrapf(err, "error marshalling request JSON for volume plugin %s endpoint %s", p.Name, endpoint) @@ -276,7 +274,7 @@ func (p *VolumePlugin) CreateVolume(req *volume.CreateRequest) error { logrus.Infof("Creating volume %s using plugin %s", req.Name, p.Name) - resp, err := p.sendRequest(req, true, createPath) + resp, err := p.sendRequest(req, createPath) if err != nil { return err } @@ -293,7 +291,7 @@ func (p *VolumePlugin) ListVolumes() ([]*volume.Volume, error) { logrus.Infof("Listing volumes using plugin %s", p.Name) - resp, err := p.sendRequest(nil, false, listPath) + resp, err := p.sendRequest(nil, listPath) if err != nil { return nil, err } @@ -328,7 +326,7 @@ func (p *VolumePlugin) GetVolume(req *volume.GetRequest) (*volume.Volume, error) logrus.Infof("Getting volume %s using plugin %s", req.Name, p.Name) - resp, err := p.sendRequest(req, true, getPath) + resp, err := p.sendRequest(req, getPath) if err != nil { return nil, err } @@ -363,7 +361,7 @@ func (p *VolumePlugin) RemoveVolume(req *volume.RemoveRequest) error { logrus.Infof("Removing volume %s using plugin %s", req.Name, p.Name) - resp, err := p.sendRequest(req, true, removePath) + resp, err := p.sendRequest(req, removePath) if err != nil { return err } @@ -384,7 +382,7 @@ func (p *VolumePlugin) GetVolumePath(req *volume.PathRequest) (string, error) { logrus.Infof("Getting volume %s path using plugin %s", req.Name, p.Name) - resp, err := p.sendRequest(req, true, hostVirtualPath) + resp, err := p.sendRequest(req, hostVirtualPath) if err != nil { return "", err } @@ -421,7 +419,7 @@ func (p *VolumePlugin) MountVolume(req *volume.MountRequest) (string, error) { logrus.Infof("Mounting volume %s using plugin %s for container %s", req.Name, p.Name, req.ID) - resp, err := p.sendRequest(req, true, mountPath) + resp, err := p.sendRequest(req, mountPath) if err != nil { return "", err } @@ -457,7 +455,7 @@ func (p *VolumePlugin) UnmountVolume(req *volume.UnmountRequest) error { logrus.Infof("Unmounting volume %s using plugin %s for container %s", req.Name, p.Name, req.ID) - resp, err := p.sendRequest(req, true, unmountPath) + resp, err := p.sendRequest(req, unmountPath) if err != nil { return err } diff --git a/libpod/pod.go b/libpod/pod.go index 3c8dc43d4..2502c41a9 100644 --- a/libpod/pod.go +++ b/libpod/pod.go @@ -178,8 +178,8 @@ func (p *Pod) NetworkMode() string { return infra.NetworkMode() } -// PidMode returns the PID mode given by the user ex: pod, private... -func (p *Pod) PidMode() string { +// Namespace Mode returns the given NS mode provided by the user ex: host, private... +func (p *Pod) NamespaceMode(kind specs.LinuxNamespaceType) string { infra, err := p.runtime.GetContainer(p.state.InfraContainerID) if err != nil { return "" @@ -187,28 +187,7 @@ func (p *Pod) PidMode() string { ctrSpec := infra.config.Spec if ctrSpec != nil && ctrSpec.Linux != nil { for _, ns := range ctrSpec.Linux.Namespaces { - if ns.Type == specs.PIDNamespace { - if ns.Path != "" { - return fmt.Sprintf("ns:%s", ns.Path) - } - return "private" - } - } - return "host" - } - return "" -} - -// PidMode returns the PID mode given by the user ex: pod, private... 
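The pod.go hunk here folds the nearly identical PidMode() and UserNSMode() accessors into a single NamespaceMode(kind specs.LinuxNamespaceType); pod_api.go further down calls it as p.NamespaceMode(specs.PIDNamespace) and p.NamespaceMode(specs.UserNamespace). A reduced sketch of the consolidated lookup, with simplified stand-in types instead of the real Pod and OCI spec structs:

    package sketch

    import "fmt"

    // nsKind and namespace are simplified stand-ins for the OCI spec's
    // LinuxNamespaceType and LinuxNamespace.
    type nsKind string

    type namespace struct {
        Type nsKind
        Path string
    }

    // namespaceMode reports how a namespace of the given kind is configured:
    // "ns:<path>" when joining an existing namespace, "private" when listed
    // without a path, and "host" when not listed at all.
    func namespaceMode(namespaces []namespace, kind nsKind) string {
        for _, ns := range namespaces {
            if ns.Type == kind {
                if ns.Path != "" {
                    return fmt.Sprintf("ns:%s", ns.Path)
                }
                return "private"
            }
        }
        return "host"
    }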
-func (p *Pod) UserNSMode() string { - infra, err := p.infraContainer() - if err != nil { - return "" - } - ctrSpec := infra.config.Spec - if ctrSpec != nil && ctrSpec.Linux != nil { - for _, ns := range ctrSpec.Linux.Namespaces { - if ns.Type == specs.UserNamespace { + if ns.Type == kind { if ns.Path != "" { return fmt.Sprintf("ns:%s", ns.Path) } @@ -471,3 +450,14 @@ func (p *Pod) initContainers() ([]*Container, error) { } return initCons, nil } + +func (p *Pod) Config() (*PodConfig, error) { + p.lock.Lock() + defer p.lock.Unlock() + + conf := &PodConfig{} + + err := JSONDeepCopy(p.config, conf) + + return conf, err +} diff --git a/libpod/pod_api.go b/libpod/pod_api.go index 1c1e15984..fefe0e329 100644 --- a/libpod/pod_api.go +++ b/libpod/pod_api.go @@ -9,6 +9,7 @@ import ( "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/parallel" "github.com/containers/podman/v4/pkg/rootless" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" ) @@ -673,8 +674,8 @@ func (p *Pod) Inspect() (*define.InspectPodData, error) { infraConfig.CPUPeriod = p.CPUPeriod() infraConfig.CPUQuota = p.CPUQuota() infraConfig.CPUSetCPUs = p.ResourceLim().CPU.Cpus - infraConfig.PidNS = p.PidMode() - infraConfig.UserNS = p.UserNSMode() + infraConfig.PidNS = p.NamespaceMode(specs.PIDNamespace) + infraConfig.UserNS = p.NamespaceMode(specs.UserNamespace) namedVolumes, mounts := infra.SortUserVolumes(infra.config.Spec) inspectMounts, err = infra.GetMounts(namedVolumes, infra.config.ImageVolumes, mounts) infraSecurity = infra.GetSecurityOptions() diff --git a/libpod/pod_internal.go b/libpod/pod_internal.go index 41f745e6c..1502bcb06 100644 --- a/libpod/pod_internal.go +++ b/libpod/pod_internal.go @@ -69,7 +69,7 @@ func (p *Pod) refresh() error { if p.config.UsePodCgroup { switch p.runtime.config.Engine.CgroupManager { case config.SystemdCgroupsManager: - cgroupPath, err := systemdSliceFromPath(p.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", p.ID())) + cgroupPath, err := systemdSliceFromPath(p.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", p.ID()), p.ResourceLim()) if err != nil { logrus.Errorf("Creating Cgroup for pod %s: %v", p.ID(), err) } diff --git a/libpod/reset.go b/libpod/reset.go index 28d0ee3f6..30eab50fb 100644 --- a/libpod/reset.go +++ b/libpod/reset.go @@ -17,8 +17,78 @@ import ( "github.com/sirupsen/logrus" ) +// removeAllDirs removes all Podman storage directories. It is intended to be +// used as a backup for reset() when that function cannot be used due to +// failures in initializing libpod. +// It does not expect that all the directories match what is in use by Podman, +// as this is a common failure point for `system reset`. As such, our ability to +// interface with containers and pods is somewhat limited. +// This function assumes that we do not have a working c/storage store. +func (r *Runtime) removeAllDirs() error { + var lastErr error + + // Grab the runtime alive lock. + // This ensures that no other Podman process can run while we are doing + // a reset, so no race conditions with containers/pods/etc being created + // while we are resetting storage. + // TODO: maybe want a helper for getting the path? 
This is duped from + // runtime.go + runtimeAliveLock := filepath.Join(r.config.Engine.TmpDir, "alive.lck") + aliveLock, err := storage.GetLockfile(runtimeAliveLock) + if err != nil { + logrus.Errorf("Lock runtime alive lock %s: %v", runtimeAliveLock, err) + } else { + aliveLock.Lock() + defer aliveLock.Unlock() + } + + // We do not have a store - so we can't really try and remove containers + // or pods or volumes... + // Try and remove the directories, in hopes that they are unmounted. + // This is likely to fail but it's the best we can do. + + // Volume path + if err := os.RemoveAll(r.config.Engine.VolumePath); err != nil { + lastErr = errors.Wrapf(err, "removing volume path") + } + + // Tmpdir + if err := os.RemoveAll(r.config.Engine.TmpDir); err != nil { + if lastErr != nil { + logrus.Errorf("Reset: %v", lastErr) + } + lastErr = errors.Wrapf(err, "removing tmp dir") + } + + // Runroot + if err := os.RemoveAll(r.storageConfig.RunRoot); err != nil { + if lastErr != nil { + logrus.Errorf("Reset: %v", lastErr) + } + lastErr = errors.Wrapf(err, "removing run root") + } + + // Static dir + if err := os.RemoveAll(r.config.Engine.StaticDir); err != nil { + if lastErr != nil { + logrus.Errorf("Reset: %v", lastErr) + } + lastErr = errors.Wrapf(err, "removing static dir") + } + + // Graph root + if err := os.RemoveAll(r.storageConfig.GraphRoot); err != nil { + if lastErr != nil { + logrus.Errorf("Reset: %v", lastErr) + } + lastErr = errors.Wrapf(err, "removing graph root") + } + + return lastErr +} + // Reset removes all storage -func (r *Runtime) Reset(ctx context.Context) error { +func (r *Runtime) reset(ctx context.Context) error { var timeout *uint pods, err := r.GetAllPods() if err != nil { diff --git a/libpod/runtime.go b/libpod/runtime.go index e268c2d17..11ec750b1 100644 --- a/libpod/runtime.go +++ b/libpod/runtime.go @@ -96,6 +96,10 @@ type Runtime struct { // This bool is just needed so that we can set it for netavark interface. syslog bool + // doReset indicates that the runtime should perform a system reset. + // All Podman files will be removed. + doReset bool + // doRenumber indicates that the runtime should perform a lock renumber // during initialization. // Once the runtime has been initialized and returned, this variable is @@ -131,7 +135,7 @@ func SetXdgDirs() error { return nil } - // Setup XDG_RUNTIME_DIR + // Set up XDG_RUNTIME_DIR runtimeDir := os.Getenv("XDG_RUNTIME_DIR") if runtimeDir == "" { @@ -152,7 +156,7 @@ func SetXdgDirs() error { } } - // Setup XDG_CONFIG_HOME + // Set up XDG_CONFIG_HOME if cfgHomeDir := os.Getenv("XDG_CONFIG_HOME"); cfgHomeDir == "" { cfgHomeDir, err := util.GetRootlessConfigHomeDir() if err != nil { @@ -235,6 +239,11 @@ func newRuntimeFromConfig(conf *config.Config, options ...RuntimeOption) (*Runti runtime.config.CheckCgroupsAndAdjustConfig() + // If resetting storage, do *not* return a runtime. 
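A note on removeAllDirs() above: every directory removal is attempted even after a failure; errors that are about to be overwritten are logged and only the last one is returned, so the caller still sees that something failed. A minimal sketch of that best-effort pattern (plain printing stands in for logrus):

    package sketch

    import (
        "fmt"
        "os"
    )

    // removeBestEffort attempts every removal even after a failure, prints the
    // errors it is about to overwrite, and returns only the last one, the same
    // shape as removeAllDirs().
    func removeBestEffort(paths ...string) error {
        var lastErr error
        for _, p := range paths {
            if err := os.RemoveAll(p); err != nil {
                if lastErr != nil {
                    fmt.Printf("reset: %v\n", lastErr)
                }
                lastErr = fmt.Errorf("removing %s: %w", p, err)
            }
        }
        return lastErr
    }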
+ if runtime.doReset { + return nil, nil + } + return runtime, nil } @@ -305,6 +314,13 @@ func makeRuntime(runtime *Runtime) (retErr error) { } runtime.conmonPath = cPath + if runtime.noStore && runtime.doReset { + return errors.Wrapf(define.ErrInvalidArg, "cannot perform system reset if runtime is not creating a store") + } + if runtime.doReset && runtime.doRenumber { + return errors.Wrapf(define.ErrInvalidArg, "cannot perform system reset while renumbering locks") + } + // Make the static files directory if it does not exist if err := os.MkdirAll(runtime.config.Engine.StaticDir, 0700); err != nil { // The directory is allowed to exist @@ -339,6 +355,20 @@ func makeRuntime(runtime *Runtime) (retErr error) { // Grab config from the database so we can reset some defaults dbConfig, err := runtime.state.GetDBConfig() if err != nil { + if runtime.doReset { + // We can at least delete the DB and the static files + // directory. + // Can't safely touch anything else because we aren't + // sure of other directories. + if err := runtime.state.Close(); err != nil { + logrus.Errorf("Closing database connection: %v", err) + } else { + if err := os.RemoveAll(runtime.config.Engine.StaticDir); err != nil { + logrus.Errorf("Removing static files directory %v: %v", runtime.config.Engine.StaticDir, err) + } + } + } + return errors.Wrapf(err, "error retrieving runtime configuration from database") } @@ -372,7 +402,13 @@ func makeRuntime(runtime *Runtime) (retErr error) { // Validate our config against the database, now that we've set our // final storage configuration if err := runtime.state.ValidateDBConfig(runtime); err != nil { - return err + // If we are performing a storage reset: continue on with a + // warning. Otherwise we can't `system reset` after a change to + // the core paths. + if !runtime.doReset { + return err + } + logrus.Errorf("Runtime paths differ from those stored in database, storage reset may not remove all files") } if err := runtime.state.SetNamespace(runtime.config.Engine.Namespace); err != nil { @@ -394,6 +430,14 @@ func makeRuntime(runtime *Runtime) (retErr error) { } else if runtime.noStore { logrus.Debug("No store required. Not opening container store.") } else if err := runtime.configureStore(); err != nil { + // Make a best-effort attempt to clean up if performing a + // storage reset. + if runtime.doReset { + if err := runtime.removeAllDirs(); err != nil { + logrus.Errorf("Removing libpod directories: %v", err) + } + } + return err } defer func() { @@ -406,7 +450,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { } }() - // Setup the eventer + // Set up the eventer eventer, err := runtime.newEventer() if err != nil { return err @@ -495,7 +539,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { } } - // the store is only setup when we are in the userns so we do the same for the network interface + // the store is only set up when we are in the userns so we do the same for the network interface if !needsUserns { netBackend, netInterface, err := network.NetworkBackend(runtime.store, runtime.config, runtime.syslog) if err != nil { @@ -575,6 +619,18 @@ func makeRuntime(runtime *Runtime) (retErr error) { return err } + // If we're resetting storage, do it now. + // We will not return a valid runtime. + // TODO: Plumb this context out so it can be set. 
+ if runtime.doReset { + // Mark the runtime as valid, so normal functionality "mostly" + // works and we can use regular functions to remove + // ctrs/pods/etc + runtime.valid = true + + return runtime.reset(context.Background()) + } + // If we're renumbering locks, do it now. // It breaks out of normal runtime init, and will not return a valid // runtime. @@ -818,7 +874,7 @@ func (r *Runtime) DeferredShutdown(force bool) { // still containers running or mounted func (r *Runtime) Shutdown(force bool) error { if !r.valid { - return define.ErrRuntimeStopped + return nil } if r.workerChannel != nil { @@ -1067,7 +1123,7 @@ func (r *Runtime) mergeDBConfig(dbConfig *DBConfig) { if !r.storageSet.GraphDriverNameSet && dbConfig.GraphDriver != "" { if r.storageConfig.GraphDriverName != dbConfig.GraphDriver && r.storageConfig.GraphDriverName != "" { - logrus.Errorf("User-selected graph driver %q overwritten by graph driver %q from database - delete libpod local files to resolve", + logrus.Errorf("User-selected graph driver %q overwritten by graph driver %q from database - delete libpod local files to resolve. May prevent use of images created by other tools", r.storageConfig.GraphDriverName, dbConfig.GraphDriver) } r.storageConfig.GraphDriverName = dbConfig.GraphDriver diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index bdfc102ba..fafec5e12 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -502,7 +502,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai volOptions = append(volOptions, parsedOptions...) } } - newVol, err := r.newVolume(volOptions...) + newVol, err := r.newVolume(false, volOptions...) if err != nil { return nil, errors.Wrapf(err, "error creating named volume %q", vol.Name) } @@ -664,9 +664,6 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo } if c.state.State == define.ContainerStatePaused { - if err := c.ociRuntime.KillContainer(c, 9, false); err != nil { - return err - } isV2, err := cgroups.IsCgroup2UnifiedMode() if err != nil { return err @@ -677,6 +674,9 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo return err } } + if err := c.ociRuntime.KillContainer(c, 9, false); err != nil { + return err + } // Need to update container state to make sure we know it's stopped if err := c.waitForExitFileAndSync(); err != nil { return err @@ -755,7 +755,7 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo if cleanupErr == nil { cleanupErr = err } else { - logrus.Errorf("Cleanup storage: %v", err) + logrus.Errorf("Cleaning up storage: %v", err) } } @@ -805,16 +805,16 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo if !volume.Anonymous() { continue } - if err := runtime.removeVolume(ctx, volume, false, timeout); err != nil && errors.Cause(err) != define.ErrNoSuchVolume { + if err := runtime.removeVolume(ctx, volume, false, timeout, false); err != nil && errors.Cause(err) != define.ErrNoSuchVolume { if errors.Cause(err) == define.ErrVolumeBeingUsed { // Ignore error, since podman will report original error volumesFrom, _ := c.volumesFrom() if len(volumesFrom) > 0 { - logrus.Debugf("Cleanup volume not possible since volume is in use (%s)", v) + logrus.Debugf("Cleaning up volume not possible since volume is in use (%s)", v) continue } } - logrus.Errorf("Cleanup volume (%s): %v", v, err) + logrus.Errorf("Cleaning up volume (%s): %v", v, err) } } } @@ -963,8 +963,8 @@ func (r *Runtime) 
evictContainer(ctx context.Context, idOrName string, removeVol if !volume.Anonymous() { continue } - if err := r.removeVolume(ctx, volume, false, timeout); err != nil && err != define.ErrNoSuchVolume && err != define.ErrVolumeBeingUsed { - logrus.Errorf("Cleanup volume (%s): %v", v, err) + if err := r.removeVolume(ctx, volume, false, timeout, false); err != nil && err != define.ErrNoSuchVolume && err != define.ErrVolumeBeingUsed { + logrus.Errorf("Cleaning up volume (%s): %v", v, err) } } } @@ -1111,7 +1111,7 @@ func (r *Runtime) GetContainersByList(containers []string) ([]*Container, error) for _, inputContainer := range containers { ctr, err := r.LookupContainer(inputContainer) if err != nil { - return ctrs, errors.Wrapf(err, "unable to lookup container %s", inputContainer) + return ctrs, errors.Wrapf(err, "unable to look up container %s", inputContainer) } ctrs = append(ctrs, ctr) } diff --git a/libpod/runtime_pod_linux.go b/libpod/runtime_pod_linux.go index dcc3a044f..00017ca21 100644 --- a/libpod/runtime_pod_linux.go +++ b/libpod/runtime_pod_linux.go @@ -17,7 +17,7 @@ import ( "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/specgen" - spec "github.com/opencontainers/runtime-spec/specs-go" + runcconfig "github.com/opencontainers/runc/libcontainer/configs" "github.com/pkg/errors" "github.com/sirupsen/logrus" ) @@ -66,6 +66,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option case config.CgroupfsCgroupsManager: canUseCgroup := !rootless.IsRootless() || isRootlessCgroupSet(pod.config.CgroupParent) if canUseCgroup { + // need to actually create parent here if pod.config.CgroupParent == "" { pod.config.CgroupParent = CgroupfsDefaultCgroupParent } else if strings.HasSuffix(path.Base(pod.config.CgroupParent), ".slice") { @@ -73,12 +74,26 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option } // If we are set to use pod cgroups, set the cgroup parent that // all containers in the pod will share - // No need to create it with cgroupfs - the first container to - // launch should do it for us if pod.config.UsePodCgroup { pod.state.CgroupPath = filepath.Join(pod.config.CgroupParent, pod.ID()) if p.InfraContainerSpec != nil { p.InfraContainerSpec.CgroupParent = pod.state.CgroupPath + res, err := GetLimits(p.InfraContainerSpec.ResourceLimits) + if err != nil { + return nil, err + } + // Need to both create and update the cgroup + // rather than create a new path in c/common for pod cgroup creation + // just create as if it is a ctr and then update figures out that we need to + // populate the resource limits on the pod level + cgc, err := cgroups.New(pod.state.CgroupPath, &res) + if err != nil { + return nil, err + } + err = cgc.Update(&res) + if err != nil { + return nil, err + } } } } @@ -95,7 +110,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option // If we are set to use pod cgroups, set the cgroup parent that // all containers in the pod will share if pod.config.UsePodCgroup { - cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID())) + cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()), p.InfraContainerSpec.ResourceLimits) if err != nil { return nil, errors.Wrapf(err, "unable to create pod cgroup for pod %s", pod.ID()) } @@ -239,9 +254,8 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force 
bool, } // New resource limits - resLimits := new(spec.LinuxResources) - resLimits.Pids = new(spec.LinuxPids) - resLimits.Pids.Limit = 1 // Inhibit forks with very low pids limit + resLimits := new(runcconfig.Resources) + resLimits.PidsLimit = 1 // Inhibit forks with very low pids limit // Don't try if we failed to retrieve the cgroup if err == nil { @@ -301,7 +315,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, if !volume.Anonymous() { continue } - if err := r.removeVolume(ctx, volume, false, timeout); err != nil { + if err := r.removeVolume(ctx, volume, false, timeout, false); err != nil { if errors.Cause(err) == define.ErrNoSuchVolume || errors.Cause(err) == define.ErrVolumeRemoved { continue } @@ -321,7 +335,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, switch p.runtime.config.Engine.CgroupManager { case config.SystemdCgroupsManager: - if err := deleteSystemdCgroup(p.state.CgroupPath); err != nil { + if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil { if removalErr == nil { removalErr = errors.Wrapf(err, "error removing pod %s cgroup", p.ID()) } else { diff --git a/libpod/runtime_volume.go b/libpod/runtime_volume.go index 21bf8aefc..6872db21d 100644 --- a/libpod/runtime_volume.go +++ b/libpod/runtime_volume.go @@ -33,7 +33,7 @@ func (r *Runtime) RemoveVolume(ctx context.Context, v *Volume, force bool, timeo return nil } } - return r.removeVolume(ctx, v, force, timeout) + return r.removeVolume(ctx, v, force, timeout, false) } // GetVolume retrieves a volume given its full name. diff --git a/libpod/runtime_volume_linux.go b/libpod/runtime_volume_linux.go index f8788e183..17a48be86 100644 --- a/libpod/runtime_volume_linux.go +++ b/libpod/runtime_volume_linux.go @@ -5,6 +5,7 @@ package libpod import ( "context" + "fmt" "os" "path/filepath" "strings" @@ -25,11 +26,13 @@ func (r *Runtime) NewVolume(ctx context.Context, options ...VolumeCreateOption) if !r.valid { return nil, define.ErrRuntimeStopped } - return r.newVolume(options...) + return r.newVolume(false, options...) } -// newVolume creates a new empty volume -func (r *Runtime) newVolume(options ...VolumeCreateOption) (_ *Volume, deferredErr error) { +// newVolume creates a new empty volume with the given options. +// The createPluginVolume can be set to true to make it not create the volume in the volume plugin, +// this is required for the UpdateVolumePlugins() function. If you are not sure set this to false. +func (r *Runtime) newVolume(noCreatePluginVolume bool, options ...VolumeCreateOption) (_ *Volume, deferredErr error) { volume := newVolume(r) for _, option := range options { if err := option(volume); err != nil { @@ -73,7 +76,7 @@ func (r *Runtime) newVolume(options ...VolumeCreateOption) (_ *Volume, deferredE return nil, errors.Wrapf(err, "invalid volume option %s for driver 'local'", key) } } - case "o", "type", "uid", "gid", "size", "inodes", "noquota": + case "o", "type", "uid", "gid", "size", "inodes", "noquota", "copy", "nocopy": // Do nothing, valid keys default: return nil, errors.Wrapf(define.ErrInvalidArg, "invalid mount option %s for driver 'local'", key) @@ -83,7 +86,7 @@ func (r *Runtime) newVolume(options ...VolumeCreateOption) (_ *Volume, deferredE // Now we get conditional: we either need to make the volume in the // volume plugin, or on disk if not using a plugin. 
- if volume.plugin != nil { + if volume.plugin != nil && !noCreatePluginVolume { // We can't chown, or relabel, or similar the path the volume is // using, because it's not managed by us. // TODO: reevaluate this once we actually have volume plugins in @@ -164,6 +167,85 @@ func (r *Runtime) newVolume(options ...VolumeCreateOption) (_ *Volume, deferredE return volume, nil } +// UpdateVolumePlugins reads all volumes from all configured volume plugins and +// imports them into the libpod db. It also checks if existing libpod volumes +// are removed in the plugin, in this case we try to remove it from libpod. +// On errors we continue and try to do as much as possible. all errors are +// returned as array in the returned struct. +// This function has many race conditions, it is best effort but cannot guarantee +// a perfect state since plugins can be modified from the outside at any time. +func (r *Runtime) UpdateVolumePlugins(ctx context.Context) *define.VolumeReload { + var ( + added []string + removed []string + errs []error + allPluginVolumes = map[string]struct{}{} + ) + + for driverName, socket := range r.config.Engine.VolumePlugins { + driver, err := volplugin.GetVolumePlugin(driverName, socket) + if err != nil { + errs = append(errs, err) + continue + } + vols, err := driver.ListVolumes() + if err != nil { + errs = append(errs, fmt.Errorf("failed to read volumes from plugin %q: %w", driverName, err)) + continue + } + for _, vol := range vols { + allPluginVolumes[vol.Name] = struct{}{} + if _, err := r.newVolume(true, WithVolumeName(vol.Name), WithVolumeDriver(driverName)); err != nil { + // If the volume exists this is not an error, just ignore it and log. It is very likely + // that the volume from the plugin was already in our db. + if !errors.Is(err, define.ErrVolumeExists) { + errs = append(errs, err) + continue + } + logrus.Infof("Volume %q already exists: %v", vol.Name, err) + continue + } + added = append(added, vol.Name) + } + } + + libpodVolumes, err := r.state.AllVolumes() + if err != nil { + errs = append(errs, fmt.Errorf("cannot delete dangling plugin volumes: failed to read libpod volumes: %w", err)) + } + for _, vol := range libpodVolumes { + if vol.UsesVolumeDriver() { + if _, ok := allPluginVolumes[vol.Name()]; !ok { + // The volume is no longer in the plugin, lets remove it from the libpod db. + if err := r.removeVolume(ctx, vol, false, nil, true); err != nil { + if errors.Is(err, define.ErrVolumeBeingUsed) { + // Volume is still used by at least one container. This is very bad, + // the plugin no longer has this but we still need it. + errs = append(errs, fmt.Errorf("volume was removed from the plugin %q but containers still require it: %w", vol.config.Driver, err)) + continue + } + if errors.Is(err, define.ErrNoSuchVolume) || errors.Is(err, define.ErrVolumeRemoved) || errors.Is(err, define.ErrMissingPlugin) { + // Volume was already removed, no problem just ignore it and continue. + continue + } + + // some other error + errs = append(errs, err) + continue + } + // Volume was successfully removed + removed = append(removed, vol.Name()) + } + } + } + + return &define.VolumeReload{ + Added: added, + Removed: removed, + Errors: errs, + } +} + // makeVolumeInPluginIfNotExist makes a volume in the given volume plugin if it // does not already exist. 
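UpdateVolumePlugins() above is, at its core, a two-way reconciliation between the volumes reported by the plugins and those recorded in the libpod database. A stripped-down sketch of that shape, with plain string sets instead of the plugin and state types:

    package sketch

    // reconcile captures the shape of UpdateVolumePlugins(): names known only
    // to the plugin should be created in the libpod database, names known only
    // to the database should be removed from it.
    func reconcile(plugin, db map[string]struct{}) (added, removed []string) {
        for name := range plugin {
            if _, ok := db[name]; !ok {
                added = append(added, name)
            }
        }
        for name := range db {
            if _, ok := plugin[name]; !ok {
                removed = append(removed, name)
            }
        }
        return added, removed
    }

The real function additionally treats races as non-fatal: ErrVolumeExists on the add side and ErrNoSuchVolume or ErrVolumeRemoved on the remove side are ignored, while ErrVolumeBeingUsed is surfaced because a container still needs the volume.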
func makeVolumeInPluginIfNotExist(name string, options map[string]string, plugin *volplugin.VolumePlugin) error { @@ -197,8 +279,10 @@ func makeVolumeInPluginIfNotExist(name string, options map[string]string, plugin return nil } -// removeVolume removes the specified volume from state as well tears down its mountpoint and storage -func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeout *uint) error { +// removeVolume removes the specified volume from state as well tears down its mountpoint and storage. +// ignoreVolumePlugin is used to only remove the volume from the db and not the plugin, +// this is required when the volume was already removed from the plugin, i.e. in UpdateVolumePlugins(). +func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeout *uint, ignoreVolumePlugin bool) error { if !v.valid { if ok, _ := r.state.HasVolume(v.Name()); !ok { return nil @@ -263,7 +347,7 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeo var removalErr error // If we use a volume plugin, we need to remove from the plugin. - if v.UsesVolumeDriver() { + if v.UsesVolumeDriver() && !ignoreVolumePlugin { canRemove := true // Do we have a volume driver? diff --git a/libpod/state.go b/libpod/state.go index 471023769..4fbd3c302 100644 --- a/libpod/state.go +++ b/libpod/state.go @@ -111,6 +111,15 @@ type State interface { // Return a container config from the database by full ID GetContainerConfig(id string) (*ContainerConfig, error) + // Add the exit code for the specified container to the database. + AddContainerExitCode(id string, exitCode int32) error + + // Return the exit code for the specified container. + GetContainerExitCode(id string) (int32, error) + + // Remove exit codes older than 5 minutes. + PruneContainerExitCodes() error + // Add creates a reference to an exec session in the database. // The container the exec session is attached to will be recorded. // The container state will not be modified. 
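The state.go hunk above extends the State interface with per-container exit-code storage, including PruneContainerExitCodes(), documented as removing codes older than 5 minutes. A self-contained sketch of that pruning step, with a plain map standing in for the time-stamp bucket:

    package sketch

    import "time"

    // pruneOlderThan drops map entries whose timestamp is older than the given
    // retention and returns the pruned IDs. It mirrors the intent of the new
    // PruneContainerExitCodes() method (5 minute retention).
    func pruneOlderThan(timestamps map[string]time.Time, retention time.Duration) []string {
        var pruned []string
        cutoff := time.Now().Add(-retention)
        for id, ts := range timestamps {
            if ts.Before(cutoff) {
                delete(timestamps, id)
                pruned = append(pruned, id)
            }
        }
        return pruned
    }

A caller would invoke it as pruneOlderThan(stamps, 5*time.Minute).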
diff --git a/libpod/stats.go b/libpod/stats.go index 25baa378d..eaac9d7d0 100644 --- a/libpod/stats.go +++ b/libpod/stats.go @@ -9,6 +9,8 @@ import ( "syscall" "time" + runccgroup "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/containers/common/pkg/cgroups" "github.com/containers/podman/v4/libpod/define" "github.com/pkg/errors" @@ -34,8 +36,9 @@ func (c *Container) GetContainerStats(previousStats *define.ContainerStats) (*de } } + // returns stats with the fields' default values respective of their type if c.state.State != define.ContainerStateRunning && c.state.State != define.ContainerStatePaused { - return stats, define.ErrCtrStateInvalid + return stats, nil } if previousStats == nil { @@ -68,29 +71,29 @@ func (c *Container) GetContainerStats(previousStats *define.ContainerStats) (*de // If the current total usage in the cgroup is less than what was previously // recorded then it means the container was restarted and runs in a new cgroup - if previousStats.Duration > cgroupStats.CPU.Usage.Total { + if previousStats.Duration > cgroupStats.CpuStats.CpuUsage.TotalUsage { previousStats = &define.ContainerStats{} } previousCPU := previousStats.CPUNano now := uint64(time.Now().UnixNano()) - stats.Duration = cgroupStats.CPU.Usage.Total + stats.Duration = cgroupStats.CpuStats.CpuUsage.TotalUsage stats.UpTime = time.Duration(stats.Duration) stats.CPU = calculateCPUPercent(cgroupStats, previousCPU, now, previousStats.SystemNano) // calc the average cpu usage for the time the container is running stats.AvgCPU = calculateCPUPercent(cgroupStats, 0, now, uint64(c.state.StartedTime.UnixNano())) - stats.MemUsage = cgroupStats.Memory.Usage.Usage + stats.MemUsage = cgroupStats.MemoryStats.Usage.Usage stats.MemLimit = c.getMemLimit() stats.MemPerc = (float64(stats.MemUsage) / float64(stats.MemLimit)) * 100 stats.PIDs = 0 if conState == define.ContainerStateRunning || conState == define.ContainerStatePaused { - stats.PIDs = cgroupStats.Pids.Current + stats.PIDs = cgroupStats.PidsStats.Current } stats.BlockInput, stats.BlockOutput = calculateBlockIO(cgroupStats) - stats.CPUNano = cgroupStats.CPU.Usage.Total - stats.CPUSystemNano = cgroupStats.CPU.Usage.Kernel + stats.CPUNano = cgroupStats.CpuStats.CpuUsage.TotalUsage + stats.CPUSystemNano = cgroupStats.CpuStats.CpuUsage.UsageInKernelmode stats.SystemNano = now - stats.PerCPU = cgroupStats.CPU.Usage.PerCPU + stats.PerCPU = cgroupStats.CpuStats.CpuUsage.PercpuUsage // Handle case where the container is not in a network namespace if netStats != nil { stats.NetInput = netStats.TxBytes @@ -132,10 +135,10 @@ func (c *Container) getMemLimit() uint64 { // previousCPU is the last value of stats.CPU.Usage.Total measured at the time previousSystem. // (now - previousSystem) is the time delta in nanoseconds, between the measurement in previousCPU // and the updated value in stats. 
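The calculateCPUPercent() hunks below only switch the stats type from containers/common's cgroups.Metrics to runc's libcontainer cgroups Stats; the computation itself stays a ratio of the container's CPU-time delta to the wall-clock delta between two samples. A tiny runnable example of that ratio (sample numbers invented):

    package main

    import (
        "fmt"
        "time"
    )

    // cpuPercent computes the ratio of container CPU time to elapsed wall time
    // between two samples, expressed as a percentage.
    func cpuPercent(prevCPU, nowCPU, prevSystem, nowSystem uint64) float64 {
        cpuDelta := float64(nowCPU - prevCPU)
        systemDelta := float64(nowSystem - prevSystem)
        if systemDelta > 0.0 && cpuDelta > 0.0 {
            return (cpuDelta / systemDelta) * 100
        }
        return 0.0
    }

    func main() {
        // Sample numbers (made up): 500ms of CPU used over a 2s window is 25%.
        cpu := uint64(500 * time.Millisecond)
        window := uint64(2 * time.Second)
        fmt.Printf("%.1f%%\n", cpuPercent(0, cpu, 0, window))
    }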
-func calculateCPUPercent(stats *cgroups.Metrics, previousCPU, now, previousSystem uint64) float64 { +func calculateCPUPercent(stats *runccgroup.Stats, previousCPU, now, previousSystem uint64) float64 { var ( cpuPercent = 0.0 - cpuDelta = float64(stats.CPU.Usage.Total - previousCPU) + cpuDelta = float64(stats.CpuStats.CpuUsage.TotalUsage - previousCPU) systemDelta = float64(now - previousSystem) ) if systemDelta > 0.0 && cpuDelta > 0.0 { @@ -145,8 +148,8 @@ func calculateCPUPercent(stats *cgroups.Metrics, previousCPU, now, previousSyste return cpuPercent } -func calculateBlockIO(stats *cgroups.Metrics) (read uint64, write uint64) { - for _, blkIOEntry := range stats.Blkio.IoServiceBytesRecursive { +func calculateBlockIO(stats *runccgroup.Stats) (read uint64, write uint64) { + for _, blkIOEntry := range stats.BlkioStats.IoServiceBytesRecursive { switch strings.ToLower(blkIOEntry.Op) { case "read": read += blkIOEntry.Value diff --git a/libpod/util_linux.go b/libpod/util_linux.go index fe98056dc..414d1bff9 100644 --- a/libpod/util_linux.go +++ b/libpod/util_linux.go @@ -11,6 +11,7 @@ import ( "github.com/containers/common/pkg/cgroups" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/pkg/rootless" + spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -20,7 +21,7 @@ import ( // systemdSliceFromPath makes a new systemd slice under the given parent with // the given name. // The parent must be a slice. The name must NOT include ".slice" -func systemdSliceFromPath(parent, name string) (string, error) { +func systemdSliceFromPath(parent, name string, resources *spec.LinuxResources) (string, error) { cgroupPath, err := assembleSystemdCgroupName(parent, name) if err != nil { return "", err @@ -28,7 +29,7 @@ func systemdSliceFromPath(parent, name string) (string, error) { logrus.Debugf("Created cgroup path %s for parent %s and name %s", cgroupPath, parent, name) - if err := makeSystemdCgroup(cgroupPath); err != nil { + if err := makeSystemdCgroup(cgroupPath, resources); err != nil { return "", errors.Wrapf(err, "error creating cgroup %s", cgroupPath) } @@ -45,8 +46,12 @@ func getDefaultSystemdCgroup() string { } // makeSystemdCgroup creates a systemd Cgroup at the given location. 
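The util_linux.go hunks below thread resource limits into systemd cgroup creation: makeSystemdCgroup() and deleteSystemdCgroup() now take *spec.LinuxResources, convert it with GetLimits() and pass the result to cgroups.NewSystemd(). GetLimits() itself, added in oci_conmon_linux.go earlier in this diff, is a field-by-field translation from the OCI spec types, whose optional fields are pointers, into runc's flat Resources struct. A reduced sketch of that translation with simplified stand-in types (only memory and pids shown):

    package sketch

    // specMemory, specPids, and specResources are simplified stand-ins for the
    // OCI spec.LinuxResources types, where optional fields are pointers;
    // flatResources stands in for runc's flat Resources struct.
    type specMemory struct {
        Limit       *int64
        Reservation *int64
    }

    type specPids struct {
        Limit int64
    }

    type specResources struct {
        Memory *specMemory
        Pids   *specPids
    }

    type flatResources struct {
        Memory            int64
        MemoryReservation int64
        PidsLimit         int64
    }

    // convert mirrors the shape of GetLimits(): dereference each optional spec
    // field only when it is set and copy it into the flat struct the cgroup
    // code consumes.
    func convert(r *specResources) flatResources {
        if r == nil {
            r = &specResources{}
        }
        out := flatResources{}
        if r.Memory != nil {
            if r.Memory.Limit != nil {
                out.Memory = *r.Memory.Limit
            }
            if r.Memory.Reservation != nil {
                out.MemoryReservation = *r.Memory.Reservation
            }
        }
        if r.Pids != nil {
            out.PidsLimit = r.Pids.Limit
        }
        return out
    }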
-func makeSystemdCgroup(path string) error { - controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup()) +func makeSystemdCgroup(path string, resources *spec.LinuxResources) error { + res, err := GetLimits(resources) + if err != nil { + return err + } + controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup(), &res) if err != nil { return err } @@ -54,12 +59,20 @@ func makeSystemdCgroup(path string) error { if rootless.IsRootless() { return controller.CreateSystemdUserUnit(path, rootless.GetRootlessUID()) } - return controller.CreateSystemdUnit(path) + err = controller.CreateSystemdUnit(path) + if err != nil { + return err + } + return nil } // deleteSystemdCgroup deletes the systemd cgroup at the given location -func deleteSystemdCgroup(path string) error { - controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup()) +func deleteSystemdCgroup(path string, resources *spec.LinuxResources) error { + res, err := GetLimits(resources) + if err != nil { + return err + } + controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup(), &res) if err != nil { return err } diff --git a/libpod/volume_internal.go b/libpod/volume_internal.go index e0ebb729d..24522c0f9 100644 --- a/libpod/volume_internal.go +++ b/libpod/volume_internal.go @@ -55,6 +55,12 @@ func (v *Volume) needsMount() bool { if _, ok := v.config.Options["NOQUOTA"]; ok { index++ } + if _, ok := v.config.Options["nocopy"]; ok { + index++ + } + if _, ok := v.config.Options["copy"]; ok { + index++ + } // when uid or gid is set there is also the "o" option // set so we have to ignore this one as well if index > 0 { |
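The final volume_internal.go hunk teaches needsMount() to also discount the "copy" and "nocopy" options: keys that libpod handles itself are subtracted from the option count, and a mount is only needed when other options remain. A self-contained sketch of that counting idea (the ignorable set here is partial and the helper is invented):

    package sketch

    // ignorable lists option keys that libpod handles itself and that therefore
    // do not force a mount; "copy" and "nocopy" are the keys added by the hunk
    // above, and uid/gid with their implied "o" option get the same treatment
    // in the real code.
    var ignorable = map[string]struct{}{
        "NOQUOTA": {},
        "copy":    {},
        "nocopy":  {},
    }

    // needsMount reports whether any option remains after discounting the
    // ignorable ones, mirroring the counting logic in volume_internal.go.
    func needsMount(options map[string]string) bool {
        remaining := len(options)
        for key := range options {
            if _, ok := ignorable[key]; ok {
                remaining--
            }
        }
        return remaining > 0
    }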