diff options
Diffstat (limited to 'libpod')
37 files changed, 929 insertions, 225 deletions
diff --git a/libpod/boltdb_state.go b/libpod/boltdb_state.go index c3db6152a..471f64b84 100644 --- a/libpod/boltdb_state.go +++ b/libpod/boltdb_state.go @@ -5,8 +5,10 @@ import ( "fmt" "net" "os" + "strconv" "strings" "sync" + "time" "github.com/containers/common/libnetwork/types" "github.com/containers/podman/v4/libpod/define" @@ -63,6 +65,13 @@ type BoltState struct { // initially created the database. This must match for any further instances // that access the database, to ensure that state mismatches with // containers/storage do not occur. +// - exitCodeBucket/exitCodeTimeStampBucket: (#14559) exit codes must be part +// of the database to resolve a previous race condition when one process waits +// for the exit file to be written and another process removes it along with +// the container during auto-removal. The same race would happen trying to +// read the exit code from the containers bucket. Hence, exit codes go into +// their own bucket. To avoid the rather expensive JSON (un)marshaling, we +// have two buckets: one for the exit codes, the other for the timestamps. // NewBoltState creates a new bolt-backed state database func NewBoltState(path string, runtime *Runtime) (State, error) { @@ -98,6 +107,8 @@ func NewBoltState(path string, runtime *Runtime) (State, error) { allVolsBkt, execBkt, runtimeConfigBkt, + exitCodeBkt, + exitCodeTimeStampBkt, } // Does the DB need an update? @@ -192,6 +203,45 @@ func (s *BoltState) Refresh() error { return err } + exitCodeBucket, err := getExitCodeBucket(tx) + if err != nil { + return err + } + + timeStampBucket, err := getExitCodeTimeStampBucket(tx) + if err != nil { + return err + } + + // Clear all exec exit codes + toRemoveExitCodes := []string{} + err = exitCodeBucket.ForEach(func(id, _ []byte) error { + toRemoveExitCodes = append(toRemoveExitCodes, string(id)) + return nil + }) + if err != nil { + return errors.Wrapf(err, "error reading exit codes bucket") + } + for _, id := range toRemoveExitCodes { + if err := exitCodeBucket.Delete([]byte(id)); err != nil { + return errors.Wrapf(err, "error removing exit code for ID %s", id) + } + } + + toRemoveTimeStamps := []string{} + err = timeStampBucket.ForEach(func(id, _ []byte) error { + toRemoveTimeStamps = append(toRemoveTimeStamps, string(id)) + return nil + }) + if err != nil { + return errors.Wrapf(err, "reading timestamps bucket") + } + for _, id := range toRemoveTimeStamps { + if err := timeStampBucket.Delete([]byte(id)); err != nil { + return errors.Wrapf(err, "removing timestamp for ID %s", id) + } + } + // Iterate through all IDs. Check if they are containers. // If they are, unmarshal their state, and then clear // PID, mountpoint, and state for all of them @@ -1341,6 +1391,204 @@ func (s *BoltState) GetContainerConfig(id string) (*ContainerConfig, error) { return config, nil } +// AddContainerExitCode adds the exit code for the specified container to the database. +func (s *BoltState) AddContainerExitCode(id string, exitCode int32) error { + if len(id) == 0 { + return define.ErrEmptyID + } + + if !s.valid { + return define.ErrDBClosed + } + + db, err := s.getDBCon() + if err != nil { + return err + } + defer s.deferredCloseDBCon(db) + + rawID := []byte(id) + rawExitCode := []byte(strconv.Itoa(int(exitCode))) + rawTimeStamp, err := time.Now().MarshalText() + if err != nil { + return fmt.Errorf("marshaling exit-code time stamp: %w", err) + } + + return db.Update(func(tx *bolt.Tx) error { + exitCodeBucket, err := getExitCodeBucket(tx) + if err != nil { + return err + } + timeStampBucket, err := getExitCodeTimeStampBucket(tx) + if err != nil { + return err + } + + if err := exitCodeBucket.Put(rawID, rawExitCode); err != nil { + return fmt.Errorf("adding exit code of container %s to DB: %w", id, err) + } + if err := timeStampBucket.Put(rawID, rawTimeStamp); err != nil { + if rmErr := exitCodeBucket.Delete(rawID); rmErr != nil { + logrus.Errorf("Removing exit code of container %s from DB: %v", id, rmErr) + } + return fmt.Errorf("adding exit-code time stamp of container %s to DB: %w", id, err) + } + + return nil + }) +} + +// GetContainerExitCode returns the exit code for the specified container. +func (s *BoltState) GetContainerExitCode(id string) (int32, error) { + if len(id) == 0 { + return -1, define.ErrEmptyID + } + + if !s.valid { + return -1, define.ErrDBClosed + } + + db, err := s.getDBCon() + if err != nil { + return -1, err + } + defer s.deferredCloseDBCon(db) + + rawID := []byte(id) + result := int32(-1) + return result, db.View(func(tx *bolt.Tx) error { + exitCodeBucket, err := getExitCodeBucket(tx) + if err != nil { + return err + } + + rawExitCode := exitCodeBucket.Get(rawID) + if rawExitCode == nil { + return fmt.Errorf("getting exit code of container %s from DB: %w", id, define.ErrNoSuchExitCode) + } + + exitCode, err := strconv.Atoi(string(rawExitCode)) + if err != nil { + return fmt.Errorf("converting raw exit code %v of container %s: %w", rawExitCode, id, err) + } + + result = int32(exitCode) + return nil + }) +} + +// GetContainerExitCodeTimeStamp returns the time stamp when the exit code of +// the specified container was added to the database. +func (s *BoltState) GetContainerExitCodeTimeStamp(id string) (*time.Time, error) { + if len(id) == 0 { + return nil, define.ErrEmptyID + } + + if !s.valid { + return nil, define.ErrDBClosed + } + + db, err := s.getDBCon() + if err != nil { + return nil, err + } + defer s.deferredCloseDBCon(db) + + rawID := []byte(id) + var result time.Time + return &result, db.View(func(tx *bolt.Tx) error { + timeStampBucket, err := getExitCodeTimeStampBucket(tx) + if err != nil { + return err + } + + rawTimeStamp := timeStampBucket.Get(rawID) + if rawTimeStamp == nil { + return fmt.Errorf("getting exit-code time stamp of container %s from DB: %w", id, define.ErrNoSuchExitCode) + } + + if err := result.UnmarshalText(rawTimeStamp); err != nil { + return fmt.Errorf("converting raw time stamp %v of container %s from DB: %w", rawTimeStamp, id, err) + } + + return nil + }) +} + +// PruneExitCodes removes exit codes older than 5 minutes. +func (s *BoltState) PruneContainerExitCodes() error { + if !s.valid { + return define.ErrDBClosed + } + + db, err := s.getDBCon() + if err != nil { + return err + } + defer s.deferredCloseDBCon(db) + + toRemoveIDs := []string{} + + threshold := time.Minute * 5 + err = db.View(func(tx *bolt.Tx) error { + timeStampBucket, err := getExitCodeTimeStampBucket(tx) + if err != nil { + return err + } + + return timeStampBucket.ForEach(func(rawID, rawTimeStamp []byte) error { + var timeStamp time.Time + if err := timeStamp.UnmarshalText(rawTimeStamp); err != nil { + return fmt.Errorf("converting raw time stamp %v of container %s from DB: %w", rawTimeStamp, string(rawID), err) + } + if time.Since(timeStamp) > threshold { + toRemoveIDs = append(toRemoveIDs, string(rawID)) + } + return nil + }) + }) + if err != nil { + return errors.Wrapf(err, "reading exit codes to prune") + } + + if len(toRemoveIDs) > 0 { + err = db.Update(func(tx *bolt.Tx) error { + exitCodeBucket, err := getExitCodeBucket(tx) + if err != nil { + return err + } + timeStampBucket, err := getExitCodeTimeStampBucket(tx) + if err != nil { + return err + } + + var finalErr error + for _, id := range toRemoveIDs { + rawID := []byte(id) + if err := exitCodeBucket.Delete(rawID); err != nil { + if finalErr != nil { + logrus.Error(finalErr) + } + finalErr = fmt.Errorf("removing exit code of container %s from DB: %w", id, err) + } + if err := timeStampBucket.Delete(rawID); err != nil { + if finalErr != nil { + logrus.Error(finalErr) + } + finalErr = fmt.Errorf("removing exit code timestamp of container %s from DB: %w", id, err) + } + } + + return finalErr + }) + if err != nil { + return errors.Wrapf(err, "pruning exit codes") + } + } + + return nil +} + // AddExecSession adds an exec session to the state. func (s *BoltState) AddExecSession(ctr *Container, session *ExecSession) error { if !s.valid { diff --git a/libpod/boltdb_state_internal.go b/libpod/boltdb_state_internal.go index 9dc333ef9..11b4aa049 100644 --- a/libpod/boltdb_state_internal.go +++ b/libpod/boltdb_state_internal.go @@ -29,6 +29,9 @@ const ( aliasesName = "aliases" runtimeConfigName = "runtime-config" + exitCodeName = "exit-code" + exitCodeTimeStampName = "exit-code-time-stamp" + configName = "config" stateName = "state" dependenciesName = "dependencies" @@ -65,6 +68,9 @@ var ( volDependenciesBkt = []byte(volCtrDependencies) networksBkt = []byte(networksName) + exitCodeBkt = []byte(exitCodeName) + exitCodeTimeStampBkt = []byte(exitCodeTimeStampName) + configKey = []byte(configName) stateKey = []byte(stateName) netNSKey = []byte(netNSName) @@ -362,6 +368,22 @@ func getRuntimeConfigBucket(tx *bolt.Tx) (*bolt.Bucket, error) { return bkt, nil } +func getExitCodeBucket(tx *bolt.Tx) (*bolt.Bucket, error) { + bkt := tx.Bucket(exitCodeBkt) + if bkt == nil { + return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code container bucket not found in DB") + } + return bkt, nil +} + +func getExitCodeTimeStampBucket(tx *bolt.Tx) (*bolt.Bucket, error) { + bkt := tx.Bucket(exitCodeTimeStampBkt) + if bkt == nil { + return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code time stamp bucket not found in DB") + } + return bkt, nil +} + func (s *BoltState) getContainerConfigFromDB(id []byte, config *ContainerConfig, ctrsBkt *bolt.Bucket) error { ctrBkt := ctrsBkt.Bucket(id) if ctrBkt == nil { diff --git a/libpod/container.go b/libpod/container.go index 04a4ae64a..3a15cfbdb 100644 --- a/libpod/container.go +++ b/libpod/container.go @@ -518,7 +518,7 @@ func (c *Container) PortMappings() ([]types.PortMapping, error) { if len(c.config.NetNsCtr) > 0 { netNsCtr, err := c.runtime.GetContainer(c.config.NetNsCtr) if err != nil { - return nil, errors.Wrapf(err, "unable to lookup network namespace for container %s", c.ID()) + return nil, errors.Wrapf(err, "unable to look up network namespace for container %s", c.ID()) } return netNsCtr.PortMappings() } @@ -657,7 +657,7 @@ func (c *Container) Hostname() string { utsNsCtr, err := c.runtime.GetContainer(c.config.UTSNsCtr) if err != nil { // should we return an error here? - logrus.Errorf("unable to lookup uts namespace for container %s: %v", c.ID(), err) + logrus.Errorf("unable to look up uts namespace for container %s: %v", c.ID(), err) return "" } return utsNsCtr.Hostname() diff --git a/libpod/container_api.go b/libpod/container_api.go index b064d3528..f35cce772 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -2,6 +2,7 @@ package libpod import ( "context" + "fmt" "io" "io/ioutil" "net/http" @@ -490,41 +491,84 @@ func (c *Container) RemoveArtifact(name string) error { // Wait blocks until the container exits and returns its exit code. func (c *Container) Wait(ctx context.Context) (int32, error) { - return c.WaitWithInterval(ctx, DefaultWaitInterval) + return c.WaitForExit(ctx, DefaultWaitInterval) } -// WaitWithInterval blocks until the container to exit and returns its exit -// code. The argument is the interval at which checks the container's status. -func (c *Container) WaitWithInterval(ctx context.Context, waitTimeout time.Duration) (int32, error) { +// WaitForExit blocks until the container exits and returns its exit code. The +// argument is the interval at which checks the container's status. +func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) (int32, error) { if !c.valid { return -1, define.ErrCtrRemoved } - exitFile, err := c.exitFilePath() - if err != nil { - return -1, err - } - chWait := make(chan error, 1) + id := c.ID() + var conmonTimer time.Timer + conmonTimerSet := false - go func() { - <-ctx.Done() - chWait <- define.ErrCanceled - }() + getExitCode := func() (bool, int32, error) { + containerRemoved := false + if !c.batched { + c.lock.Lock() + defer c.lock.Unlock() + } - for { - // ignore errors here (with exception of cancellation), it is only used to avoid waiting - // too long. - _, e := WaitForFile(exitFile, chWait, waitTimeout) - if e == define.ErrCanceled { - return -1, define.ErrCanceled + if err := c.syncContainer(); err != nil { + if !errors.Is(err, define.ErrNoSuchCtr) { + return false, -1, err + } + containerRemoved = true + } + + // If conmon is not alive anymore set a timer to make sure + // we're returning even if conmon has forcefully been killed. + if !conmonTimerSet && !containerRemoved { + conmonAlive, err := c.ociRuntime.CheckConmonRunning(c) + switch { + case errors.Is(err, define.ErrNoSuchCtr): + containerRemoved = true + case err != nil: + return false, -1, err + case !conmonAlive: + timerDuration := time.Second * 20 + conmonTimer = *time.NewTimer(timerDuration) + conmonTimerSet = true + } + } + + if !containerRemoved { + // If conmon is dead for more than $timerDuration or if the + // container has exited properly, try to look up the exit code. + select { + case <-conmonTimer.C: + logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id) + default: + if !c.ensureState(define.ContainerStateExited, define.ContainerStateConfigured) { + return false, -1, nil + } + } + } + + exitCode, err := c.runtime.state.GetContainerExitCode(id) + if err != nil { + return true, -1, err } - stopped, code, err := c.isStopped() + return true, exitCode, nil + } + + for { + hasExited, exitCode, err := getExitCode() + if hasExited { + return exitCode, err + } if err != nil { return -1, err } - if stopped { - return code, nil + select { + case <-ctx.Done(): + return -1, fmt.Errorf("waiting for exit code of container %s canceled", id) + default: + time.Sleep(pollInterval) } } } @@ -551,11 +595,12 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou wantedStates := make(map[define.ContainerStatus]bool, len(conditions)) for _, condition := range conditions { - if condition == define.ContainerStateStopped || condition == define.ContainerStateExited { + switch condition { + case define.ContainerStateExited, define.ContainerStateStopped: waitForExit = true - continue + default: + wantedStates[condition] = true } - wantedStates[condition] = true } trySend := func(code int32, err error) { @@ -572,7 +617,7 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou go func() { defer wg.Done() - code, err := c.WaitWithInterval(ctx, waitTimeout) + code, err := c.WaitForExit(ctx, waitTimeout) trySend(code, err) }() } @@ -621,6 +666,15 @@ func (c *Container) Cleanup(ctx context.Context) error { defer c.lock.Unlock() if err := c.syncContainer(); err != nil { + switch errors.Cause(err) { + // When the container has already been removed, the OCI runtime directory remain. + case define.ErrNoSuchCtr, define.ErrCtrRemoved: + if err := c.cleanupRuntime(ctx); err != nil { + return errors.Wrapf(err, "error cleaning up container %s from OCI runtime", c.ID()) + } + default: + logrus.Errorf("Syncing container %s status: %v", c.ID(), err) + } return err } } diff --git a/libpod/container_config.go b/libpod/container_config.go index 6558f3c89..544c45a8c 100644 --- a/libpod/container_config.go +++ b/libpod/container_config.go @@ -194,7 +194,7 @@ type ContainerSecurityConfig struct { // If not explicitly set, an unused random MLS label will be assigned by // containers/storage (but only if SELinux is enabled). MountLabel string `json:"MountLabel,omitempty"` - // LabelOpts are options passed in by the user to setup SELinux labels. + // LabelOpts are options passed in by the user to set up SELinux labels. // These are used by the containers/storage library. LabelOpts []string `json:"labelopts,omitempty"` // User and group to use in the container. Can be specified as only user @@ -386,7 +386,7 @@ type ContainerMiscConfig struct { IsService bool `json:"isService"` // SdNotifyMode tells libpod what to do with a NOTIFY_SOCKET if passed SdNotifyMode string `json:"sdnotifyMode,omitempty"` - // Systemd tells libpod to setup the container in systemd mode, a value of nil denotes false + // Systemd tells libpod to set up the container in systemd mode, a value of nil denotes false Systemd *bool `json:"systemd,omitempty"` // HealthCheckConfig has the health check command and related timings HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"` @@ -424,7 +424,6 @@ type InfraInherit struct { CapDrop []string `json:"cap_drop,omitempty"` HostDeviceList []spec.LinuxDevice `json:"host_device_list,omitempty"` ImageVolumes []*specgen.ImageVolume `json:"image_volumes,omitempty"` - InfraResources *spec.LinuxResources `json:"resource_limits,omitempty"` Mounts []spec.Mount `json:"mounts,omitempty"` NoNewPrivileges bool `json:"no_new_privileges,omitempty"` OverlayVolumes []*specgen.OverlayVolume `json:"overlay_volumes,omitempty"` @@ -432,4 +431,10 @@ type InfraInherit struct { SeccompProfilePath string `json:"seccomp_profile_path,omitempty"` SelinuxOpts []string `json:"selinux_opts,omitempty"` Volumes []*specgen.NamedVolume `json:"volumes,omitempty"` + ShmSize *int64 `json:"shm_size"` +} + +// IsDefaultShmSize determines if the user actually set the shm in the parent ctr or if it has been set to the default size +func (inherit *InfraInherit) IsDefaultShmSize() bool { + return inherit.ShmSize == nil || *inherit.ShmSize == 65536000 } diff --git a/libpod/container_copy_linux.go b/libpod/container_copy_linux.go index 9528cd06b..6835b2f1f 100644 --- a/libpod/container_copy_linux.go +++ b/libpod/container_copy_linux.go @@ -94,6 +94,7 @@ func (c *Container) copyFromArchive(path string, chown, noOverwriteDirNonDir boo ChownDirs: idPair, ChownFiles: idPair, NoOverwriteDirNonDir: noOverwriteDirNonDir, + NoOverwriteNonDirDir: noOverwriteDirNonDir, Rename: rename, } diff --git a/libpod/container_exec.go b/libpod/container_exec.go index 1e8fce4da..b112273d0 100644 --- a/libpod/container_exec.go +++ b/libpod/container_exec.go @@ -79,11 +79,11 @@ type ExecConfig struct { type ExecSession struct { // Id is the ID of the exec session. // Named somewhat strangely to not conflict with ID(). - // nolint:stylecheck,revive + //nolint:stylecheck,revive Id string `json:"id"` // ContainerId is the ID of the container this exec session belongs to. // Named somewhat strangely to not conflict with ContainerID(). - // nolint:stylecheck,revive + //nolint:stylecheck,revive ContainerId string `json:"containerId"` // State is the state of the exec session. @@ -277,9 +277,13 @@ func (c *Container) ExecStart(sessionID string) error { return c.save() } +func (c *Container) ExecStartAndAttach(sessionID string, streams *define.AttachStreams, newSize *define.TerminalSize) error { + return c.execStartAndAttach(sessionID, streams, newSize, false) +} + // ExecStartAndAttach starts and attaches to an exec session in a container. // newSize resizes the tty to this size before the process is started, must be nil if the exec session has no tty -func (c *Container) ExecStartAndAttach(sessionID string, streams *define.AttachStreams, newSize *define.TerminalSize) error { +func (c *Container) execStartAndAttach(sessionID string, streams *define.AttachStreams, newSize *define.TerminalSize, isHealthcheck bool) error { if !c.batched { c.lock.Lock() defer c.lock.Unlock() @@ -315,7 +319,12 @@ func (c *Container) ExecStartAndAttach(sessionID string, streams *define.AttachS return err } - c.newContainerEvent(events.Exec) + if isHealthcheck { + c.newContainerEvent(events.HealthStatus) + } else { + c.newContainerEvent(events.Exec) + } + logrus.Debugf("Successfully started exec session %s in container %s", session.ID(), c.ID()) var lastErr error @@ -743,10 +752,14 @@ func (c *Container) ExecResize(sessionID string, newSize define.TerminalSize) er return c.ociRuntime.ExecAttachResize(c, sessionID, newSize) } +func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan define.TerminalSize) (int, error) { + return c.exec(config, streams, resize, false) +} + // Exec emulates the old Libpod exec API, providing a single call to create, // run, and remove an exec session. Returns exit code and error. Exit code is // not guaranteed to be set sanely if error is not nil. -func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan define.TerminalSize) (int, error) { +func (c *Container) exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan define.TerminalSize, isHealthcheck bool) (int, error) { sessionID, err := c.ExecCreate(config) if err != nil { return -1, err @@ -780,7 +793,7 @@ func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resi }() } - if err := c.ExecStartAndAttach(sessionID, streams, size); err != nil { + if err := c.execStartAndAttach(sessionID, streams, size, isHealthcheck); err != nil { return -1, err } diff --git a/libpod/container_internal.go b/libpod/container_internal.go index fd451f9ef..3b01ee6c8 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -21,6 +21,7 @@ import ( "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/chown" "github.com/containers/common/pkg/config" + cutil "github.com/containers/common/pkg/util" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/ctime" @@ -219,7 +220,7 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error { // Write an event for the container's death c.newContainerExitedEvent(c.state.ExitCode) - return nil + return c.runtime.state.AddContainerExitCode(c.ID(), c.state.ExitCode) } func (c *Container) shouldRestart() bool { @@ -290,7 +291,7 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err return false, err } - // setup slirp4netns again because slirp4netns will die when conmon exits + // set up slirp4netns again because slirp4netns will die when conmon exits if c.config.NetMode.IsSlirp4netns() { err := c.runtime.setupSlirp4netns(c, c.state.NetNS) if err != nil { @@ -298,7 +299,7 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err } } - // setup rootlesskit port forwarder again since it dies when conmon exits + // set up rootlesskit port forwarder again since it dies when conmon exits // we use rootlesskit port forwarder only as rootless and when bridge network is used if rootless.IsRootless() && c.config.NetMode.IsBridge() && len(c.config.PortMappings) > 0 { err := c.runtime.setupRootlessPortMappingViaRLK(c, c.state.NetNS.Path(), c.state.NetworkStatus) @@ -589,7 +590,7 @@ func (c *Container) teardownStorage() error { } if err := c.cleanupStorage(); err != nil { - return errors.Wrapf(err, "failed to cleanup container %s storage", c.ID()) + return errors.Wrapf(err, "failed to clean up container %s storage", c.ID()) } if err := c.runtime.storageService.DeleteContainer(c.ID()); err != nil { @@ -784,20 +785,6 @@ func (c *Container) getArtifactPath(name string) string { return filepath.Join(c.config.StaticDir, artifactsDir, name) } -// Used with Wait() to determine if a container has exited -func (c *Container) isStopped() (bool, int32, error) { - if !c.batched { - c.lock.Lock() - defer c.lock.Unlock() - } - err := c.syncContainer() - if err != nil { - return true, -1, err - } - - return !c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused, define.ContainerStateStopping), c.state.ExitCode, nil -} - // save container state to the database func (c *Container) save() error { if err := c.runtime.state.SaveContainer(c); err != nil { @@ -1282,13 +1269,6 @@ func (c *Container) stop(timeout uint) error { } } - // Check if conmon is still alive. - // If it is not, we won't be getting an exit file. - conmonAlive, err := c.ociRuntime.CheckConmonRunning(c) - if err != nil { - return err - } - // Set the container state to "stopping" and unlock the container // before handing it over to conmon to unblock other commands. #8501 // demonstrates nicely that a high stop timeout will block even simple @@ -1309,8 +1289,9 @@ func (c *Container) stop(timeout uint) error { if err := c.syncContainer(); err != nil { switch errors.Cause(err) { // If the container has already been removed (e.g., via - // the cleanup process), there's nothing left to do. + // the cleanup process), set the container state to "stopped". case define.ErrNoSuchCtr, define.ErrCtrRemoved: + c.state.State = define.ContainerStateStopped return stopErr default: if stopErr != nil { @@ -1341,21 +1322,18 @@ func (c *Container) stop(timeout uint) error { } c.newContainerEvent(events.Stop) - - c.state.PID = 0 - c.state.ConmonPID = 0 c.state.StoppedByUser = true + conmonAlive, err := c.ociRuntime.CheckConmonRunning(c) + if err != nil { + return err + } if !conmonAlive { - // Conmon is dead, so we can't expect an exit code. - c.state.ExitCode = -1 - c.state.FinishedTime = time.Now() - c.state.State = define.ContainerStateStopped - if err := c.save(); err != nil { - logrus.Errorf("Saving container %s status: %v", c.ID(), err) + if err := c.checkExitFile(); err != nil { + return err } - return errors.Wrapf(define.ErrConmonDead, "container %s conmon process missing, cannot retrieve exit code", c.ID()) + return c.save() } if err := c.save(); err != nil { @@ -1663,30 +1641,16 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) if err := vol.update(); err != nil { return nil, err } - if vol.state.NeedsCopyUp { + _, hasNoCopy := vol.config.Options["nocopy"] + if vol.state.NeedsCopyUp && !cutil.StringInSlice("nocopy", v.Options) && !hasNoCopy { logrus.Debugf("Copying up contents from container %s to volume %s", c.ID(), vol.Name()) - // If the volume is not empty, we should not copy up. - volMount := vol.mountPoint() - contents, err := ioutil.ReadDir(volMount) - if err != nil { - return nil, errors.Wrapf(err, "error listing contents of volume %s mountpoint when copying up from container %s", vol.Name(), c.ID()) - } - if len(contents) > 0 { - // The volume is not empty. It was likely modified - // outside of Podman. For safety, let's not copy up into - // it. Fixes CVE-2020-1726. - return vol, nil - } - srcDir, err := securejoin.SecureJoin(mountpoint, v.Dest) if err != nil { return nil, errors.Wrapf(err, "error calculating destination path to copy up container %s volume %s", c.ID(), vol.Name()) } // Do a manual stat on the source directory to verify existence. // Skip the rest if it exists. - // TODO: Should this be stat or lstat? I'm using lstat because I - // think copy-up doesn't happen when the source is a link. srcStat, err := os.Lstat(srcDir) if err != nil { if os.IsNotExist(err) { @@ -1712,6 +1676,19 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string) return vol, nil } + // If the volume is not empty, we should not copy up. + volMount := vol.mountPoint() + contents, err := ioutil.ReadDir(volMount) + if err != nil { + return nil, errors.Wrapf(err, "error listing contents of volume %s mountpoint when copying up from container %s", vol.Name(), c.ID()) + } + if len(contents) > 0 { + // The volume is not empty. It was likely modified + // outside of Podman. For safety, let's not copy up into + // it. Fixes CVE-2020-1726. + return vol, nil + } + // Set NeedsCopyUp to false since we are about to do first copy // Do not copy second time. vol.state.NeedsCopyUp = false @@ -1784,7 +1761,7 @@ func (c *Container) cleanupStorage() error { overlayBasePath := filepath.Dir(c.state.Mountpoint) if err := overlay.Unmount(overlayBasePath); err != nil { if cleanupErr != nil { - logrus.Errorf("Failed to cleanup overlay mounts for %s: %v", c.ID(), err) + logrus.Errorf("Failed to clean up overlay mounts for %s: %v", c.ID(), err) } cleanupErr = err } @@ -1801,7 +1778,7 @@ func (c *Container) cleanupStorage() error { if err := c.cleanupOverlayMounts(); err != nil { // If the container can't remove content report the error - logrus.Errorf("Failed to cleanup overlay mounts for %s: %v", c.ID(), err) + logrus.Errorf("Failed to clean up overlay mounts for %s: %v", c.ID(), err) cleanupErr = err } @@ -1880,7 +1857,7 @@ func (c *Container) cleanup(ctx context.Context) error { // we cannot use the dependency container lock due ABBA deadlocks if lock, err := lockfile.GetLockfile(hoststFile); err == nil { lock.Lock() - // make sure to ignore ENOENT error in case the netns container was cleanup before this one + // make sure to ignore ENOENT error in case the netns container was cleaned up before this one if err := etchosts.Remove(hoststFile, getLocalhostHostEntry(c)); err != nil && !errors.Is(err, os.ErrNotExist) { // this error is not fatal we still want to do proper cleanup logrus.Errorf("failed to remove hosts entry from the netns containers /etc/hosts: %v", err) @@ -1939,6 +1916,18 @@ func (c *Container) cleanup(ctx context.Context) error { } } + // Prune the exit codes of other container during clean up. + // Since Podman is no daemon, we have to clean them up somewhere. + // Cleanup seems like a good place as it's not performance + // critical. + if err := c.runtime.state.PruneContainerExitCodes(); err != nil { + if lastError == nil { + lastError = err + } else { + logrus.Errorf("Pruning container exit codes: %v", err) + } + } + return lastError } diff --git a/libpod/container_internal_linux.go b/libpod/container_internal_linux.go index 41c0ac595..0f4bf0f55 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -311,7 +311,7 @@ func (c *Container) cleanupNetwork() error { // Stop the container's network namespace (if it has one) if err := c.runtime.teardownNetNS(c); err != nil { - logrus.Errorf("Unable to cleanup network for container %s: %q", c.ID(), err) + logrus.Errorf("Unable to clean up network for container %s: %q", c.ID(), err) } c.state.NetNS = nil @@ -367,7 +367,7 @@ func (c *Container) getUserOverrides() *lookup.Overrides { func lookupHostUser(name string) (*runcuser.ExecUser, error) { var execUser runcuser.ExecUser - // Lookup User on host + // Look up User on host u, err := util.LookupUser(name) if err != nil { return &execUser, err @@ -870,6 +870,7 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) { if err != nil { return nil, err } + g.SetLinuxCgroupsPath(cgroupPath) // Warning: CDI may alter g.Config in place. @@ -1141,7 +1142,7 @@ func (c *Container) addCheckpointImageMetadata(importBuilder *buildah.Builder) e return fmt.Errorf("getting host info: %v", err) } - criuVersion, err := criu.GetCriuVestion() + criuVersion, err := criu.GetCriuVersion() if err != nil { return fmt.Errorf("getting criu version: %v", err) } @@ -1210,7 +1211,7 @@ func (c *Container) createCheckpointImage(ctx context.Context, options Container if err != nil { return err } - // Clean-up buildah working container + // Clean up buildah working container defer func() { if err := importBuilder.Delete(); err != nil { logrus.Errorf("Image builder delete failed: %v", err) @@ -1504,7 +1505,7 @@ func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointO c.state.Restored = false c.state.RestoredTime = time.Time{} - // Cleanup Storage and Network + // Clean up Storage and Network if err := c.cleanup(ctx); err != nil { return nil, 0, err } @@ -2249,8 +2250,19 @@ func (c *Container) makeBindMounts() error { } } + _, hasRunContainerenv := c.state.BindMounts["/run/.containerenv"] + if !hasRunContainerenv { + // check in the spec mounts + for _, m := range c.config.Spec.Mounts { + if m.Destination == "/run/.containerenv" || m.Destination == "/run" { + hasRunContainerenv = true + break + } + } + } + // Make .containerenv if it does not exist - if _, ok := c.state.BindMounts["/run/.containerenv"]; !ok { + if !hasRunContainerenv { containerenv := c.runtime.graphRootMountedFlag(c.config.Spec.Mounts) isRootless := 0 if rootless.IsRootless() { @@ -2589,13 +2601,13 @@ func (c *Container) generateCurrentUserGroupEntry() (string, int, error) { return "", 0, errors.Wrapf(err, "failed to get current group") } - // Lookup group name to see if it exists in the image. + // Look up group name to see if it exists in the image. _, err = lookup.GetGroup(c.state.Mountpoint, g.Name) if err != runcuser.ErrNoGroupEntries { return "", 0, err } - // Lookup GID to see if it exists in the image. + // Look up GID to see if it exists in the image. _, err = lookup.GetGroup(c.state.Mountpoint, g.Gid) if err != runcuser.ErrNoGroupEntries { return "", 0, err @@ -2632,7 +2644,7 @@ func (c *Container) generateUserGroupEntry(addedGID int) (string, error) { gid, err := strconv.ParseUint(group, 10, 32) if err != nil { - return "", nil // nolint: nilerr + return "", nil //nolint: nilerr } if addedGID != 0 && addedGID == int(gid) { @@ -2665,7 +2677,7 @@ func (c *Container) generatePasswdEntry() (string, error) { addedUID := 0 for _, userid := range c.config.HostUsers { - // Lookup User on host + // Look up User on host u, err := util.LookupUser(userid) if err != nil { return "", err @@ -2717,13 +2729,13 @@ func (c *Container) generateCurrentUserPasswdEntry() (string, int, int, error) { } func (c *Container) userPasswdEntry(u *user.User) (string, error) { - // Lookup the user to see if it exists in the container image. + // Look up the user to see if it exists in the container image. _, err := lookup.GetUser(c.state.Mountpoint, u.Username) if err != runcuser.ErrNoPasswdEntries { return "", err } - // Lookup the UID to see if it exists in the container image. + // Look up the UID to see if it exists in the container image. _, err = lookup.GetUser(c.state.Mountpoint, u.Uid) if err != runcuser.ErrNoPasswdEntries { return "", err @@ -2788,14 +2800,14 @@ func (c *Container) generateUserPasswdEntry(addedUID int) (string, error) { // If a non numeric User, then don't generate passwd uid, err := strconv.ParseUint(userspec, 10, 32) if err != nil { - return "", nil // nolint: nilerr + return "", nil //nolint: nilerr } if addedUID != 0 && int(uid) == addedUID { return "", nil } - // Lookup the user to see if it exists in the container image + // Look up the user to see if it exists in the container image _, err = lookup.GetUser(c.state.Mountpoint, userspec) if err != runcuser.ErrNoPasswdEntries { return "", err @@ -3213,7 +3225,7 @@ func (c *Container) fixVolumePermissions(v *ContainerNamedVolume) error { return err } stat := st.Sys().(*syscall.Stat_t) - atime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec)) // nolint: unconvert + atime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec)) //nolint: unconvert if err := os.Chtimes(mountPoint, atime, st.ModTime()); err != nil { return err } diff --git a/libpod/container_log_linux.go b/libpod/container_log_linux.go index deb726526..7f90332c7 100644 --- a/libpod/container_log_linux.go +++ b/libpod/container_log_linux.go @@ -292,11 +292,12 @@ func formatterPrefix(entry *sdjournal.JournalEntry) (string, error) { if !ok { return "", errors.Errorf("no PRIORITY field present in journal entry") } - if priority == journaldLogOut { + switch priority { + case journaldLogOut: output += "stdout " - } else if priority == journaldLogErr { + case journaldLogErr: output += "stderr " - } else { + default: return "", errors.Errorf("unexpected PRIORITY field in journal entry") } diff --git a/libpod/define/container_inspect.go b/libpod/define/container_inspect.go index e7b82d654..ccc4ae00f 100644 --- a/libpod/define/container_inspect.go +++ b/libpod/define/container_inspect.go @@ -259,9 +259,7 @@ type HealthCheckLog struct { // as possible from the spec and container config. // Some things cannot be inferred. These will be populated by spec annotations // (if available). -// Field names are fixed for compatibility and cannot be changed. -// As such, silence lint warnings about them. -//nolint +//nolint:revive,stylecheck // Field names are fixed for compatibility and cannot be changed. type InspectContainerHostConfig struct { // Binds contains an array of user-added mounts. // Both volume mounts and named volumes are included. diff --git a/libpod/define/errors.go b/libpod/define/errors.go index f5a7c73e5..9757a85b1 100644 --- a/libpod/define/errors.go +++ b/libpod/define/errors.go @@ -24,6 +24,10 @@ var ( // not exist. ErrNoSuchExecSession = errors.New("no such exec session") + // ErrNoSuchExitCode indicates that the requested container exit code + // does not exist. + ErrNoSuchExitCode = errors.New("no such exit code") + // ErrDepExists indicates that the current object has dependencies and // cannot be removed before them. ErrDepExists = errors.New("dependency exists") diff --git a/libpod/define/volume_inspect.go b/libpod/define/volume_inspect.go index 4b91c3ece..f731a8735 100644 --- a/libpod/define/volume_inspect.go +++ b/libpod/define/volume_inspect.go @@ -59,3 +59,9 @@ type InspectVolumeData struct { // Timeout is the specified driver timeout if given Timeout int `json:"Timeout,omitempty"` } + +type VolumeReload struct { + Added []string + Removed []string + Errors []error +} diff --git a/libpod/events.go b/libpod/events.go index f09d8402a..bb50df92d 100644 --- a/libpod/events.go +++ b/libpod/events.go @@ -33,6 +33,16 @@ func (c *Container) newContainerEvent(status events.Status) { Attributes: c.Labels(), } + // if the current event is a HealthStatus event, we need to get the current + // status of the container to pass to the event + if status == events.HealthStatus { + containerHealthStatus, err := c.healthCheckStatus() + if err != nil { + e.HealthStatus = fmt.Sprintf("%v", err) + } + e.HealthStatus = containerHealthStatus + } + if err := c.runtime.eventer.Write(e); err != nil { logrus.Errorf("Unable to write pod event: %q", err) } @@ -151,6 +161,9 @@ func (r *Runtime) GetEvents(ctx context.Context, filters []string) ([]*events.Ev // GetLastContainerEvent takes a container name or ID and an event status and returns // the last occurrence of the container event func (r *Runtime) GetLastContainerEvent(ctx context.Context, nameOrID string, containerEvent events.Status) (*events.Event, error) { + // FIXME: events should be read in reverse order! + // https://github.com/containers/podman/issues/14579 + // check to make sure the event.Status is valid if _, err := events.StringToStatus(containerEvent.String()); err != nil { return nil, err diff --git a/libpod/events/config.go b/libpod/events/config.go index 2e7016136..a678baa2d 100644 --- a/libpod/events/config.go +++ b/libpod/events/config.go @@ -40,6 +40,8 @@ type Event struct { Time time.Time // Type of event that occurred Type Type + // Health status of the current container + HealthStatus string `json:"health_status,omitempty"` Details } @@ -141,6 +143,8 @@ const ( Exited Status = "died" // Export ... Export Status = "export" + // HealthStatus ... + HealthStatus Status = "health_status" // History ... History Status = "history" // Import ... diff --git a/libpod/events/events.go b/libpod/events/events.go index a30e0f1ca..a8001ab95 100644 --- a/libpod/events/events.go +++ b/libpod/events/events.go @@ -76,7 +76,7 @@ func (e *Event) ToHumanReadable(truncate bool) string { } switch e.Type { case Container, Pod: - humanFormat = fmt.Sprintf("%s %s %s %s (image=%s, name=%s", e.Time, e.Type, e.Status, id, e.Image, e.Name) + humanFormat = fmt.Sprintf("%s %s %s %s (image=%s, name=%s, health_status=%s", e.Time, e.Type, e.Status, id, e.Image, e.Name, e.HealthStatus) // check if the container has labels and add it to the output if len(e.Attributes) > 0 { for k, v := range e.Attributes { @@ -168,6 +168,8 @@ func StringToStatus(name string) (Status, error) { return Exited, nil case Export.String(): return Export, nil + case HealthStatus.String(): + return HealthStatus, nil case History.String(): return History, nil case Import.String(): diff --git a/libpod/events/journal_linux.go b/libpod/events/journal_linux.go index 866042a4c..036638d34 100644 --- a/libpod/events/journal_linux.go +++ b/libpod/events/journal_linux.go @@ -58,13 +58,14 @@ func (e EventJournalD) Write(ee Event) error { } m["PODMAN_LABELS"] = string(b) } + m["PODMAN_HEALTH_STATUS"] = ee.HealthStatus case Network: m["PODMAN_ID"] = ee.ID m["PODMAN_NETWORK_NAME"] = ee.Network case Volume: m["PODMAN_NAME"] = ee.Name } - return journal.Send(string(ee.ToHumanReadable(false)), journal.PriInfo, m) + return journal.Send(ee.ToHumanReadable(false), journal.PriInfo, m) } // Read reads events from the journal and sends qualified events to the event channel @@ -167,10 +168,9 @@ func (e EventJournalD) Read(ctx context.Context, options ReadOptions) error { } } return nil - } -func newEventFromJournalEntry(entry *sdjournal.JournalEntry) (*Event, error) { //nolint +func newEventFromJournalEntry(entry *sdjournal.JournalEntry) (*Event, error) { newEvent := Event{} eventType, err := StringToType(entry.Fields["PODMAN_TYPE"]) if err != nil { @@ -214,6 +214,7 @@ func newEventFromJournalEntry(entry *sdjournal.JournalEntry) (*Event, error) { / newEvent.Details = Details{Attributes: labels} } } + newEvent.HealthStatus = entry.Fields["PODMAN_HEALTH_STATUS"] case Network: newEvent.ID = entry.Fields["PODMAN_ID"] newEvent.Network = entry.Fields["PODMAN_NETWORK_NAME"] diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 40af9aec3..95c70b60e 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -26,7 +26,7 @@ const ( func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { container, err := r.LookupContainer(name) if err != nil { - return define.HealthCheckContainerNotFound, errors.Wrapf(err, "unable to lookup %s to perform a health check", name) + return define.HealthCheckContainerNotFound, errors.Wrapf(err, "unable to look up %s to perform a health check", name) } hcStatus, err := checkHealthCheckCanBeRun(container) if err == nil { @@ -90,7 +90,7 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) { hcResult := define.HealthCheckSuccess config := new(ExecConfig) config.Command = newCommand - exitCode, hcErr := c.Exec(config, streams, nil) + exitCode, hcErr := c.exec(config, streams, nil, true) if hcErr != nil { errCause := errors.Cause(hcErr) hcResult = define.HealthCheckFailure @@ -232,18 +232,27 @@ func (c *Container) getHealthCheckLog() (define.HealthCheckResults, error) { // HealthCheckStatus returns the current state of a container with a healthcheck func (c *Container) HealthCheckStatus() (string, error) { + c.lock.Lock() + defer c.lock.Unlock() + return c.healthCheckStatus() +} + +// Internal function to return the current state of a container with a healthcheck. +// This function does not lock the container. +func (c *Container) healthCheckStatus() (string, error) { if !c.HasHealthCheck() { return "", errors.Errorf("container %s has no defined healthcheck", c.ID()) } - c.lock.Lock() - defer c.lock.Unlock() + if err := c.syncContainer(); err != nil { return "", err } + results, err := c.getHealthCheckLog() if err != nil { return "", errors.Wrapf(err, "unable to get healthcheck log for %s", c.ID()) } + return results.Status, nil } diff --git a/libpod/kube.go b/libpod/kube.go index 20c4612d1..bd4230d66 100644 --- a/libpod/kube.go +++ b/libpod/kube.go @@ -43,8 +43,8 @@ func GenerateForKube(ctx context.Context, ctrs []*Container) (*v1.Pod, error) { func (p *Pod) GenerateForKube(ctx context.Context) (*v1.Pod, []v1.ServicePort, error) { // Generate the v1.Pod yaml description var ( - ports []v1.ContainerPort //nolint - servicePorts []v1.ServicePort //nolint + ports []v1.ContainerPort + servicePorts []v1.ServicePort ) allContainers, err := p.allContainers() diff --git a/libpod/lock/file/file_lock.go b/libpod/lock/file/file_lock.go index 4685872b6..145aa6e26 100644 --- a/libpod/lock/file/file_lock.go +++ b/libpod/lock/file/file_lock.go @@ -14,7 +14,7 @@ import ( // FileLocks is a struct enabling POSIX lock locking in a shared memory // segment. -type FileLocks struct { // nolint +type FileLocks struct { //nolint:revive // struct name stutters lockPath string valid bool } diff --git a/libpod/lock/shm/shm_lock.go b/libpod/lock/shm/shm_lock.go index c7f4d1bc5..6eaf37e48 100644 --- a/libpod/lock/shm/shm_lock.go +++ b/libpod/lock/shm/shm_lock.go @@ -28,7 +28,7 @@ var ( // SHMLocks is a struct enabling POSIX semaphore locking in a shared memory // segment. -type SHMLocks struct { // nolint +type SHMLocks struct { lockStruct *C.shm_struct_t maxLocks uint32 valid bool diff --git a/libpod/networking_linux.go b/libpod/networking_linux.go index ee80b00fe..a83423c9f 100644 --- a/libpod/networking_linux.go +++ b/libpod/networking_linux.go @@ -109,7 +109,7 @@ func (r *RootlessNetNS) getPath(path string) string { func (r *RootlessNetNS) Do(toRun func() error) error { err := r.ns.Do(func(_ ns.NetNS) error { // Before we can run the given function, - // we have to setup all mounts correctly. + // we have to set up all mounts correctly. // The order of the mounts is IMPORTANT. // The idea of the extra mount ns is to make /run and /var/lib/cni writeable @@ -291,7 +291,7 @@ func (r *RootlessNetNS) Do(toRun func() error) error { return err } -// Cleanup the rootless network namespace if needed. +// Clean up the rootless network namespace if needed. // It checks if we have running containers with the bridge network mode. // Cleanup() expects that r.Lock is locked func (r *RootlessNetNS) Cleanup(runtime *Runtime) error { @@ -419,7 +419,7 @@ func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { if err != nil { return nil, errors.Wrap(err, "error creating rootless network namespace") } - // setup slirp4netns here + // set up slirp4netns here path := r.config.Engine.NetworkCmdPath if path == "" { var err error @@ -656,9 +656,9 @@ func (r *Runtime) configureNetNS(ctr *Container, ctrNS ns.NetNS) (status map[str return nil, err } - // setup rootless port forwarder when rootless with ports and the network status is empty, + // set up rootless port forwarder when rootless with ports and the network status is empty, // if this is called from network reload the network status will not be empty and we should - // not setup port because they are still active + // not set up port because they are still active if rootless.IsRootless() && len(ctr.config.PortMappings) > 0 && ctr.getNetworkStatus() == nil { // set up port forwarder for rootless netns netnsPath := ctrNS.Path() @@ -783,7 +783,7 @@ func (r *Runtime) teardownNetwork(ns string, opts types.NetworkOptions) error { // execute the cni setup in the rootless net ns err = rootlessNetNS.Do(tearDownPod) if cerr := rootlessNetNS.Cleanup(r); cerr != nil { - logrus.WithError(err).Error("failed to cleanup rootless netns") + logrus.WithError(err).Error("failed to clean up rootless netns") } rootlessNetNS.Lock.Unlock() } else { diff --git a/libpod/oci_conmon_attach_linux.go b/libpod/oci_conmon_attach_linux.go index 155a8fbc3..26f9ba083 100644 --- a/libpod/oci_conmon_attach_linux.go +++ b/libpod/oci_conmon_attach_linux.go @@ -120,7 +120,7 @@ func (r *ConmonOCIRuntime) Attach(c *Container, params *AttachOptions) error { // conmon will then send the exit code of the exec process, or an error in the exec session // startFd must be the input side of the fd. // newSize resizes the tty to this size before the process is started, must be nil if the exec session has no tty -// conmon will wait to start the exec session until the parent process has setup the console socket. +// conmon will wait to start the exec session until the parent process has set up the console socket. // Once attachToExec successfully attaches to the console socket, the child conmon process responsible for calling runtime exec // will read from the output side of start fd, thus learning to start the child process. // Thus, the order goes as follow: diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index 0c1ee61d3..7a9ae7ee5 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -23,6 +23,9 @@ import ( "text/template" "time" + runcconfig "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/config" conmonConfig "github.com/containers/conmon/runner/config" @@ -264,11 +267,6 @@ func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *Conta // status, but will instead only check for the existence of the conmon exit file // and update state to stopped if it exists. func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { - exitFile, err := r.ExitFilePath(ctr) - if err != nil { - return err - } - runtimeDir, err := util.GetRuntimeDir() if err != nil { return err @@ -340,22 +338,10 @@ func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { // Only grab exit status if we were not already stopped // If we were, it should already be in the database if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped { - var fi os.FileInfo - chWait := make(chan error) - defer close(chWait) - - _, err := WaitForFile(exitFile, chWait, time.Second*5) - if err == nil { - fi, err = os.Stat(exitFile) - } - if err != nil { - ctr.state.ExitCode = -1 - ctr.state.FinishedTime = time.Now() - logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err) - return nil + if _, err := ctr.Wait(context.Background()); err != nil { + logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err) } - - return ctr.handleExitFile(exitFile, fi) + return nil } // Handle ContainerStateStopping - keep it unless the container @@ -1014,7 +1000,7 @@ func (r *ConmonOCIRuntime) getLogTag(ctr *Container) (string, error) { data, err := ctr.inspectLocked(false) if err != nil { // FIXME: this error should probably be returned - return "", nil // nolint: nilerr + return "", nil //nolint: nilerr } tmpl, err := template.New("container").Parse(logTag) if err != nil { @@ -1166,7 +1152,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co }).Debugf("running conmon: %s", r.conmonPath) cmd := exec.Command(r.conmonPath, args...) - cmd.Dir = ctr.bundlePath() cmd.SysProcAttr = &syscall.SysProcAttr{ Setpgid: true, } @@ -1354,8 +1339,6 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p logDriverArg = define.NoLogging case define.PassthroughLogging: logDriverArg = define.PassthroughLogging - case define.JSONLogging: - fallthrough //lint:ignore ST1015 the default case has to be here default: //nolint:stylecheck,gocritic // No case here should happen except JSONLogging, but keep this here in case the options are extended @@ -1365,6 +1348,8 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p // to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod // since the former case is obscure, and the latter case isn't an error, let's silently fallthrough fallthrough + case define.JSONLogging: + fallthrough case define.KubernetesLogging: logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath) } @@ -1435,7 +1420,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec } // $INVOCATION_ID is set by systemd when running as a service. - if os.Getenv("INVOCATION_ID") != "" { + if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" { mustCreateCgroup = false } @@ -1451,9 +1436,14 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec // TODO: This should be a switch - we are not guaranteed that // there are only 2 valid cgroup managers cgroupParent := ctr.CgroupParent() + cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") + Resource := ctr.Spec().Linux.Resources + cgroupResources, err := GetLimits(Resource) + if err != nil { + logrus.StandardLogger().Log(logLevel, "Could not get ctr resources") + } if ctr.CgroupManager() == config.SystemdCgroupsManager { unitName := createUnitName("libpod-conmon", ctr.ID()) - realCgroupParent := cgroupParent splitParent := strings.Split(cgroupParent, "/") if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 { @@ -1465,8 +1455,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err) } } else { - cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon") - control, err := cgroups.New(cgroupPath, &spec.LinuxResources{}) + control, err := cgroups.New(cgroupPath, &cgroupResources) if err != nil { logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err) } else if err := control.AddPid(cmd.Process.Pid); err != nil { @@ -1748,3 +1737,191 @@ func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter, } } } + +// GetLimits converts spec resource limits to cgroup consumable limits +func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) { + if resource == nil { + resource = &spec.LinuxResources{} + } + final := &runcconfig.Resources{} + devs := []*devices.Rule{} + + // Devices + for _, entry := range resource.Devices { + if entry.Major == nil || entry.Minor == nil { + continue + } + runeType := 'a' + switch entry.Type { + case "b": + runeType = 'b' + case "c": + runeType = 'c' + } + + devs = append(devs, &devices.Rule{ + Type: devices.Type(runeType), + Major: *entry.Major, + Minor: *entry.Minor, + Permissions: devices.Permissions(entry.Access), + Allow: entry.Allow, + }) + } + final.Devices = devs + + // HugepageLimits + pageLimits := []*runcconfig.HugepageLimit{} + for _, entry := range resource.HugepageLimits { + pageLimits = append(pageLimits, &runcconfig.HugepageLimit{ + Pagesize: entry.Pagesize, + Limit: entry.Limit, + }) + } + final.HugetlbLimit = pageLimits + + // Networking + netPriorities := []*runcconfig.IfPrioMap{} + if resource.Network != nil { + for _, entry := range resource.Network.Priorities { + netPriorities = append(netPriorities, &runcconfig.IfPrioMap{ + Interface: entry.Name, + Priority: int64(entry.Priority), + }) + } + } + final.NetPrioIfpriomap = netPriorities + rdma := make(map[string]runcconfig.LinuxRdma) + for name, entry := range resource.Rdma { + rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects} + } + final.Rdma = rdma + + // Memory + if resource.Memory != nil { + if resource.Memory.Limit != nil { + final.Memory = *resource.Memory.Limit + } + if resource.Memory.Reservation != nil { + final.MemoryReservation = *resource.Memory.Reservation + } + if resource.Memory.Swap != nil { + final.MemorySwap = *resource.Memory.Swap + } + if resource.Memory.Swappiness != nil { + final.MemorySwappiness = resource.Memory.Swappiness + } + } + + // CPU + if resource.CPU != nil { + if resource.CPU.Period != nil { + final.CpuPeriod = *resource.CPU.Period + } + if resource.CPU.Quota != nil { + final.CpuQuota = *resource.CPU.Quota + } + if resource.CPU.RealtimePeriod != nil { + final.CpuRtPeriod = *resource.CPU.RealtimePeriod + } + if resource.CPU.RealtimeRuntime != nil { + final.CpuRtRuntime = *resource.CPU.RealtimeRuntime + } + if resource.CPU.Shares != nil { + final.CpuShares = *resource.CPU.Shares + } + final.CpusetCpus = resource.CPU.Cpus + final.CpusetMems = resource.CPU.Mems + } + + // BlkIO + if resource.BlockIO != nil { + if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle) + } + } + if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 { + for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice { + throttle := &runcconfig.ThrottleDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + throttle.BlockIODevice = *dev + throttle.Rate = entry.Rate + final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle) + } + } + if resource.BlockIO.LeafWeight != nil { + final.BlkioLeafWeight = *resource.BlockIO.LeafWeight + } + if resource.BlockIO.Weight != nil { + final.BlkioWeight = *resource.BlockIO.Weight + } + if len(resource.BlockIO.WeightDevice) > 0 { + for _, entry := range resource.BlockIO.WeightDevice { + weight := &runcconfig.WeightDevice{} + dev := &runcconfig.BlockIODevice{ + Major: entry.Major, + Minor: entry.Minor, + } + if entry.Weight != nil { + weight.Weight = *entry.Weight + } + if entry.LeafWeight != nil { + weight.LeafWeight = *entry.LeafWeight + } + weight.BlockIODevice = *dev + final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight) + } + } + } + + // Pids + if resource.Pids != nil { + final.PidsLimit = resource.Pids.Limit + } + + // Networking + if resource.Network != nil { + if resource.Network.ClassID != nil { + final.NetClsClassid = *resource.Network.ClassID + } + } + + // Unified state + final.Unified = resource.Unified + + return *final, nil +} diff --git a/libpod/options.go b/libpod/options.go index 7f4ca22d3..3f9a0424a 100644 --- a/libpod/options.go +++ b/libpod/options.go @@ -1824,7 +1824,7 @@ func WithHostDevice(dev []specs.LinuxDevice) CtrCreateOption { } } -// WithSelectedPasswordManagement makes it so that the container either does or does not setup /etc/passwd or /etc/group +// WithSelectedPasswordManagement makes it so that the container either does or does not set up /etc/passwd or /etc/group func WithSelectedPasswordManagement(passwd *bool) CtrCreateOption { return func(c *Container) error { if c.valid { diff --git a/libpod/plugin/volume_api.go b/libpod/plugin/volume_api.go index 2587bd571..332ab912b 100644 --- a/libpod/plugin/volume_api.go +++ b/libpod/plugin/volume_api.go @@ -35,8 +35,6 @@ var ( hostVirtualPath = "/VolumeDriver.Path" mountPath = "/VolumeDriver.Mount" unmountPath = "/VolumeDriver.Unmount" - // nolint - capabilitiesPath = "/VolumeDriver.Capabilities" ) const ( @@ -206,13 +204,13 @@ func (p *VolumePlugin) verifyReachable() error { // Send a request to the volume plugin for handling. // Callers *MUST* close the response when they are done. -func (p *VolumePlugin) sendRequest(toJSON interface{}, hasBody bool, endpoint string) (*http.Response, error) { +func (p *VolumePlugin) sendRequest(toJSON interface{}, endpoint string) (*http.Response, error) { var ( reqJSON []byte err error ) - if hasBody { + if toJSON != nil { reqJSON, err = json.Marshal(toJSON) if err != nil { return nil, errors.Wrapf(err, "error marshalling request JSON for volume plugin %s endpoint %s", p.Name, endpoint) @@ -283,7 +281,7 @@ func (p *VolumePlugin) CreateVolume(req *volume.CreateRequest) error { logrus.Infof("Creating volume %s using plugin %s", req.Name, p.Name) - resp, err := p.sendRequest(req, true, createPath) + resp, err := p.sendRequest(req, createPath) if err != nil { return err } @@ -300,7 +298,7 @@ func (p *VolumePlugin) ListVolumes() ([]*volume.Volume, error) { logrus.Infof("Listing volumes using plugin %s", p.Name) - resp, err := p.sendRequest(nil, false, listPath) + resp, err := p.sendRequest(nil, listPath) if err != nil { return nil, err } @@ -335,7 +333,7 @@ func (p *VolumePlugin) GetVolume(req *volume.GetRequest) (*volume.Volume, error) logrus.Infof("Getting volume %s using plugin %s", req.Name, p.Name) - resp, err := p.sendRequest(req, true, getPath) + resp, err := p.sendRequest(req, getPath) if err != nil { return nil, err } @@ -370,7 +368,7 @@ func (p *VolumePlugin) RemoveVolume(req *volume.RemoveRequest) error { logrus.Infof("Removing volume %s using plugin %s", req.Name, p.Name) - resp, err := p.sendRequest(req, true, removePath) + resp, err := p.sendRequest(req, removePath) if err != nil { return err } @@ -391,7 +389,7 @@ func (p *VolumePlugin) GetVolumePath(req *volume.PathRequest) (string, error) { logrus.Infof("Getting volume %s path using plugin %s", req.Name, p.Name) - resp, err := p.sendRequest(req, true, hostVirtualPath) + resp, err := p.sendRequest(req, hostVirtualPath) if err != nil { return "", err } @@ -428,7 +426,7 @@ func (p *VolumePlugin) MountVolume(req *volume.MountRequest) (string, error) { logrus.Infof("Mounting volume %s using plugin %s for container %s", req.Name, p.Name, req.ID) - resp, err := p.sendRequest(req, true, mountPath) + resp, err := p.sendRequest(req, mountPath) if err != nil { return "", err } @@ -464,7 +462,7 @@ func (p *VolumePlugin) UnmountVolume(req *volume.UnmountRequest) error { logrus.Infof("Unmounting volume %s using plugin %s for container %s", req.Name, p.Name, req.ID) - resp, err := p.sendRequest(req, true, unmountPath) + resp, err := p.sendRequest(req, unmountPath) if err != nil { return err } diff --git a/libpod/pod.go b/libpod/pod.go index 108317637..2502c41a9 100644 --- a/libpod/pod.go +++ b/libpod/pod.go @@ -450,3 +450,14 @@ func (p *Pod) initContainers() ([]*Container, error) { } return initCons, nil } + +func (p *Pod) Config() (*PodConfig, error) { + p.lock.Lock() + defer p.lock.Unlock() + + conf := &PodConfig{} + + err := JSONDeepCopy(p.config, conf) + + return conf, err +} diff --git a/libpod/pod_internal.go b/libpod/pod_internal.go index 41f745e6c..1502bcb06 100644 --- a/libpod/pod_internal.go +++ b/libpod/pod_internal.go @@ -69,7 +69,7 @@ func (p *Pod) refresh() error { if p.config.UsePodCgroup { switch p.runtime.config.Engine.CgroupManager { case config.SystemdCgroupsManager: - cgroupPath, err := systemdSliceFromPath(p.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", p.ID())) + cgroupPath, err := systemdSliceFromPath(p.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", p.ID()), p.ResourceLim()) if err != nil { logrus.Errorf("Creating Cgroup for pod %s: %v", p.ID(), err) } diff --git a/libpod/runtime.go b/libpod/runtime.go index 8aad480bf..da57c20c7 100644 --- a/libpod/runtime.go +++ b/libpod/runtime.go @@ -135,7 +135,7 @@ func SetXdgDirs() error { return nil } - // Setup XDG_RUNTIME_DIR + // Set up XDG_RUNTIME_DIR runtimeDir := os.Getenv("XDG_RUNTIME_DIR") if runtimeDir == "" { @@ -156,7 +156,7 @@ func SetXdgDirs() error { } } - // Setup XDG_CONFIG_HOME + // Set up XDG_CONFIG_HOME if cfgHomeDir := os.Getenv("XDG_CONFIG_HOME"); cfgHomeDir == "" { cfgHomeDir, err := util.GetRootlessConfigHomeDir() if err != nil { @@ -450,7 +450,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { } }() - // Setup the eventer + // Set up the eventer eventer, err := runtime.newEventer() if err != nil { return err @@ -539,7 +539,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { } } - // the store is only setup when we are in the userns so we do the same for the network interface + // the store is only set up when we are in the userns so we do the same for the network interface if !needsUserns { netBackend, netInterface, err := network.NetworkBackend(runtime.store, runtime.config, runtime.syslog) if err != nil { diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index bdfc102ba..4d34c6a08 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -502,7 +502,7 @@ func (r *Runtime) setupContainer(ctx context.Context, ctr *Container) (_ *Contai volOptions = append(volOptions, parsedOptions...) } } - newVol, err := r.newVolume(volOptions...) + newVol, err := r.newVolume(false, volOptions...) if err != nil { return nil, errors.Wrapf(err, "error creating named volume %q", vol.Name) } @@ -664,9 +664,6 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo } if c.state.State == define.ContainerStatePaused { - if err := c.ociRuntime.KillContainer(c, 9, false); err != nil { - return err - } isV2, err := cgroups.IsCgroup2UnifiedMode() if err != nil { return err @@ -677,6 +674,9 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo return err } } + if err := c.ociRuntime.KillContainer(c, 9, false); err != nil { + return err + } // Need to update container state to make sure we know it's stopped if err := c.waitForExitFileAndSync(); err != nil { return err @@ -715,6 +715,10 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo // Do a quick ping of the database to check if the container // still exists. if ok, _ := r.state.HasContainer(c.ID()); !ok { + // When the container has already been removed, the OCI runtime directory remain. + if err := c.cleanupRuntime(ctx); err != nil { + return errors.Wrapf(err, "error cleaning up container %s from OCI runtime", c.ID()) + } return nil } } @@ -755,7 +759,7 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo if cleanupErr == nil { cleanupErr = err } else { - logrus.Errorf("Cleanup storage: %v", err) + logrus.Errorf("Cleaning up storage: %v", err) } } @@ -805,16 +809,16 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo if !volume.Anonymous() { continue } - if err := runtime.removeVolume(ctx, volume, false, timeout); err != nil && errors.Cause(err) != define.ErrNoSuchVolume { + if err := runtime.removeVolume(ctx, volume, false, timeout, false); err != nil && errors.Cause(err) != define.ErrNoSuchVolume { if errors.Cause(err) == define.ErrVolumeBeingUsed { // Ignore error, since podman will report original error volumesFrom, _ := c.volumesFrom() if len(volumesFrom) > 0 { - logrus.Debugf("Cleanup volume not possible since volume is in use (%s)", v) + logrus.Debugf("Cleaning up volume not possible since volume is in use (%s)", v) continue } } - logrus.Errorf("Cleanup volume (%s): %v", v, err) + logrus.Errorf("Cleaning up volume (%s): %v", v, err) } } } @@ -963,8 +967,8 @@ func (r *Runtime) evictContainer(ctx context.Context, idOrName string, removeVol if !volume.Anonymous() { continue } - if err := r.removeVolume(ctx, volume, false, timeout); err != nil && err != define.ErrNoSuchVolume && err != define.ErrVolumeBeingUsed { - logrus.Errorf("Cleanup volume (%s): %v", v, err) + if err := r.removeVolume(ctx, volume, false, timeout, false); err != nil && err != define.ErrNoSuchVolume && err != define.ErrVolumeBeingUsed { + logrus.Errorf("Cleaning up volume (%s): %v", v, err) } } } @@ -1111,7 +1115,7 @@ func (r *Runtime) GetContainersByList(containers []string) ([]*Container, error) for _, inputContainer := range containers { ctr, err := r.LookupContainer(inputContainer) if err != nil { - return ctrs, errors.Wrapf(err, "unable to lookup container %s", inputContainer) + return ctrs, errors.Wrapf(err, "unable to look up container %s", inputContainer) } ctrs = append(ctrs, ctr) } diff --git a/libpod/runtime_pod_linux.go b/libpod/runtime_pod_linux.go index dcc3a044f..1f9ebe724 100644 --- a/libpod/runtime_pod_linux.go +++ b/libpod/runtime_pod_linux.go @@ -17,7 +17,7 @@ import ( "github.com/containers/podman/v4/libpod/events" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/specgen" - spec "github.com/opencontainers/runtime-spec/specs-go" + runcconfig "github.com/opencontainers/runc/libcontainer/configs" "github.com/pkg/errors" "github.com/sirupsen/logrus" ) @@ -66,6 +66,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option case config.CgroupfsCgroupsManager: canUseCgroup := !rootless.IsRootless() || isRootlessCgroupSet(pod.config.CgroupParent) if canUseCgroup { + // need to actually create parent here if pod.config.CgroupParent == "" { pod.config.CgroupParent = CgroupfsDefaultCgroupParent } else if strings.HasSuffix(path.Base(pod.config.CgroupParent), ".slice") { @@ -73,12 +74,29 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option } // If we are set to use pod cgroups, set the cgroup parent that // all containers in the pod will share - // No need to create it with cgroupfs - the first container to - // launch should do it for us if pod.config.UsePodCgroup { pod.state.CgroupPath = filepath.Join(pod.config.CgroupParent, pod.ID()) if p.InfraContainerSpec != nil { p.InfraContainerSpec.CgroupParent = pod.state.CgroupPath + // cgroupfs + rootless = permission denied when creating the cgroup. + if !rootless.IsRootless() { + res, err := GetLimits(p.InfraContainerSpec.ResourceLimits) + if err != nil { + return nil, err + } + // Need to both create and update the cgroup + // rather than create a new path in c/common for pod cgroup creation + // just create as if it is a ctr and then update figures out that we need to + // populate the resource limits on the pod level + cgc, err := cgroups.New(pod.state.CgroupPath, &res) + if err != nil { + return nil, err + } + err = cgc.Update(&res) + if err != nil { + return nil, err + } + } } } } @@ -95,7 +113,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option // If we are set to use pod cgroups, set the cgroup parent that // all containers in the pod will share if pod.config.UsePodCgroup { - cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID())) + cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()), p.InfraContainerSpec.ResourceLimits) if err != nil { return nil, errors.Wrapf(err, "unable to create pod cgroup for pod %s", pod.ID()) } @@ -239,9 +257,8 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, } // New resource limits - resLimits := new(spec.LinuxResources) - resLimits.Pids = new(spec.LinuxPids) - resLimits.Pids.Limit = 1 // Inhibit forks with very low pids limit + resLimits := new(runcconfig.Resources) + resLimits.PidsLimit = 1 // Inhibit forks with very low pids limit // Don't try if we failed to retrieve the cgroup if err == nil { @@ -301,7 +318,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, if !volume.Anonymous() { continue } - if err := r.removeVolume(ctx, volume, false, timeout); err != nil { + if err := r.removeVolume(ctx, volume, false, timeout, false); err != nil { if errors.Cause(err) == define.ErrNoSuchVolume || errors.Cause(err) == define.ErrVolumeRemoved { continue } @@ -321,7 +338,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool, switch p.runtime.config.Engine.CgroupManager { case config.SystemdCgroupsManager: - if err := deleteSystemdCgroup(p.state.CgroupPath); err != nil { + if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil { if removalErr == nil { removalErr = errors.Wrapf(err, "error removing pod %s cgroup", p.ID()) } else { diff --git a/libpod/runtime_volume.go b/libpod/runtime_volume.go index 21bf8aefc..6872db21d 100644 --- a/libpod/runtime_volume.go +++ b/libpod/runtime_volume.go @@ -33,7 +33,7 @@ func (r *Runtime) RemoveVolume(ctx context.Context, v *Volume, force bool, timeo return nil } } - return r.removeVolume(ctx, v, force, timeout) + return r.removeVolume(ctx, v, force, timeout, false) } // GetVolume retrieves a volume given its full name. diff --git a/libpod/runtime_volume_linux.go b/libpod/runtime_volume_linux.go index 59dba9b35..4fd4f7301 100644 --- a/libpod/runtime_volume_linux.go +++ b/libpod/runtime_volume_linux.go @@ -5,6 +5,7 @@ package libpod import ( "context" + "fmt" "os" "path/filepath" "strings" @@ -25,11 +26,13 @@ func (r *Runtime) NewVolume(ctx context.Context, options ...VolumeCreateOption) if !r.valid { return nil, define.ErrRuntimeStopped } - return r.newVolume(options...) + return r.newVolume(false, options...) } -// newVolume creates a new empty volume -func (r *Runtime) newVolume(options ...VolumeCreateOption) (_ *Volume, deferredErr error) { +// newVolume creates a new empty volume with the given options. +// The createPluginVolume can be set to true to make it not create the volume in the volume plugin, +// this is required for the UpdateVolumePlugins() function. If you are not sure set this to false. +func (r *Runtime) newVolume(noCreatePluginVolume bool, options ...VolumeCreateOption) (_ *Volume, deferredErr error) { volume := newVolume(r) for _, option := range options { if err := option(volume); err != nil { @@ -73,7 +76,7 @@ func (r *Runtime) newVolume(options ...VolumeCreateOption) (_ *Volume, deferredE return nil, errors.Wrapf(err, "invalid volume option %s for driver 'local'", key) } } - case "o", "type", "uid", "gid", "size", "inodes", "noquota": + case "o", "type", "uid", "gid", "size", "inodes", "noquota", "copy", "nocopy": // Do nothing, valid keys default: return nil, errors.Wrapf(define.ErrInvalidArg, "invalid mount option %s for driver 'local'", key) @@ -83,7 +86,7 @@ func (r *Runtime) newVolume(options ...VolumeCreateOption) (_ *Volume, deferredE // Now we get conditional: we either need to make the volume in the // volume plugin, or on disk if not using a plugin. - if volume.plugin != nil { + if volume.plugin != nil && !noCreatePluginVolume { // We can't chown, or relabel, or similar the path the volume is // using, because it's not managed by us. // TODO: reevaluate this once we actually have volume plugins in @@ -164,6 +167,85 @@ func (r *Runtime) newVolume(options ...VolumeCreateOption) (_ *Volume, deferredE return volume, nil } +// UpdateVolumePlugins reads all volumes from all configured volume plugins and +// imports them into the libpod db. It also checks if existing libpod volumes +// are removed in the plugin, in this case we try to remove it from libpod. +// On errors we continue and try to do as much as possible. all errors are +// returned as array in the returned struct. +// This function has many race conditions, it is best effort but cannot guarantee +// a perfect state since plugins can be modified from the outside at any time. +func (r *Runtime) UpdateVolumePlugins(ctx context.Context) *define.VolumeReload { + var ( + added []string + removed []string + errs []error + allPluginVolumes = map[string]struct{}{} + ) + + for driverName, socket := range r.config.Engine.VolumePlugins { + driver, err := volplugin.GetVolumePlugin(driverName, socket) + if err != nil { + errs = append(errs, err) + continue + } + vols, err := driver.ListVolumes() + if err != nil { + errs = append(errs, fmt.Errorf("failed to read volumes from plugin %q: %w", driverName, err)) + continue + } + for _, vol := range vols { + allPluginVolumes[vol.Name] = struct{}{} + if _, err := r.newVolume(true, WithVolumeName(vol.Name), WithVolumeDriver(driverName)); err != nil { + // If the volume exists this is not an error, just ignore it and log. It is very likely + // that the volume from the plugin was already in our db. + if !errors.Is(err, define.ErrVolumeExists) { + errs = append(errs, err) + continue + } + logrus.Infof("Volume %q already exists: %v", vol.Name, err) + continue + } + added = append(added, vol.Name) + } + } + + libpodVolumes, err := r.state.AllVolumes() + if err != nil { + errs = append(errs, fmt.Errorf("cannot delete dangling plugin volumes: failed to read libpod volumes: %w", err)) + } + for _, vol := range libpodVolumes { + if vol.UsesVolumeDriver() { + if _, ok := allPluginVolumes[vol.Name()]; !ok { + // The volume is no longer in the plugin, lets remove it from the libpod db. + if err := r.removeVolume(ctx, vol, false, nil, true); err != nil { + if errors.Is(err, define.ErrVolumeBeingUsed) { + // Volume is still used by at least one container. This is very bad, + // the plugin no longer has this but we still need it. + errs = append(errs, fmt.Errorf("volume was removed from the plugin %q but containers still require it: %w", vol.config.Driver, err)) + continue + } + if errors.Is(err, define.ErrNoSuchVolume) || errors.Is(err, define.ErrVolumeRemoved) || errors.Is(err, define.ErrMissingPlugin) { + // Volume was already removed, no problem just ignore it and continue. + continue + } + + // some other error + errs = append(errs, err) + continue + } + // Volume was successfully removed + removed = append(removed, vol.Name()) + } + } + } + + return &define.VolumeReload{ + Added: added, + Removed: removed, + Errors: errs, + } +} + // makeVolumeInPluginIfNotExist makes a volume in the given volume plugin if it // does not already exist. func makeVolumeInPluginIfNotExist(name string, options map[string]string, plugin *volplugin.VolumePlugin) error { @@ -197,8 +279,10 @@ func makeVolumeInPluginIfNotExist(name string, options map[string]string, plugin return nil } -// removeVolume removes the specified volume from state as well tears down its mountpoint and storage -func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeout *uint) error { +// removeVolume removes the specified volume from state as well tears down its mountpoint and storage. +// ignoreVolumePlugin is used to only remove the volume from the db and not the plugin, +// this is required when the volume was already removed from the plugin, i.e. in UpdateVolumePlugins(). +func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeout *uint, ignoreVolumePlugin bool) error { if !v.valid { if ok, _ := r.state.HasVolume(v.Name()); !ok { return nil @@ -263,7 +347,7 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool, timeo var removalErr error // If we use a volume plugin, we need to remove from the plugin. - if v.UsesVolumeDriver() { + if v.UsesVolumeDriver() && !ignoreVolumePlugin { canRemove := true // Do we have a volume driver? diff --git a/libpod/state.go b/libpod/state.go index 471023769..4fbd3c302 100644 --- a/libpod/state.go +++ b/libpod/state.go @@ -111,6 +111,15 @@ type State interface { // Return a container config from the database by full ID GetContainerConfig(id string) (*ContainerConfig, error) + // Add the exit code for the specified container to the database. + AddContainerExitCode(id string, exitCode int32) error + + // Return the exit code for the specified container. + GetContainerExitCode(id string) (int32, error) + + // Remove exit codes older than 5 minutes. + PruneContainerExitCodes() error + // Add creates a reference to an exec session in the database. // The container the exec session is attached to will be recorded. // The container state will not be modified. diff --git a/libpod/stats.go b/libpod/stats.go index 25baa378d..eaac9d7d0 100644 --- a/libpod/stats.go +++ b/libpod/stats.go @@ -9,6 +9,8 @@ import ( "syscall" "time" + runccgroup "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/containers/common/pkg/cgroups" "github.com/containers/podman/v4/libpod/define" "github.com/pkg/errors" @@ -34,8 +36,9 @@ func (c *Container) GetContainerStats(previousStats *define.ContainerStats) (*de } } + // returns stats with the fields' default values respective of their type if c.state.State != define.ContainerStateRunning && c.state.State != define.ContainerStatePaused { - return stats, define.ErrCtrStateInvalid + return stats, nil } if previousStats == nil { @@ -68,29 +71,29 @@ func (c *Container) GetContainerStats(previousStats *define.ContainerStats) (*de // If the current total usage in the cgroup is less than what was previously // recorded then it means the container was restarted and runs in a new cgroup - if previousStats.Duration > cgroupStats.CPU.Usage.Total { + if previousStats.Duration > cgroupStats.CpuStats.CpuUsage.TotalUsage { previousStats = &define.ContainerStats{} } previousCPU := previousStats.CPUNano now := uint64(time.Now().UnixNano()) - stats.Duration = cgroupStats.CPU.Usage.Total + stats.Duration = cgroupStats.CpuStats.CpuUsage.TotalUsage stats.UpTime = time.Duration(stats.Duration) stats.CPU = calculateCPUPercent(cgroupStats, previousCPU, now, previousStats.SystemNano) // calc the average cpu usage for the time the container is running stats.AvgCPU = calculateCPUPercent(cgroupStats, 0, now, uint64(c.state.StartedTime.UnixNano())) - stats.MemUsage = cgroupStats.Memory.Usage.Usage + stats.MemUsage = cgroupStats.MemoryStats.Usage.Usage stats.MemLimit = c.getMemLimit() stats.MemPerc = (float64(stats.MemUsage) / float64(stats.MemLimit)) * 100 stats.PIDs = 0 if conState == define.ContainerStateRunning || conState == define.ContainerStatePaused { - stats.PIDs = cgroupStats.Pids.Current + stats.PIDs = cgroupStats.PidsStats.Current } stats.BlockInput, stats.BlockOutput = calculateBlockIO(cgroupStats) - stats.CPUNano = cgroupStats.CPU.Usage.Total - stats.CPUSystemNano = cgroupStats.CPU.Usage.Kernel + stats.CPUNano = cgroupStats.CpuStats.CpuUsage.TotalUsage + stats.CPUSystemNano = cgroupStats.CpuStats.CpuUsage.UsageInKernelmode stats.SystemNano = now - stats.PerCPU = cgroupStats.CPU.Usage.PerCPU + stats.PerCPU = cgroupStats.CpuStats.CpuUsage.PercpuUsage // Handle case where the container is not in a network namespace if netStats != nil { stats.NetInput = netStats.TxBytes @@ -132,10 +135,10 @@ func (c *Container) getMemLimit() uint64 { // previousCPU is the last value of stats.CPU.Usage.Total measured at the time previousSystem. // (now - previousSystem) is the time delta in nanoseconds, between the measurement in previousCPU // and the updated value in stats. -func calculateCPUPercent(stats *cgroups.Metrics, previousCPU, now, previousSystem uint64) float64 { +func calculateCPUPercent(stats *runccgroup.Stats, previousCPU, now, previousSystem uint64) float64 { var ( cpuPercent = 0.0 - cpuDelta = float64(stats.CPU.Usage.Total - previousCPU) + cpuDelta = float64(stats.CpuStats.CpuUsage.TotalUsage - previousCPU) systemDelta = float64(now - previousSystem) ) if systemDelta > 0.0 && cpuDelta > 0.0 { @@ -145,8 +148,8 @@ func calculateCPUPercent(stats *cgroups.Metrics, previousCPU, now, previousSyste return cpuPercent } -func calculateBlockIO(stats *cgroups.Metrics) (read uint64, write uint64) { - for _, blkIOEntry := range stats.Blkio.IoServiceBytesRecursive { +func calculateBlockIO(stats *runccgroup.Stats) (read uint64, write uint64) { + for _, blkIOEntry := range stats.BlkioStats.IoServiceBytesRecursive { switch strings.ToLower(blkIOEntry.Op) { case "read": read += blkIOEntry.Value diff --git a/libpod/util_linux.go b/libpod/util_linux.go index fe98056dc..414d1bff9 100644 --- a/libpod/util_linux.go +++ b/libpod/util_linux.go @@ -11,6 +11,7 @@ import ( "github.com/containers/common/pkg/cgroups" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/pkg/rootless" + spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -20,7 +21,7 @@ import ( // systemdSliceFromPath makes a new systemd slice under the given parent with // the given name. // The parent must be a slice. The name must NOT include ".slice" -func systemdSliceFromPath(parent, name string) (string, error) { +func systemdSliceFromPath(parent, name string, resources *spec.LinuxResources) (string, error) { cgroupPath, err := assembleSystemdCgroupName(parent, name) if err != nil { return "", err @@ -28,7 +29,7 @@ func systemdSliceFromPath(parent, name string) (string, error) { logrus.Debugf("Created cgroup path %s for parent %s and name %s", cgroupPath, parent, name) - if err := makeSystemdCgroup(cgroupPath); err != nil { + if err := makeSystemdCgroup(cgroupPath, resources); err != nil { return "", errors.Wrapf(err, "error creating cgroup %s", cgroupPath) } @@ -45,8 +46,12 @@ func getDefaultSystemdCgroup() string { } // makeSystemdCgroup creates a systemd Cgroup at the given location. -func makeSystemdCgroup(path string) error { - controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup()) +func makeSystemdCgroup(path string, resources *spec.LinuxResources) error { + res, err := GetLimits(resources) + if err != nil { + return err + } + controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup(), &res) if err != nil { return err } @@ -54,12 +59,20 @@ func makeSystemdCgroup(path string) error { if rootless.IsRootless() { return controller.CreateSystemdUserUnit(path, rootless.GetRootlessUID()) } - return controller.CreateSystemdUnit(path) + err = controller.CreateSystemdUnit(path) + if err != nil { + return err + } + return nil } // deleteSystemdCgroup deletes the systemd cgroup at the given location -func deleteSystemdCgroup(path string) error { - controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup()) +func deleteSystemdCgroup(path string, resources *spec.LinuxResources) error { + res, err := GetLimits(resources) + if err != nil { + return err + } + controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup(), &res) if err != nil { return err } diff --git a/libpod/volume_internal.go b/libpod/volume_internal.go index e0ebb729d..24522c0f9 100644 --- a/libpod/volume_internal.go +++ b/libpod/volume_internal.go @@ -55,6 +55,12 @@ func (v *Volume) needsMount() bool { if _, ok := v.config.Options["NOQUOTA"]; ok { index++ } + if _, ok := v.config.Options["nocopy"]; ok { + index++ + } + if _, ok := v.config.Options["copy"]; ok { + index++ + } // when uid or gid is set there is also the "o" option // set so we have to ignore this one as well if index > 0 { |