Skip to content

[v5.4-rhel] Fix: Ensure HealthCheck exec session terminates on timeout #26456

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/podman/common/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
createFlags.StringVar(
&cf.HealthTimeout,
healthTimeoutFlagName, define.DefaultHealthCheckTimeout,
"the maximum time allowed to complete the healthcheck before an interval is considered failed",
"the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process",
)
_ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)

Expand Down
3 changes: 3 additions & 0 deletions docs/source/markdown/options/health-timeout.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@

The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the
value can be expressed in a time format such as **1m22s**. The default value is **30s**.

Note: A healthcheck that exceeds the *timeout* is marked as failed. If the healthcheck command is still running when the
*timeout* expires, the healthcheck process is sent a `SIGKILL` signal.
138 changes: 113 additions & 25 deletions libpod/container_exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,34 +158,20 @@ type legacyExecSession struct {
PID int `json:"pid"`
}

// ExecCreate creates a new exec session for the container.
// The session is not started. The ID of the new exec session will be returned.
func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()

if err := c.syncContainer(); err != nil {
return "", err
}
}

// Verify our config
func (c *Container) verifyExecConfig(config *ExecConfig) error {
if config == nil {
return "", fmt.Errorf("must provide a configuration to ExecCreate: %w", define.ErrInvalidArg)
return fmt.Errorf("must provide a configuration to ExecCreate: %w", define.ErrInvalidArg)
}
if len(config.Command) == 0 {
return "", fmt.Errorf("must provide a non-empty command to start an exec session: %w", define.ErrInvalidArg)
return fmt.Errorf("must provide a non-empty command to start an exec session: %w", define.ErrInvalidArg)
}
if config.ExitCommandDelay > 0 && len(config.ExitCommand) == 0 {
return "", fmt.Errorf("must provide a non-empty exit command if giving an exit command delay: %w", define.ErrInvalidArg)
}

// Verify that we are in a good state to continue
if !c.ensureState(define.ContainerStateRunning) {
return "", fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
return fmt.Errorf("must provide a non-empty exit command if giving an exit command delay: %w", define.ErrInvalidArg)
}
return nil
}

func (c *Container) getUniqueExecSessionID() string {
// Generate an ID for our new exec session
sessionID := stringid.GenerateRandomID()
found := true
Expand All @@ -202,20 +188,52 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
sessionID = stringid.GenerateRandomID()
}
}
return sessionID
}

// Make our new exec session
func (c *Container) createExecSession(config *ExecConfig) (*ExecSession, error) {
session := new(ExecSession)
session.Id = sessionID
session.Id = c.getUniqueExecSessionID()
session.ContainerId = c.ID()
session.State = define.ExecStateCreated
session.Config = new(ExecConfig)
if err := JSONDeepCopy(config, session.Config); err != nil {
return "", fmt.Errorf("copying exec configuration into exec session: %w", err)
return nil, fmt.Errorf("copying exec configuration into exec session: %w", err)
}

if len(session.Config.ExitCommand) > 0 {
session.Config.ExitCommand = append(session.Config.ExitCommand, []string{session.ID(), c.ID()}...)
}
return session, nil
}

// ExecCreate creates a new exec session for the container.
// The session is not started. The ID of the new exec session will be returned.
func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()

if err := c.syncContainer(); err != nil {
return "", err
}
}

// Verify our config
if err := c.verifyExecConfig(config); err != nil {
return "", err
}

// Verify that we are in a good state to continue
if !c.ensureState(define.ContainerStateRunning) {
return "", fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
}

// Make our new exec session
session, err := c.createExecSession(config)
if err != nil {
return "", err
}

if c.state.ExecSessions == nil {
c.state.ExecSessions = make(map[string]*ExecSession)
Expand All @@ -232,7 +250,7 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) {

logrus.Infof("Created exec session %s in container %s", session.ID(), c.ID())

return sessionID, nil
return session.Id, nil
}

// ExecStart starts an exec session in the container, but does not attach to it.
Expand Down Expand Up @@ -775,6 +793,76 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er
return c.ociRuntime.ExecAttachResize(c, sessionID, newSize)
}

func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) {
unlock := true
if !c.batched {
c.lock.Lock()
defer func() {
if unlock {
c.lock.Unlock()
}
}()

if err := c.syncContainer(); err != nil {
return -1, err
}
}

if err := c.verifyExecConfig(config); err != nil {
return -1, err
}

if !c.ensureState(define.ContainerStateRunning) {
return -1, fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
}

session, err := c.createExecSession(config)
if err != nil {
return -1, err
}

if c.state.ExecSessions == nil {
c.state.ExecSessions = make(map[string]*ExecSession)
}
c.state.ExecSessions[session.ID()] = session
defer delete(c.state.ExecSessions, session.ID())

opts, err := prepareForExec(c, session)
if err != nil {
return -1, err
}
defer func() {
if err := c.cleanupExecBundle(session.ID()); err != nil {
logrus.Errorf("Container %s light exec session cleanup error: %v", c.ID(), err)
}
}()

pid, attachErrChan, err := c.ociRuntime.ExecContainer(c, session.ID(), opts, streams, nil)
if err != nil {
return -1, err
}
session.PID = pid
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In 499ea11#diff-bf52e0a2dfcea051750fa86d05c8a2ce247c66a92a8562570f4d3555d76664a6R872
I'm seeing this line after this one:
session.PIDData = getPidData(pid)

Is that not needed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not needed because we would have to backport this PR #25906.


if !c.batched {
c.lock.Unlock()
unlock = false
}

select {
case err = <-attachErrChan:
if err != nil {
return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
}
case <-time.After(timeout):
if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil {
return -1, err
}
return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String())
}

return c.readExecExitCode(session.ID())
}

func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan resize.TerminalSize) (int, error) {
return c.exec(config, streams, resize, false)
}
Expand Down
3 changes: 3 additions & 0 deletions libpod/define/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,4 +217,7 @@ var (
// ErrRemovingCtrs indicates that there was an error removing all
// containers from a pod.
ErrRemovingCtrs = errors.New("removing pod containers")

// ErrHealthCheckTimeout indicates that a HealthCheck timed out.
ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")
)
21 changes: 9 additions & 12 deletions libpod/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
streams.AttachInput = true

logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
timeStart := time.Now()
hcResult := define.HealthCheckSuccess
config := new(ExecConfig)
config.Command = newCommand
exitCode, hcErr := c.exec(config, streams, nil, true)
timeStart := time.Now()
exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
timeEnd := time.Now()
if hcErr != nil {
hcResult = define.HealthCheckFailure
if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
switch {
case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
errors.Is(hcErr, define.ErrOCIRuntime) {
errors.Is(hcErr, define.ErrOCIRuntime):
returnCode = 1
hcErr = nil
} else {
case errors.Is(hcErr, define.ErrHealthCheckTimeout):
returnCode = -1
default:
returnCode = 125
}
} else if exitCode != 0 {
Expand All @@ -124,7 +128,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
}
}

timeEnd := time.Now()
if c.HealthCheckConfig().StartPeriod > 0 {
// there is a start-period we need to honor; we add startPeriod to container start time
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
Expand All @@ -140,12 +143,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
eventLog = eventLog[:c.HealthCheckMaxLogSize()]
}

if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
returnCode = -1
hcResult = define.HealthCheckFailure
hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
}

hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)

healthCheckResult, err := c.updateHealthCheckLog(hcl, inStartPeriod, isStartup)
Expand Down
2 changes: 1 addition & 1 deletion libpod/oci_conmon_exec_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, t

// SIGTERM did not work. On to SIGKILL.
logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID())
if err := unix.Kill(pid, unix.SIGTERM); err != nil {
if err := unix.Kill(pid, unix.SIGKILL); err != nil {
if err == unix.ESRCH {
return nil
}
Expand Down
9 changes: 9 additions & 0 deletions test/e2e/healthcheck_run_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -390,4 +390,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
Expect(ps.OutputToStringArray()).To(HaveLen(2))
Expect(ps.OutputToString()).To(ContainSubstring("hc"))
})

It("podman healthcheck - health timeout", func() {
ctrName := "c-h-" + RandomString(6)
podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top")

hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
hc.WaitWithTimeout(10)
Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s"))
})
})
31 changes: 31 additions & 0 deletions test/system/220-healthcheck.bats
Original file line number Diff line number Diff line change
Expand Up @@ -418,4 +418,35 @@ function _check_health_log {
run_podman rm -t 0 -f $ctrname
}

@test "podman healthcheck - stop container when healthcheck runs" {
ctr="c-h-$(safename)"
msg="hc-msg-$(random_string)"

run_podman run -d --name $ctr \
--health-cmd "sleep 20; echo $msg" \
$IMAGE /home/podman/pause

timeout --foreground -v --kill=10 60 \
$PODMAN healthcheck run $ctr &
hc_pid=$!

run_podman inspect $ctr --format "{{.State.Status}}"
assert "$output" == "running" "Container is running"

run_podman stop $ctr

# Wait for background healthcheck to finish and make sure the exit status is 1
rc=0
wait -n $hc_pid || rc=$?
assert $rc -eq 1 "exit status check of healthcheck command"

run_podman inspect $ctr --format "{{.State.Status}}"
assert "$output" == "exited" "Container is stopped"

run_podman inspect $ctr --format "{{.State.Health.Log}}"
assert "$output" !~ "$msg" "Health log message not found"

run_podman rm -f -t0 $ctr
}

# vim: filetype=sh