Skip to content

Commit f944b21

Browse files
Merge pull request #26456 from Honny1/dev/jrodak/healthcheck-timeout-termination-v5.4-rhel
[v5.4-rhel] Fix: Ensure HealthCheck exec session terminates on timeout
2 parents 0ee1d49 + 70c161c commit f944b21

File tree

8 files changed

+170
-39
lines changed

8 files changed

+170
-39
lines changed

cmd/podman/common/create.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
623623
createFlags.StringVar(
624624
&cf.HealthTimeout,
625625
healthTimeoutFlagName, define.DefaultHealthCheckTimeout,
626-
"the maximum time allowed to complete the healthcheck before an interval is considered failed",
626+
"the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process",
627627
)
628628
_ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)
629629

docs/source/markdown/options/health-timeout.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,6 @@
66

77
The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the
88
value can be expressed in a time format such as **1m22s**. The default value is **30s**.
9+
10+
Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*,
11+
it will be sent a `SIGKILL` signal.

libpod/container_exec.go

Lines changed: 113 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -158,34 +158,20 @@ type legacyExecSession struct {
158158
PID int `json:"pid"`
159159
}
160160

161-
// ExecCreate creates a new exec session for the container.
162-
// The session is not started. The ID of the new exec session will be returned.
163-
func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
164-
if !c.batched {
165-
c.lock.Lock()
166-
defer c.lock.Unlock()
167-
168-
if err := c.syncContainer(); err != nil {
169-
return "", err
170-
}
171-
}
172-
173-
// Verify our config
161+
func (c *Container) verifyExecConfig(config *ExecConfig) error {
174162
if config == nil {
175-
return "", fmt.Errorf("must provide a configuration to ExecCreate: %w", define.ErrInvalidArg)
163+
return fmt.Errorf("must provide a configuration to ExecCreate: %w", define.ErrInvalidArg)
176164
}
177165
if len(config.Command) == 0 {
178-
return "", fmt.Errorf("must provide a non-empty command to start an exec session: %w", define.ErrInvalidArg)
166+
return fmt.Errorf("must provide a non-empty command to start an exec session: %w", define.ErrInvalidArg)
179167
}
180168
if config.ExitCommandDelay > 0 && len(config.ExitCommand) == 0 {
181-
return "", fmt.Errorf("must provide a non-empty exit command if giving an exit command delay: %w", define.ErrInvalidArg)
182-
}
183-
184-
// Verify that we are in a good state to continue
185-
if !c.ensureState(define.ContainerStateRunning) {
186-
return "", fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
169+
return fmt.Errorf("must provide a non-empty exit command if giving an exit command delay: %w", define.ErrInvalidArg)
187170
}
171+
return nil
172+
}
188173

174+
func (c *Container) getUniqueExecSessionID() string {
189175
// Generate an ID for our new exec session
190176
sessionID := stringid.GenerateRandomID()
191177
found := true
@@ -202,20 +188,52 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
202188
sessionID = stringid.GenerateRandomID()
203189
}
204190
}
191+
return sessionID
192+
}
205193

206-
// Make our new exec session
194+
func (c *Container) createExecSession(config *ExecConfig) (*ExecSession, error) {
207195
session := new(ExecSession)
208-
session.Id = sessionID
196+
session.Id = c.getUniqueExecSessionID()
209197
session.ContainerId = c.ID()
210198
session.State = define.ExecStateCreated
211199
session.Config = new(ExecConfig)
212200
if err := JSONDeepCopy(config, session.Config); err != nil {
213-
return "", fmt.Errorf("copying exec configuration into exec session: %w", err)
201+
return nil, fmt.Errorf("copying exec configuration into exec session: %w", err)
214202
}
215203

216204
if len(session.Config.ExitCommand) > 0 {
217205
session.Config.ExitCommand = append(session.Config.ExitCommand, []string{session.ID(), c.ID()}...)
218206
}
207+
return session, nil
208+
}
209+
210+
// ExecCreate creates a new exec session for the container.
211+
// The session is not started. The ID of the new exec session will be returned.
212+
func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
213+
if !c.batched {
214+
c.lock.Lock()
215+
defer c.lock.Unlock()
216+
217+
if err := c.syncContainer(); err != nil {
218+
return "", err
219+
}
220+
}
221+
222+
// Verify our config
223+
if err := c.verifyExecConfig(config); err != nil {
224+
return "", err
225+
}
226+
227+
// Verify that we are in a good state to continue
228+
if !c.ensureState(define.ContainerStateRunning) {
229+
return "", fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
230+
}
231+
232+
// Make our new exec session
233+
session, err := c.createExecSession(config)
234+
if err != nil {
235+
return "", err
236+
}
219237

220238
if c.state.ExecSessions == nil {
221239
c.state.ExecSessions = make(map[string]*ExecSession)
@@ -232,7 +250,7 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
232250

233251
logrus.Infof("Created exec session %s in container %s", session.ID(), c.ID())
234252

235-
return sessionID, nil
253+
return session.Id, nil
236254
}
237255

238256
// ExecStart starts an exec session in the container, but does not attach to it.
@@ -775,6 +793,76 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er
775793
return c.ociRuntime.ExecAttachResize(c, sessionID, newSize)
776794
}
777795

796+
func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) {
797+
unlock := true
798+
if !c.batched {
799+
c.lock.Lock()
800+
defer func() {
801+
if unlock {
802+
c.lock.Unlock()
803+
}
804+
}()
805+
806+
if err := c.syncContainer(); err != nil {
807+
return -1, err
808+
}
809+
}
810+
811+
if err := c.verifyExecConfig(config); err != nil {
812+
return -1, err
813+
}
814+
815+
if !c.ensureState(define.ContainerStateRunning) {
816+
return -1, fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
817+
}
818+
819+
session, err := c.createExecSession(config)
820+
if err != nil {
821+
return -1, err
822+
}
823+
824+
if c.state.ExecSessions == nil {
825+
c.state.ExecSessions = make(map[string]*ExecSession)
826+
}
827+
c.state.ExecSessions[session.ID()] = session
828+
defer delete(c.state.ExecSessions, session.ID())
829+
830+
opts, err := prepareForExec(c, session)
831+
if err != nil {
832+
return -1, err
833+
}
834+
defer func() {
835+
if err := c.cleanupExecBundle(session.ID()); err != nil {
836+
logrus.Errorf("Container %s light exec session cleanup error: %v", c.ID(), err)
837+
}
838+
}()
839+
840+
pid, attachErrChan, err := c.ociRuntime.ExecContainer(c, session.ID(), opts, streams, nil)
841+
if err != nil {
842+
return -1, err
843+
}
844+
session.PID = pid
845+
846+
if !c.batched {
847+
c.lock.Unlock()
848+
unlock = false
849+
}
850+
851+
select {
852+
case err = <-attachErrChan:
853+
if err != nil {
854+
return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
855+
}
856+
case <-time.After(timeout):
857+
if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil {
858+
return -1, err
859+
}
860+
return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String())
861+
}
862+
863+
return c.readExecExitCode(session.ID())
864+
}
865+
778866
func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan resize.TerminalSize) (int, error) {
779867
return c.exec(config, streams, resize, false)
780868
}

libpod/define/errors.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,4 +217,7 @@ var (
217217
// ErrRemovingCtrs indicates that there was an error removing all
218218
// containers from a pod.
219219
ErrRemovingCtrs = errors.New("removing pod containers")
220+
221+
// ErrHealthCheckTimeout indicates that a HealthCheck timed out.
222+
ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")
220223
)

libpod/healthcheck.go

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
9393
streams.AttachInput = true
9494

9595
logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
96-
timeStart := time.Now()
9796
hcResult := define.HealthCheckSuccess
9897
config := new(ExecConfig)
9998
config.Command = newCommand
100-
exitCode, hcErr := c.exec(config, streams, nil, true)
99+
timeStart := time.Now()
100+
exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
101+
timeEnd := time.Now()
101102
if hcErr != nil {
102103
hcResult = define.HealthCheckFailure
103-
if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
104+
switch {
105+
case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
104106
errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
105-
errors.Is(hcErr, define.ErrOCIRuntime) {
107+
errors.Is(hcErr, define.ErrOCIRuntime):
106108
returnCode = 1
107109
hcErr = nil
108-
} else {
110+
case errors.Is(hcErr, define.ErrHealthCheckTimeout):
111+
returnCode = -1
112+
default:
109113
returnCode = 125
110114
}
111115
} else if exitCode != 0 {
@@ -124,7 +128,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
124128
}
125129
}
126130

127-
timeEnd := time.Now()
128131
if c.HealthCheckConfig().StartPeriod > 0 {
129132
// there is a start-period we need to honor; we add startPeriod to container start time
130133
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
@@ -140,12 +143,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
140143
eventLog = eventLog[:c.HealthCheckMaxLogSize()]
141144
}
142145

143-
if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
144-
returnCode = -1
145-
hcResult = define.HealthCheckFailure
146-
hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
147-
}
148-
149146
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
150147

151148
healthCheckResult, err := c.updateHealthCheckLog(hcl, inStartPeriod, isStartup)

libpod/oci_conmon_exec_common.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, t
250250

251251
// SIGTERM did not work. On to SIGKILL.
252252
logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID())
253-
if err := unix.Kill(pid, unix.SIGTERM); err != nil {
253+
if err := unix.Kill(pid, unix.SIGKILL); err != nil {
254254
if err == unix.ESRCH {
255255
return nil
256256
}

test/e2e/healthcheck_run_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,4 +390,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
390390
Expect(ps.OutputToStringArray()).To(HaveLen(2))
391391
Expect(ps.OutputToString()).To(ContainSubstring("hc"))
392392
})
393+
394+
It("podman healthcheck - health timeout", func() {
395+
ctrName := "c-h-" + RandomString(6)
396+
podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top")
397+
398+
hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
399+
hc.WaitWithTimeout(10)
400+
Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s"))
401+
})
393402
})

test/system/220-healthcheck.bats

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,4 +418,35 @@ function _check_health_log {
418418
run_podman rm -t 0 -f $ctrname
419419
}
420420

421+
@test "podman healthcheck - stop container when healthcheck runs" {
422+
ctr="c-h-$(safename)"
423+
msg="hc-msg-$(random_string)"
424+
425+
run_podman run -d --name $ctr \
426+
--health-cmd "sleep 20; echo $msg" \
427+
$IMAGE /home/podman/pause
428+
429+
timeout --foreground -v --kill=10 60 \
430+
$PODMAN healthcheck run $ctr &
431+
hc_pid=$!
432+
433+
run_podman inspect $ctr --format "{{.State.Status}}"
434+
assert "$output" == "running" "Container is running"
435+
436+
run_podman stop $ctr
437+
438+
# Wait for background healthcheck to finish and make sure the exit status is 1
439+
rc=0
440+
wait -n $hc_pid || rc=$?
441+
assert $rc -eq 1 "exit status check of healthcheck command"
442+
443+
run_podman inspect $ctr --format "{{.State.Status}}"
444+
assert "$output" == "exited" "Container is stopped"
445+
446+
run_podman inspect $ctr --format "{{.State.Health.Log}}"
447+
assert "$output" !~ "$msg" "Health log message not found"
448+
449+
run_podman rm -f -t0 $ctr
450+
}
451+
421452
# vim: filetype=sh

0 commit comments

Comments (0)