From 00007bc171cc20e40d4cebbeea19cd66fa1dd0ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Rod=C3=A1k?= Date: Mon, 13 Jan 2025 15:49:16 +0100 Subject: [PATCH 1/3] [v5.4-rhel] Run HealthCheck without creating and removing the ExecSession in the database MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: https://issues.redhat.com/browse/RHEL-69970 Fixes: https://issues.redhat.com/browse/RHEL-96916 Fixes: https://issues.redhat.com/browse/RHEL-96917 Signed-off-by: Jan Rodák (cherry picked from commit ad9839ac555865a81a7adae62a9a8be1f2c2268f) Signed-off-by: Jan Rodák --- libpod/container_exec.go | 130 +++++++++++++++++++++++++------ libpod/healthcheck.go | 2 +- test/system/220-healthcheck.bats | 31 ++++++++ 3 files changed, 137 insertions(+), 26 deletions(-) diff --git a/libpod/container_exec.go b/libpod/container_exec.go index c9efc707f5..98542efa53 100644 --- a/libpod/container_exec.go +++ b/libpod/container_exec.go @@ -158,34 +158,20 @@ type legacyExecSession struct { PID int `json:"pid"` } -// ExecCreate creates a new exec session for the container. -// The session is not started. The ID of the new exec session will be returned. -func (c *Container) ExecCreate(config *ExecConfig) (string, error) { - if !c.batched { - c.lock.Lock() - defer c.lock.Unlock() - - if err := c.syncContainer(); err != nil { - return "", err - } - } - - // Verify our config +func (c *Container) verifyExecConfig(config *ExecConfig) error { if config == nil { - return "", fmt.Errorf("must provide a configuration to ExecCreate: %w", define.ErrInvalidArg) + return fmt.Errorf("must provide a configuration to ExecCreate: %w", define.ErrInvalidArg) } if len(config.Command) == 0 { - return "", fmt.Errorf("must provide a non-empty command to start an exec session: %w", define.ErrInvalidArg) + return fmt.Errorf("must provide a non-empty command to start an exec session: %w", define.ErrInvalidArg) } if config.ExitCommandDelay > 0 && len(config.ExitCommand) == 0 { - return "", fmt.Errorf("must provide a non-empty exit command if giving an exit command delay: %w", define.ErrInvalidArg) - } - - // Verify that we are in a good state to continue - if !c.ensureState(define.ContainerStateRunning) { - return "", fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid) + return fmt.Errorf("must provide a non-empty exit command if giving an exit command delay: %w", define.ErrInvalidArg) } + return nil +} +func (c *Container) getUniqueExecSessionID() string { // Generate an ID for our new exec session sessionID := stringid.GenerateRandomID() found := true @@ -202,20 +188,52 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) { sessionID = stringid.GenerateRandomID() } } + return sessionID +} - // Make our new exec session +func (c *Container) createExecSession(config *ExecConfig) (*ExecSession, error) { session := new(ExecSession) - session.Id = sessionID + session.Id = c.getUniqueExecSessionID() session.ContainerId = c.ID() session.State = define.ExecStateCreated session.Config = new(ExecConfig) if err := JSONDeepCopy(config, session.Config); err != nil { - return "", fmt.Errorf("copying exec configuration into exec session: %w", err) + return nil, fmt.Errorf("copying exec configuration into exec session: %w", err) } if len(session.Config.ExitCommand) > 0 { session.Config.ExitCommand = append(session.Config.ExitCommand, []string{session.ID(), c.ID()}...) } + return session, nil +} + +// ExecCreate creates a new exec session for the container. +// The session is not started. The ID of the new exec session will be returned. +func (c *Container) ExecCreate(config *ExecConfig) (string, error) { + if !c.batched { + c.lock.Lock() + defer c.lock.Unlock() + + if err := c.syncContainer(); err != nil { + return "", err + } + } + + // Verify our config + if err := c.verifyExecConfig(config); err != nil { + return "", err + } + + // Verify that we are in a good state to continue + if !c.ensureState(define.ContainerStateRunning) { + return "", fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid) + } + + // Make our new exec session + session, err := c.createExecSession(config) + if err != nil { + return "", err + } if c.state.ExecSessions == nil { c.state.ExecSessions = make(map[string]*ExecSession) @@ -232,7 +250,7 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) { logrus.Infof("Created exec session %s in container %s", session.ID(), c.ID()) - return sessionID, nil + return session.Id, nil } // ExecStart starts an exec session in the container, but does not attach to it. @@ -775,6 +793,68 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er return c.ociRuntime.ExecAttachResize(c, sessionID, newSize) } +func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachStreams) (int, error) { + unlock := true + if !c.batched { + c.lock.Lock() + defer func() { + if unlock { + c.lock.Unlock() + } + }() + + if err := c.syncContainer(); err != nil { + return -1, err + } + } + + if err := c.verifyExecConfig(config); err != nil { + return -1, err + } + + if !c.ensureState(define.ContainerStateRunning) { + return -1, fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid) + } + + session, err := c.createExecSession(config) + if err != nil { + return -1, err + } + + if c.state.ExecSessions == nil { + c.state.ExecSessions = make(map[string]*ExecSession) + } + c.state.ExecSessions[session.ID()] = session + defer delete(c.state.ExecSessions, session.ID()) + + opts, err := prepareForExec(c, session) + if err != nil { + return -1, err + } + defer func() { + if err := c.cleanupExecBundle(session.ID()); err != nil { + logrus.Errorf("Container %s light exec session cleanup error: %v", c.ID(), err) + } + }() + + pid, attachErrChan, err := c.ociRuntime.ExecContainer(c, session.ID(), opts, streams, nil) + if err != nil { + return -1, err + } + + if !c.batched { + c.lock.Unlock() + unlock = false + } + + err = <-attachErrChan + if err != nil { + return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err) + } + + return c.readExecExitCode(session.ID()) +} + func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan resize.TerminalSize) (int, error) { return c.exec(config, streams, resize, false) } diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 62312c5546..b6ec8cda85 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -97,7 +97,7 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define. hcResult := define.HealthCheckSuccess config := new(ExecConfig) config.Command = newCommand - exitCode, hcErr := c.exec(config, streams, nil, true) + exitCode, hcErr := c.healthCheckExec(config, streams) if hcErr != nil { hcResult = define.HealthCheckFailure if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) || diff --git a/test/system/220-healthcheck.bats b/test/system/220-healthcheck.bats index 00b788858a..9aa121ecc2 100644 --- a/test/system/220-healthcheck.bats +++ b/test/system/220-healthcheck.bats @@ -418,4 +418,35 @@ function _check_health_log { run_podman rm -t 0 -f $ctrname } +@test "podman healthcheck - stop container when healthcheck runs" { + ctr="c-h-$(safename)" + msg="hc-msg-$(random_string)" + + run_podman run -d --name $ctr \ + --health-cmd "sleep 20; echo $msg" \ + $IMAGE /home/podman/pause + + timeout --foreground -v --kill=10 60 \ + $PODMAN healthcheck run $ctr & + hc_pid=$! + + run_podman inspect $ctr --format "{{.State.Status}}" + assert "$output" == "running" "Container is running" + + run_podman stop $ctr + + # Wait for background healthcheck to finish and make sure the exit status is 1 + rc=0 + wait -n $hc_pid || rc=$? + assert $rc -eq 1 "exit status check of healthcheck command" + + run_podman inspect $ctr --format "{{.State.Status}}" + assert "$output" == "exited" "Container is stopped" + + run_podman inspect $ctr --format "{{.State.Health.Log}}" + assert "$output" !~ "$msg" "Health log message not found" + + run_podman rm -f -t0 $ctr +} + # vim: filetype=sh From ce22ca96ef0bed008728fafa7627df839aeab24a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Rod=C3=A1k?= Date: Tue, 6 May 2025 16:42:13 +0200 Subject: [PATCH 2/3] [v5.4-rhel] Fix: Ensure HealthCheck exec session terminates on timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the HealthCheck exec session would not terminate on timeout, allowing the healthcheck to run indefinitely. Fixes: https://issues.redhat.com/browse/RHEL-86096 Fixes: https://issues.redhat.com/browse/RHEL-96916 Fixes: https://issues.redhat.com/browse/RHEL-96917 Signed-off-by: Jan Rodák (cherry picked from commit 499ea1168bd52a36e98927222a987a6b959f0b50) Signed-off-by: Jan Rodák --- cmd/podman/common/create.go | 2 +- .../source/markdown/options/health-timeout.md | 3 +++ libpod/container_exec.go | 16 ++++++++++---- libpod/define/errors.go | 3 +++ libpod/healthcheck.go | 21 ++++++++----------- test/e2e/healthcheck_run_test.go | 9 ++++++++ 6 files changed, 37 insertions(+), 17 deletions(-) diff --git a/cmd/podman/common/create.go b/cmd/podman/common/create.go index ae18cee3d0..71a1b456d6 100644 --- a/cmd/podman/common/create.go +++ b/cmd/podman/common/create.go @@ -623,7 +623,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions, createFlags.StringVar( &cf.HealthTimeout, healthTimeoutFlagName, define.DefaultHealthCheckTimeout, - "the maximum time allowed to complete the healthcheck before an interval is considered failed", + "the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process", ) _ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone) diff --git a/docs/source/markdown/options/health-timeout.md b/docs/source/markdown/options/health-timeout.md index 0540a48db3..26dbc17086 100644 --- a/docs/source/markdown/options/health-timeout.md +++ b/docs/source/markdown/options/health-timeout.md @@ -6,3 +6,6 @@ The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the value can be expressed in a time format such as **1m22s**. The default value is **30s**. + +Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*, +it will be sent a `SIGKILL` signal. diff --git a/libpod/container_exec.go b/libpod/container_exec.go index 98542efa53..b61581f895 100644 --- a/libpod/container_exec.go +++ b/libpod/container_exec.go @@ -793,7 +793,7 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er return c.ociRuntime.ExecAttachResize(c, sessionID, newSize) } -func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachStreams) (int, error) { +func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) { unlock := true if !c.batched { c.lock.Lock() @@ -841,15 +841,23 @@ func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachSt if err != nil { return -1, err } + session.PID = pid if !c.batched { c.lock.Unlock() unlock = false } - err = <-attachErrChan - if err != nil { - return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err) + select { + case err = <-attachErrChan: + if err != nil { + return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err) + } + case <-time.After(timeout): + if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil { + return -1, err + } + return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String()) } return c.readExecExitCode(session.ID()) diff --git a/libpod/define/errors.go b/libpod/define/errors.go index 96f4ac6224..b9f0d83fdd 100644 --- a/libpod/define/errors.go +++ b/libpod/define/errors.go @@ -217,4 +217,7 @@ var ( // ErrRemovingCtrs indicates that there was an error removing all // containers from a pod. ErrRemovingCtrs = errors.New("removing pod containers") + + // ErrHealthCheckTimeout indicates that a HealthCheck timed out. + ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout") ) diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index b6ec8cda85..2310f2d9b4 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define. streams.AttachInput = true logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID()) - timeStart := time.Now() hcResult := define.HealthCheckSuccess config := new(ExecConfig) config.Command = newCommand - exitCode, hcErr := c.healthCheckExec(config, streams) + timeStart := time.Now() + exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams) + timeEnd := time.Now() if hcErr != nil { hcResult = define.HealthCheckFailure - if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) || + switch { + case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) || errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) || - errors.Is(hcErr, define.ErrOCIRuntime) { + errors.Is(hcErr, define.ErrOCIRuntime): returnCode = 1 hcErr = nil - } else { + case errors.Is(hcErr, define.ErrHealthCheckTimeout): + returnCode = -1 + default: returnCode = 125 } } else if exitCode != 0 { @@ -124,7 +128,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define. } } - timeEnd := time.Now() if c.HealthCheckConfig().StartPeriod > 0 { // there is a start-period we need to honor; we add startPeriod to container start time startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod) @@ -140,12 +143,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define. eventLog = eventLog[:c.HealthCheckMaxLogSize()] } - if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout { - returnCode = -1 - hcResult = define.HealthCheckFailure - hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String()) - } - hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog) healthCheckResult, err := c.updateHealthCheckLog(hcl, inStartPeriod, isStartup) diff --git a/test/e2e/healthcheck_run_test.go b/test/e2e/healthcheck_run_test.go index b65419569f..37c206eee2 100644 --- a/test/e2e/healthcheck_run_test.go +++ b/test/e2e/healthcheck_run_test.go @@ -390,4 +390,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE) Expect(ps.OutputToStringArray()).To(HaveLen(2)) Expect(ps.OutputToString()).To(ContainSubstring("hc")) }) + + It("podman healthcheck - health timeout", func() { + ctrName := "c-h-" + RandomString(6) + podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top") + + hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName}) + hc.WaitWithTimeout(10) + Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s")) + }) }) From 70c161cab1afc7d6c270bfd2deef480995a5ca2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Rod=C3=A1k?= Date: Mon, 12 May 2025 12:47:27 +0200 Subject: [PATCH 3/3] [v5.4-rhel] Fix: Use SIGKILL instead of SIGTERM when ExecStopContainer timeout is 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aligns behavior with documentation stating SIGKILL should be sent immediately if the timeout is zero. Fixes: https://issues.redhat.com/browse/RHEL-96916 Fixes: https://issues.redhat.com/browse/RHEL-96917 Signed-off-by: Jan Rodák (cherry picked from commit 077649f9d0c53583654a61c94ebca0f4fa15b877) --- libpod/oci_conmon_exec_common.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libpod/oci_conmon_exec_common.go b/libpod/oci_conmon_exec_common.go index ca917ff1a3..662e81b5f4 100644 --- a/libpod/oci_conmon_exec_common.go +++ b/libpod/oci_conmon_exec_common.go @@ -250,7 +250,7 @@ func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, t // SIGTERM did not work. On to SIGKILL. logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID()) - if err := unix.Kill(pid, unix.SIGTERM); err != nil { + if err := unix.Kill(pid, unix.SIGKILL); err != nil { if err == unix.ESRCH { return nil }