Skip to content

Commit 2df93e6

Browse files
committed
Stop failing services after retry budget
* Treat termination.maxRetries on Deployment-backed services as a main-container crash budget instead of a warning-based deletion timer.
* Mark services failed once failed exits exceed the budget, while allowing negative values to opt into unlimited retries.
1 parent cf6d6d0 commit 2df93e6

4 files changed

Lines changed: 467 additions & 28 deletions

File tree

internal/controller/service/cleanup.go

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package service
22

33
import (
44
"context"
5-
"math"
65
"time"
76

87
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -82,12 +81,3 @@ func (r *ServiceReconciler) handlePastActiveDeadline(ctx context.Context, instan
8281

8382
return nil, nil
8483
}
85-
86-
func (r *ServiceReconciler) getBackOff(backOff int32) time.Duration {
87-
// The backoff is capped such that 'calculated' value never overflows.
88-
delay := float64(1) * math.Pow(2, float64(backOff))
89-
if delay > math.MaxInt64 {
90-
return utils.MaxBackOff
91-
}
92-
return time.Duration(delay)
93-
}

internal/controller/service/service.go

Lines changed: 68 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -45,19 +45,18 @@ func (r *ServiceReconciler) reconcileService(ctx context.Context, instance *apiv
4545
return ctrl.Result{Requeue: true, RequeueAfter: *duration}, nil
4646
}
4747

48-
if duration, err := r.handleCulling(ctx, instance); err != nil || duration != nil {
48+
if done, err := r.handleServiceContainerExit(ctx, instance); err != nil || done {
4949
if err != nil {
5050
return ctrl.Result{}, err
5151
}
52-
return ctrl.Result{Requeue: true, RequeueAfter: *duration}, nil
52+
return ctrl.Result{Requeue: true}, nil
5353
}
5454

55-
if instance.Status.IsWarning() {
56-
if err := r.handleServiceBackoffLimit(ctx, instance); err != nil {
55+
if duration, err := r.handleCulling(ctx, instance); err != nil || duration != nil {
56+
if err != nil {
5757
return ctrl.Result{}, err
5858
}
59-
// log.V(1).Info("service has warning", "Reschdule check in", 30)
60-
// return ctrl.Result{Requeue: true, RequeueAfter: time.Second * time.Duration(30)}, nil
59+
return ctrl.Result{Requeue: true, RequeueAfter: *duration}, nil
6160
}
6261

6362
return ctrl.Result{}, nil
@@ -221,24 +220,75 @@ func (r *ServiceReconciler) cleanUpService(ctx context.Context, instance *apiv1.
221220
return r.handleTTL(ctx, instance)
222221
}
223222

224-
// handleServiceBackoffLimit checks if service has BackoffLimit and translate it to a warning duration with back-off limit
225-
func (r *ServiceReconciler) handleServiceBackoffLimit(ctx context.Context, instance *apiv1.Service) error {
223+
func (r *ServiceReconciler) handleServiceContainerExit(ctx context.Context, instance *apiv1.Service) (bool, error) {
226224
log := r.Log
227225

228-
backoffLimit := instance.Termination.BackoffLimit
229-
if backoffLimit == nil {
230-
return nil
226+
instanceID, ok := instance.Labels["app.kubernetes.io/instance"]
227+
if !ok {
228+
return false, nil
231229
}
232-
lastTransitionTime := instance.Status.Conditions[len(instance.Status.Conditions)-1].LastTransitionTime
233-
currentTime := metav1.Now()
234-
duration := currentTime.Sub(lastTransitionTime.Time)
235230

236-
if duration >= r.getBackOff(*backoffLimit) {
237-
log.V(1).Info("Cleanup triggered based on ActiveDeadlineSeconds")
238-
return r.delete(ctx, instance)
231+
exitStatus, err := managers.GetMainContainerExitStatusByInstance(r.Client, instanceID, instance.Namespace)
232+
if err != nil || exitStatus == nil {
233+
return false, err
239234
}
240235

241-
return nil
236+
replicas := int32(managers.DefaultServiceReplicas)
237+
if instance.ServiceSpec != nil {
238+
replicas = managers.GetReplicas(managers.DefaultServiceReplicas, *instance.ServiceSpec)
239+
}
240+
241+
now := metav1.Now()
242+
// Multi-replica services do not have unambiguous completion semantics.
243+
if replicas == 1 && exitStatus.ExitCode == 0 {
244+
if updated := instance.Status.LogSucceeded(); updated {
245+
instance.Status.CompletionTime = &now
246+
log.Info("Service main container succeeded", "pod", exitStatus.PodName, "container", exitStatus.ContainerName)
247+
if statusErr := r.Status().Update(ctx, instance); statusErr != nil {
248+
return false, statusErr
249+
}
250+
_ = r.instanceSyncStatus(instance)
251+
}
252+
return true, nil
253+
}
254+
255+
if exitStatus.ExitCode == 0 {
256+
return false, nil
257+
}
258+
259+
backoffLimit := int32(0)
260+
if instance.Termination.BackoffLimit != nil {
261+
backoffLimit = *instance.Termination.BackoffLimit
262+
}
263+
if backoffLimit < 0 || exitStatus.ContainerFailedAttempts() <= backoffLimit {
264+
return false, nil
265+
}
266+
267+
reason := exitStatus.Reason
268+
if reason == "" {
269+
reason = "ContainerFailed"
270+
}
271+
message := fmt.Sprintf(
272+
"Main container %q in pod %q exited with code %d after %d failed attempt(s), exceeding maxRetries %d",
273+
exitStatus.ContainerName,
274+
exitStatus.PodName,
275+
exitStatus.ExitCode,
276+
exitStatus.ContainerFailedAttempts(),
277+
backoffLimit,
278+
)
279+
if exitStatus.Message != "" {
280+
message = fmt.Sprintf("%s: %s", message, exitStatus.Message)
281+
}
282+
if updated := instance.Status.LogFailed(reason, message); updated {
283+
instance.Status.CompletionTime = &now
284+
log.Info("Service main container exceeded retry budget", "pod", exitStatus.PodName, "container", exitStatus.ContainerName, "failedAttempts", exitStatus.ContainerFailedAttempts(), "maxRetries", backoffLimit)
285+
if statusErr := r.Status().Update(ctx, instance); statusErr != nil {
286+
return false, statusErr
287+
}
288+
_ = r.instanceSyncStatus(instance)
289+
}
290+
291+
return true, nil
242292
}
243293

244294
// handleCulling checks if the service is idle and should be culled

0 commit comments

Comments
 (0)