@@ -45,19 +45,18 @@ func (r *ServiceReconciler) reconcileService(ctx context.Context, instance *apiv
4545 return ctrl.Result {Requeue : true , RequeueAfter : * duration }, nil
4646 }
4747
48- if duration , err := r .handleCulling (ctx , instance ); err != nil || duration != nil {
48+ if done , err := r .handleServiceContainerExit (ctx , instance ); err != nil || done {
4949 if err != nil {
5050 return ctrl.Result {}, err
5151 }
52- return ctrl.Result {Requeue : true , RequeueAfter : * duration }, nil
52+ return ctrl.Result {Requeue : true }, nil
5353 }
5454
55- if instance . Status . IsWarning () {
56- if err := r . handleServiceBackoffLimit ( ctx , instance ); err != nil {
55+ if duration , err := r . handleCulling ( ctx , instance ); err != nil || duration != nil {
56+ if err != nil {
5757 return ctrl.Result {}, err
5858 }
59- // log.V(1).Info("service has warning", "Reschdule check in", 30)
60- // return ctrl.Result{Requeue: true, RequeueAfter: time.Second * time.Duration(30)}, nil
59+ return ctrl.Result {Requeue : true , RequeueAfter : * duration }, nil
6160 }
6261
6362 return ctrl.Result {}, nil
@@ -221,24 +220,75 @@ func (r *ServiceReconciler) cleanUpService(ctx context.Context, instance *apiv1.
221220 return r .handleTTL (ctx , instance )
222221}
223222
224- // handleServiceBackoffLimit checks if service has BackoffLimit and translate it to a warning duration with back-off limit
225- func (r * ServiceReconciler ) handleServiceBackoffLimit (ctx context.Context , instance * apiv1.Service ) error {
// handleServiceContainerExit looks up the exit status of the service's main
// container and, when that exit is treated as terminal, records a succeeded or
// failed status on the instance.
//
// Returns:
//   - (false, nil) when the instance has no "app.kubernetes.io/instance" label,
//     when no exit status is available, when a multi-replica service exited
//     with code 0, or when the failed attempts are still within the limit.
//   - (false, err) when the exit-status lookup or the status update fails.
//   - (true, nil) when the service is considered succeeded or failed, whether
//     or not the status was changed by this call.
//
// NOTE(review): the visible caller in reconcileService returns
// ctrl.Result{Requeue: true} whenever this returns true, and true is returned
// even when LogSucceeded/LogFailed report no change. Confirm that an earlier
// guard stops reconciling finished services; otherwise this requeues on every
// pass.
223+ func (r * ServiceReconciler ) handleServiceContainerExit (ctx context.Context , instance * apiv1.Service ) (bool , error ) {
226224 log := r .Log
227225
228- backoffLimit := instance .Termination . BackoffLimit
229- if backoffLimit == nil {
230- return nil
// Without the instance label there is nothing to look the pods up by, so the
// check is skipped.
226+ instanceID , ok := instance .Labels [ "app.kubernetes.io/instance" ]
227+ if ! ok {
228+ return false , nil
231229 }
232- lastTransitionTime := instance .Status .Conditions [len (instance .Status .Conditions )- 1 ].LastTransitionTime
233- currentTime := metav1 .Now ()
234- duration := currentTime .Sub (lastTransitionTime .Time )
235230
236- if duration >= r . getBackOff ( * backoffLimit ) {
237- log . V ( 1 ). Info ( "Cleanup triggered based on ActiveDeadlineSeconds" )
238- return r . delete ( ctx , instance )
// A nil exitStatus with a nil error yields (false, nil).
// NOTE(review): presumably nil means the main container has not exited yet;
// verify against managers.GetMainContainerExitStatusByInstance.
231+ exitStatus , err := managers . GetMainContainerExitStatusByInstance ( r . Client , instanceID , instance . Namespace )
232+ if err != nil || exitStatus == nil {
233+ return false , err
239234 }
240235
241- return nil
// Start from the default replica count; when a ServiceSpec is present the
// count comes from managers.GetReplicas instead.
236+ replicas := int32 (managers .DefaultServiceReplicas )
237+ if instance .ServiceSpec != nil {
238+ replicas = managers .GetReplicas (managers .DefaultServiceReplicas , * instance .ServiceSpec )
239+ }
240+
241+ now := metav1 .Now ()
242+ // Multi-replica services do not have unambiguous completion semantics.
243+ if replicas == 1 && exitStatus .ExitCode == 0 {
// CompletionTime is set and the status persisted only when LogSucceeded
// reports a change.
244+ if updated := instance .Status .LogSucceeded (); updated {
245+ instance .Status .CompletionTime = & now
246+ log .Info ("Service main container succeeded" , "pod" , exitStatus .PodName , "container" , exitStatus .ContainerName )
247+ if statusErr := r .Status ().Update (ctx , instance ); statusErr != nil {
248+ return false , statusErr
249+ }
// The result of instanceSyncStatus is deliberately discarded.
// NOTE(review): looks best-effort; confirm a sync failure is safe to ignore.
250+ _ = r .instanceSyncStatus (instance )
251+ }
252+ return true , nil
253+ }
254+
// Exit code 0 with a replica count other than 1: no status is recorded.
255+ if exitStatus .ExitCode == 0 {
256+ return false , nil
257+ }
258+
// From here on the exit code is non-zero. The limit defaults to 0 when
// Termination.BackoffLimit is unset.
259+ backoffLimit := int32 (0 )
260+ if instance .Termination .BackoffLimit != nil {
261+ backoffLimit = * instance .Termination .BackoffLimit
262+ }
// A negative limit never marks the service failed here; otherwise the failed
// attempts must be strictly greater than the limit.
// NOTE(review): how ContainerFailedAttempts counts attempts is not visible in
// this diff; verify against its definition.
263+ if backoffLimit < 0 || exitStatus .ContainerFailedAttempts () <= backoffLimit {
264+ return false , nil
265+ }
266+
// Use the reason reported with the exit status, falling back to a generic one.
267+ reason := exitStatus .Reason
268+ if reason == "" {
269+ reason = "ContainerFailed"
270+ }
271+ message := fmt .Sprintf (
272+ "Main container %q in pod %q exited with code %d after %d failed attempt(s), exceeding maxRetries %d" ,
273+ exitStatus .ContainerName ,
274+ exitStatus .PodName ,
275+ exitStatus .ExitCode ,
276+ exitStatus .ContainerFailedAttempts (),
277+ backoffLimit ,
278+ )
// Append the exit status message when there is one.
279+ if exitStatus .Message != "" {
280+ message = fmt .Sprintf ("%s: %s" , message , exitStatus .Message )
281+ }
// Same pattern as the success path: persist only when LogFailed reports a
// change.
282+ if updated := instance .Status .LogFailed (reason , message ); updated {
283+ instance .Status .CompletionTime = & now
284+ log .Info ("Service main container exceeded retry budget" , "pod" , exitStatus .PodName , "container" , exitStatus .ContainerName , "failedAttempts" , exitStatus .ContainerFailedAttempts (), "maxRetries" , backoffLimit )
285+ if statusErr := r .Status ().Update (ctx , instance ); statusErr != nil {
286+ return false , statusErr
287+ }
// The result of instanceSyncStatus is deliberately discarded, as in the
// success path.
288+ _ = r .instanceSyncStatus (instance )
289+ }
290+
291+ return true , nil
242292}
243293
244294// handleCulling checks if the service is idle and should be culled
0 commit comments