@@ -86,7 +86,8 @@ type Event struct {
86
86
// additional debug message.
87
87
Message string
88
88
89
- //TODO(janisz): Create domain objects for this.
89
+ // TODO(medzin): remove abstractions, because they only obscure all the communication
90
+ kill executor.Event_Kill
90
91
subscribed executor.Event_Subscribed
91
92
launch executor.Event_Launch
92
93
}
@@ -115,6 +116,9 @@ const (
115
116
// Kill means the command should be killed and the executor should exit.
116
117
Kill
117
118
119
+ // Shutdown means executor should kill all tasks and exit.
120
+ Shutdown
121
+
118
122
// Subscribed means the executor attaches to the Mesos agent.
119
123
Subscribed
120
124
// Launch means executor should start a task.
@@ -143,11 +147,14 @@ func NewExecutor(cfg Config, hooks ...hook.Hook) *Executor {
143
147
config : cfg ,
144
148
context : ctx ,
145
149
contextCancel : ctxCancel ,
146
- events : make (chan Event ),
147
- hookManager : hook.Manager {Hooks : hooks },
148
- stateUpdater : state .BufferedUpdater (cfg .MesosConfig , cfg .StateUpdateBufferSize ),
149
- clock : systemClock {},
150
- random : newRandom (),
150
+ // workaround for the problem when Mesos agent sends many KILL events to
151
+ // the executor, and it locks itself on this channel, because after first
152
+ // kill nobody is listening to it
153
+ events : make (chan Event , 128 ),
154
+ hookManager : hook.Manager {Hooks : hooks },
155
+ stateUpdater : state .BufferedUpdater (cfg .MesosConfig , cfg .StateUpdateBufferSize ),
156
+ clock : systemClock {},
157
+ random : newRandom (),
151
158
}
152
159
}
153
160
@@ -208,7 +215,7 @@ SUBSCRIBE_LOOP:
208
215
case <- recoveryTimeout .C :
209
216
return fmt .Errorf ("failed to re-establish subscription with agent within %v, aborting" , e .config .MesosConfig .RecoveryTimeout )
210
217
case <- e .context .Done ():
211
- log .Debug ("Executor context cancelled, breaking subscribe loop" )
218
+ log .Info ("Executor context cancelled, breaking subscribe loop" )
212
219
break SUBSCRIBE_LOOP
213
220
case <- shouldConnect :
214
221
subscribe := calls .Subscribe (nil , e .stateUpdater .GetUnacknowledged ()).With (callOptions ... )
@@ -231,6 +238,7 @@ SUBSCRIBE_LOOP:
231
238
}
232
239
}
233
240
241
+ log .Info ("Trying to to send remaining state updates with %s timeout" , e .config .StateUpdateWaitTimeout )
234
242
if err := e .stateUpdater .Wait (e .config .StateUpdateWaitTimeout ); err != nil { // try to send remaining state updates
235
243
log .WithError (err ).Error ("Unable to send remaining state updates to Mesos agent" )
236
244
}
@@ -264,8 +272,10 @@ func (e *Executor) handleMesosEvent(event executor.Event) error {
264
272
e .events <- Event {Type : Subscribed , subscribed : * event .GetSubscribed ()}
265
273
case executor .Event_LAUNCH :
266
274
e .events <- Event {Type : Launch , launch : * event .GetLaunch ()}
267
- case executor .Event_KILL , executor .Event_SHUTDOWN :
268
- e .events <- Event {Type : Kill }
275
+ case executor .Event_KILL :
276
+ e .events <- Event {Type : Kill , kill : * event .GetKill ()}
277
+ case executor .Event_SHUTDOWN :
278
+ e .events <- Event {Type : Shutdown }
269
279
case executor .Event_ERROR :
270
280
return errMustAbort
271
281
case executor .Event_ACKNOWLEDGED :
@@ -351,11 +361,11 @@ func (e *Executor) taskEventLoop() {
351
361
return
352
362
case Kill :
353
363
e .shutDown (taskInfo , cmd )
354
- message := "Task killed due to receiving an event from Mesos agent"
355
- taskID := mesos. TaskID { Value : "MISSING" }
356
- if taskInfo != nil {
357
- taskID = taskInfo .GetTaskID ()
358
- }
364
+ // relying on TaskInfo can be tricky here, as the launch event may
365
+ // be lost, so we will not have it, and agent still waits for some
366
+ // TaskStatus with valid ID
367
+ taskID := event . kill .GetTaskID ()
368
+ message := "Task killed due to receiving a kill event from Mesos agent"
359
369
e .stateUpdater .UpdateWithOptions (
360
370
taskID ,
361
371
mesos .TASK_KILLED ,
@@ -364,6 +374,19 @@ func (e *Executor) taskEventLoop() {
364
374
},
365
375
)
366
376
return
377
+ case Shutdown :
378
+ e .shutDown (taskInfo , cmd )
379
+ // it is possible to receive a shutdown without launch
380
+ if taskInfo != nil {
381
+ message := "Task killed due to receiving a shutdown event from Mesos agent"
382
+ e .stateUpdater .UpdateWithOptions (
383
+ event .kill .GetTaskID (),
384
+ mesos .TASK_KILLED ,
385
+ state.OptionalInfo {
386
+ Message : & message ,
387
+ },
388
+ )
389
+ }
367
390
}
368
391
}
369
392
}
0 commit comments