Skip to content

Commit a27cc69

Browse files
authored
Fix deadlock on multiple kill events (#111)
After the first kill event is received from the Mesos agent, the executor stops the goroutine that is responsible for listening to the events channel. This commit changes the events channel to be buffered, so sending a second event will not lock the whole executor process. Also, the kill logic is now based on the task ID received from the Mesos agent, so if the executor missed the launch event, it can still return valid TaskStatus messages.
1 parent f506fde commit a27cc69

File tree

3 files changed

+42
-17
lines changed

3 files changed

+42
-17
lines changed

executor.go

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ type Event struct {
8686
// additional debug message.
8787
Message string
8888

89-
//TODO(janisz): Create domain objects for this.
89+
// TODO(medzin): remove abstractions, because they only obscure all the communication
90+
kill executor.Event_Kill
9091
subscribed executor.Event_Subscribed
9192
launch executor.Event_Launch
9293
}
@@ -115,6 +116,9 @@ const (
115116
// Kill means command should be killed and executor exit.
116117
Kill
117118

119+
// Shutdown means executor should kill all tasks and exit.
120+
Shutdown
121+
118122
// Subscribed means executor attach to mesos Agent.
119123
Subscribed
120124
// Launch means executor should start a task.
@@ -143,11 +147,14 @@ func NewExecutor(cfg Config, hooks ...hook.Hook) *Executor {
143147
config: cfg,
144148
context: ctx,
145149
contextCancel: ctxCancel,
146-
events: make(chan Event),
147-
hookManager: hook.Manager{Hooks: hooks},
148-
stateUpdater: state.BufferedUpdater(cfg.MesosConfig, cfg.StateUpdateBufferSize),
149-
clock: systemClock{},
150-
random: newRandom(),
150+
// workaround for the problem when Mesos agent sends many KILL events to
151+
// the executor, and it locks itself on this channel, because after first
152+
// kill nobody is listening to it
153+
events: make(chan Event, 128),
154+
hookManager: hook.Manager{Hooks: hooks},
155+
stateUpdater: state.BufferedUpdater(cfg.MesosConfig, cfg.StateUpdateBufferSize),
156+
clock: systemClock{},
157+
random: newRandom(),
151158
}
152159
}
153160

@@ -208,7 +215,7 @@ SUBSCRIBE_LOOP:
208215
case <-recoveryTimeout.C:
209216
return fmt.Errorf("failed to re-establish subscription with agent within %v, aborting", e.config.MesosConfig.RecoveryTimeout)
210217
case <-e.context.Done():
211-
log.Debug("Executor context cancelled, breaking subscribe loop")
218+
log.Info("Executor context cancelled, breaking subscribe loop")
212219
break SUBSCRIBE_LOOP
213220
case <-shouldConnect:
214221
subscribe := calls.Subscribe(nil, e.stateUpdater.GetUnacknowledged()).With(callOptions...)
@@ -231,6 +238,7 @@ SUBSCRIBE_LOOP:
231238
}
232239
}
233240

241+
log.Info("Trying to to send remaining state updates with %s timeout", e.config.StateUpdateWaitTimeout)
234242
if err := e.stateUpdater.Wait(e.config.StateUpdateWaitTimeout); err != nil { // try to send remaining state updates
235243
log.WithError(err).Error("Unable to send remaining state updates to Mesos agent")
236244
}
@@ -264,8 +272,10 @@ func (e *Executor) handleMesosEvent(event executor.Event) error {
264272
e.events <- Event{Type: Subscribed, subscribed: *event.GetSubscribed()}
265273
case executor.Event_LAUNCH:
266274
e.events <- Event{Type: Launch, launch: *event.GetLaunch()}
267-
case executor.Event_KILL, executor.Event_SHUTDOWN:
268-
e.events <- Event{Type: Kill}
275+
case executor.Event_KILL:
276+
e.events <- Event{Type: Kill, kill: *event.GetKill()}
277+
case executor.Event_SHUTDOWN:
278+
e.events <- Event{Type: Shutdown}
269279
case executor.Event_ERROR:
270280
return errMustAbort
271281
case executor.Event_ACKNOWLEDGED:
@@ -351,11 +361,11 @@ func (e *Executor) taskEventLoop() {
351361
return
352362
case Kill:
353363
e.shutDown(taskInfo, cmd)
354-
message := "Task killed due to receiving an event from Mesos agent"
355-
taskID := mesos.TaskID{Value: "MISSING"}
356-
if taskInfo != nil {
357-
taskID = taskInfo.GetTaskID()
358-
}
364+
// relying on TaskInfo can be tricky here, as the launch event may
365+
// be lost, so we will not have it, and agent still waits for some
366+
// TaskStatus with valid ID
367+
taskID := event.kill.GetTaskID()
368+
message := "Task killed due to receiving a kill event from Mesos agent"
359369
e.stateUpdater.UpdateWithOptions(
360370
taskID,
361371
mesos.TASK_KILLED,
@@ -364,6 +374,19 @@ func (e *Executor) taskEventLoop() {
364374
},
365375
)
366376
return
377+
case Shutdown:
378+
e.shutDown(taskInfo, cmd)
379+
// it is possible to receive a shutdown without launch
380+
if taskInfo != nil {
381+
message := "Task killed due to receiving a shutdown event from Mesos agent"
382+
e.stateUpdater.UpdateWithOptions(
383+
event.kill.GetTaskID(),
384+
mesos.TASK_KILLED,
385+
state.OptionalInfo{
386+
Message: &message,
387+
},
388+
)
389+
}
367390
}
368391
}
369392
}

executor_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,9 @@ func TestCertificateCheckScheduleTaskKillBeforeCertificateExpires(t *testing.T)
363363
func TestIfNotPanicsWhenKillWithoutLaunch(t *testing.T) {
364364
stateUpdater := new(mockUpdater)
365365
stateUpdater.On("UpdateWithOptions",
366-
mock.AnythingOfType("mesos.TaskID"),
366+
mock.MatchedBy(func(taskID mesos.TaskID) bool {
367+
return taskID.GetValue() == "taskID"
368+
}),
367369
mesos.TASK_KILLED,
368370
mock.AnythingOfType("state.OptionalInfo")).Once()
369371
events := make(chan Event, 1)
@@ -375,7 +377,7 @@ func TestIfNotPanicsWhenKillWithoutLaunch(t *testing.T) {
375377
}
376378

377379
assert.NotPanics(t, func() {
378-
events <- Event{Type: Kill}
380+
events <- Event{Type: Kill, kill: executor.Event_Kill{TaskID: mesos.TaskID{Value: "taskID"}}}
379381
exec.taskEventLoop()
380382
stateUpdater.AssertExpectations(t)
381383
})

gometalinter.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"Aggregate": true,
33
"Concurrency": 2,
4-
"Cyclo": 14,
4+
"Cyclo": 15,
55
"Deadline": "300s",
66
"DisableAll": true,
77
"Enable": [

0 commit comments

Comments
 (0)