Skip to content

Commit a178620

Browse files
pbetkierAlfredBroda
authored andcommitted
Omit SIGTERM to certain processes during stop (#136)
* Refactor os/kill.KillTree * sendSignalsToProcesses: don't interrupt on kill error * Add os/kill.KillTreeWithExcludes * Add tests for KillTreeWithExcludes and KillTree * Add configuration of SIGTERM excludes via env var * Fix KillTree tests on Linux platforms * Set go 1.13.1 on travis * Add README entry about kill tree exclude, remove unused wrapWithStopAndCont parameter * Use good old KillTree for SIGTERMs when ALLEGRO_EXECUTOR_SIGTERM_EXCLUDE_PROCESSES not set * Don't exclude processes from kill for which name could not be read * Bring back 'pgrep -g' requirement to readme
1 parent f073a42 commit a178620

File tree

8 files changed

+310
-30
lines changed

8 files changed

+310
-30
lines changed

.travis.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
language: go
22
go:
3-
- 1.10.x
3+
- 1.13.1
44

55
script:
66
- make
@@ -19,4 +19,4 @@ deploy:
1919
file_glob: true
2020
file: target/*.zip
2121
on:
22-
tags: true
22+
tags: true

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ It is performed in the following steps:
5050
3. Wait `KillPolicyGracePeriod` (can be overridden with Task Kill Policy Grace Period).
5151
4. Sent SIGKILL to process tree.
5252

53+
Executor can be configured to exclude certain processes from SIGTERM signal. Provide
54+
process names to exclude in `ALLEGRO_EXECUTOR_SIGTERM_EXCLUDE_PROCESSES` environment variable
55+
as a comma-separated string. This feature requires `pgrep -g` to be available on the machine.
56+
5357
## Log scraping
5458

5559
By default executor forwards service stdout/stderr to its own standard streams.
@@ -188,4 +192,4 @@ Mesos Executor is distributed under the [Apache 2.0 License](LICENSE).
188192
[10]: https://mesos.apache.org/documentation/latest/mesos-containerizer/
189193
[11]: https://www.elastic.co/products/logstash
190194
[12]: https://brandur.org/logfmt
191-
[14]: https://godoc.org/github.com/allegro/mesos-executor/servicelog
195+
[14]: https://godoc.org/github.com/allegro/mesos-executor/servicelog

command.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ const (
4242
type Command interface {
4343
Start() error
4444
Wait() <-chan TaskExitState
45-
Stop(gracePeriod time.Duration)
45+
Stop(gracePeriod time.Duration, sigtermExcludeProcesses []string)
4646
}
4747

4848
type cancellableCommand struct {
@@ -102,13 +102,13 @@ func (c *cancellableCommand) waitForCommand() {
102102
close(c.doneChan)
103103
}
104104

105-
func (c *cancellableCommand) Stop(gracePeriod time.Duration) {
105+
func (c *cancellableCommand) Stop(gracePeriod time.Duration, sigtermExcludeProcesses []string) {
106106
// Return if Stop was already called.
107107
if c.killing {
108108
return
109109
}
110110
c.killing = true
111-
err := osutil.KillTree(syscall.SIGTERM, int32(c.cmd.Process.Pid))
111+
err := osutil.KillTreeWithExcludes(syscall.SIGTERM, int32(c.cmd.Process.Pid), sigtermExcludeProcesses)
112112
if err != nil {
113113
log.WithError(err).Errorf("There was a problem with sending %s to %d children", syscall.SIGTERM, c.cmd.Process.Pid)
114114
return

executor.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ type Config struct {
6363
// Range in which certificate will be considered as expired. Used to
6464
// prevent shutdown of all tasks at once.
6565
RandomExpirationRange time.Duration `default:"3h" split_words:"true"`
66+
67+
// SigtermExcludeProcesses specifies process names to omit when sending SIGTERM to process tree during shutdown
68+
SigtermExcludeProcesses []string `split_words:"true"`
6669
}
6770

6871
var errMustAbort = errors.New("received abort signal from mesos, will attempt to re-subscribe")
@@ -522,7 +525,7 @@ func (e *Executor) shutDown(taskInfo *mesos.TaskInfo, cmd Command) {
522525
TaskInfo: mesosutils.TaskInfo{TaskInfo: *taskInfo},
523526
}
524527
_, _ = e.hookManager.HandleEvent(beforeTerminateEvent, true) // ignore errors here, so every hook will have a chance to be called
525-
cmd.Stop(gracePeriod) // blocking call
528+
cmd.Stop(gracePeriod, e.config.SigtermExcludeProcesses) // blocking call
526529
}
527530

528531
func taskExitToEvent(exitStateChan <-chan TaskExitState, events chan<- Event) {

os/kill.go

Lines changed: 124 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
package os
44

55
import (
6+
"bufio"
67
"fmt"
8+
"os/exec"
9+
"strconv"
10+
"strings"
711
"syscall"
812

913
"github.com/shirou/gopsutil/process"
@@ -13,26 +17,36 @@ import (
1317
// KillTree sends signal to whole process tree, starting from given pid as root.
1418
// Order of signalling in process tree is undefined.
1519
func KillTree(signal syscall.Signal, pid int32) error {
16-
proc, err := process.NewProcess(pid)
20+
pgids, err := getProcessGroupsInTree(pid)
1721
if err != nil {
1822
return err
1923
}
2024

25+
signals := wrapWithStopAndCont(signal)
26+
return sendSignalsToProcessGroups(signals, pgids)
27+
}
28+
29+
func getProcessGroupsInTree(pid int32) ([]int, error) {
30+
proc, err := process.NewProcess(pid)
31+
if err != nil {
32+
return nil, err
33+
}
34+
2135
processes := getAllChildren(proc)
2236
processes = append(processes, proc)
2337

2438
curPid := syscall.Getpid()
2539
curPgid, err := syscall.Getpgid(curPid)
2640
if err != nil {
27-
return fmt.Errorf("error getting current process pgid: %s", err)
41+
return nil, fmt.Errorf("error getting current process pgid: %s", err)
2842
}
2943

3044
var pgids []int
3145
pgidsSeen := map[int]bool{}
3246
for _, proc := range processes {
3347
pgid, err := syscall.Getpgid(int(proc.Pid))
3448
if err != nil {
35-
return fmt.Errorf("error getting child process pgid: %s", err)
49+
return nil, fmt.Errorf("error getting child process pgid: %s", err)
3650
}
3751
if pgid == curPgid {
3852
continue
@@ -42,8 +56,7 @@ func KillTree(signal syscall.Signal, pid int32) error {
4256
pgidsSeen[pgid] = true
4357
}
4458
}
45-
46-
return wrapWithStopAndCont(signal, pgids)
59+
return pgids, nil
4760
}
4861

4962
// getAllChildren gets whole descendants tree of given process. Order of returned
@@ -61,27 +74,124 @@ func getAllChildren(proc *process.Process) []*process.Process {
6174
// wrapWithStopAndCont wraps original process tree signal sending with SIGSTOP and
6275
// SIGCONT to prevent processes from forking during termination, so we will not
6376
// have orphaned processes after.
64-
func wrapWithStopAndCont(signal syscall.Signal, pgids []int) error {
77+
func wrapWithStopAndCont(signal syscall.Signal) []syscall.Signal {
6578
signals := []syscall.Signal{syscall.SIGSTOP, signal}
6679
if signal != syscall.SIGKILL { // no point in sending any signal after SIGKILL
6780
signals = append(signals, syscall.SIGCONT)
6881
}
82+
return signals
83+
}
6984

70-
for _, currentSignal := range signals {
71-
if err := sendSignalToProcessGroups(currentSignal, pgids); err != nil {
72-
return err
85+
func sendSignalsToProcessGroups(signals []syscall.Signal, pgids []int) error {
86+
for _, signal := range signals {
87+
for _, pgid := range pgids {
88+
log.Infof("Sending signal %s to pgid %d", signal, pgid)
89+
err := syscall.Kill(-pgid, signal)
90+
if err != nil {
91+
log.Infof("Error sending signal to pgid %d: %s", pgid, err)
92+
}
7393
}
7494
}
7595
return nil
7696
}
7797

78-
func sendSignalToProcessGroups(signal syscall.Signal, pgids []int) error {
98+
// KillTreeWithExcludes sends signal to whole process tree, starting from given pid as root.
99+
// Omits processes matching names specified in processesToExclude. Kills using pids instead of pgids.
100+
func KillTreeWithExcludes(signal syscall.Signal, pid int32, processesToExclude []string) error {
101+
log.Infof("Will send signal %s to tree starting from %d", signal.String(), pid)
102+
103+
if len(processesToExclude) == 0 {
104+
return KillTree(signal, pid)
105+
}
106+
107+
pgids, err := getProcessGroupsInTree(pid)
108+
if err != nil {
109+
return err
110+
}
111+
112+
log.Infof("Found process groups: %v", pgids)
113+
114+
pids, err := findProcessesInGroups(pgids)
115+
if err != nil {
116+
return err
117+
}
118+
119+
log.Infof("Found processes in groups: %v", pids)
120+
121+
pids, err = excludeProcesses(pids, processesToExclude)
122+
if err != nil {
123+
return err
124+
}
125+
126+
signals := wrapWithStopAndCont(signal)
127+
return sendSignalsToProcesses(signals, pids)
128+
}
129+
130+
func findProcessesInGroups(pgids []int) ([]int, error) {
131+
var pids []int
79132
for _, pgid := range pgids {
80-
log.Infof("Sending signal %s to pgid %d", signal, pgid)
81-
err := syscall.Kill(-pgid, signal)
133+
cmd := exec.Command("pgrep", "-g", strconv.Itoa(pgid))
134+
output, err := cmd.CombinedOutput()
135+
if err != nil {
136+
return nil, fmt.Errorf("'pgrep -g %d' failed: %s", pgid, err)
137+
}
138+
if !cmd.ProcessState.Success() {
139+
return nil, fmt.Errorf("'pgrep -g %d' failed, output was: '%s'", pgid, output)
140+
}
141+
142+
scanner := bufio.NewScanner(strings.NewReader(string(output)))
143+
for scanner.Scan() {
144+
pid, err := strconv.Atoi(scanner.Text())
145+
if err != nil {
146+
return nil, fmt.Errorf("cannot convert pgrep output: %s. Output was '%s'", err, output)
147+
}
148+
pids = append(pids, pid)
149+
}
150+
}
151+
152+
return pids, nil
153+
}
154+
155+
func excludeProcesses(pids []int, processesToExclude []string) ([]int, error) {
156+
var retainedPids []int
157+
for _, pid := range pids {
158+
proc, err := process.NewProcess(int32(pid))
159+
if err != nil {
160+
return nil, err
161+
}
162+
163+
name, err := proc.Name()
82164
if err != nil {
83-
log.Infof("Error sending signal to pgid %d: %s", pgid, err)
84-
return err
165+
log.Infof("Could not get process name of %d, will not exclude it from kill", pid)
166+
} else if isExcluded(name, processesToExclude) {
167+
log.Infof("Excluding process %s with pid %d from kill", name, pid)
168+
continue
169+
}
170+
171+
retainedPids = append(retainedPids, pid)
172+
}
173+
174+
return retainedPids, nil
175+
}
176+
177+
func isExcluded(name string, namesToExclude []string) bool {
178+
for _, exclude := range namesToExclude {
179+
if strings.ToLower(name) == strings.ToLower(exclude) {
180+
return true
181+
}
182+
}
183+
184+
return false
185+
}
186+
187+
func sendSignalsToProcesses(signals []syscall.Signal, pids []int) error {
188+
for _, signal := range signals {
189+
for _, pid := range pids {
190+
log.Infof("Sending signal %s to pid %d", signal, pid)
191+
err := syscall.Kill(pid, signal)
192+
if err != nil {
193+
log.Infof("Error sending signal to pid %d: %s", pid, err)
194+
}
85195
}
86196
}
87197
return nil

0 commit comments

Comments
 (0)