Skip to content

Commit b77f687

Browse files
authored
[ws-daemon] start backup even pod still report the container is running after 5 minutes (#20382)
1 parent 08d82bc commit b77f687

File tree

8 files changed

+133
-11
lines changed

8 files changed

+133
-11
lines changed

components/workspacekit/cmd/rings.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ var ring0Cmd = &cobra.Command{
151151

152152
_ = cmd.Process.Signal(unix.SIGTERM)
153153
time.Sleep(ring1ShutdownTimeout)
154-
if cmd.Process == nil {
154+
if cmd.Process == nil || cmd.ProcessState.Exited() {
155155
return
156156
}
157157

components/ws-daemon/pkg/container/container.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99

1010
"golang.org/x/xerrors"
1111

12+
"github.com/containerd/containerd/api/types/task"
1213
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
1314
)
1415

@@ -54,6 +55,10 @@ type Runtime interface {
5455

5556
// DisposeContainer removes a stopped container, and everything we know about it
5657
DisposeContainer(ctx context.Context, workspaceInstanceID string)
58+
59+
GetContainerTaskInfo(ctx context.Context, id ID) (*task.Process, error)
60+
61+
ForceKillContainerTask(ctx context.Context, id ID) error
5762
}
5863

5964
var (

components/ws-daemon/pkg/container/containerd.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"github.com/containerd/containerd/api/events"
2121
"github.com/containerd/containerd/api/services/tasks/v1"
2222
"github.com/containerd/containerd/api/types"
23+
"github.com/containerd/containerd/api/types/task"
2324
"github.com/containerd/containerd/containers"
2425
"github.com/containerd/containerd/errdefs"
2526
"github.com/containerd/containerd/images"
@@ -576,6 +577,28 @@ func (s *Containerd) IsContainerdReady(ctx context.Context) (bool, error) {
576577
return true, nil
577578
}
578579

580+
func (s *Containerd) GetContainerTaskInfo(ctx context.Context, id ID) (*task.Process, error) {
581+
task, err := s.Client.TaskService().Get(ctx, &tasks.GetRequest{
582+
ContainerID: string(id),
583+
})
584+
if err != nil {
585+
return nil, err
586+
}
587+
if task.Process == nil {
588+
return nil, fmt.Errorf("task has no process")
589+
}
590+
return task.Process, nil
591+
}
592+
593+
func (s *Containerd) ForceKillContainerTask(ctx context.Context, id ID) error {
594+
_, err := s.Client.TaskService().Kill(ctx, &tasks.KillRequest{
595+
ContainerID: string(id),
596+
Signal: 9,
597+
All: true,
598+
})
599+
return err
600+
}
601+
579602
var kubepodsQoSRegexp = regexp.MustCompile(`([^/]+)-([^/]+)-pod`)
580603
var kubepodsRegexp = regexp.MustCompile(`([^/]+)-pod`)
581604

components/ws-daemon/pkg/controller/workspace_controller.go

Lines changed: 65 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
2121
"github.com/opentracing/opentracing-go"
2222
"github.com/prometheus/client_golang/prometheus"
23+
"github.com/sirupsen/logrus"
2324

2425
"google.golang.org/protobuf/proto"
2526
corev1 "k8s.io/api/core/v1"
@@ -349,9 +350,43 @@ func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, sp
349350

350351
if ws.IsConditionTrue(workspacev1.WorkspaceConditionContainerRunning) {
351352
// Container is still running, we need to wait for it to stop.
352-
// We should get an event when the condition changes, but requeue
353-
// anyways to make sure we act on it in time.
354-
return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
353+
// We will wait for this situation for up to 5 minutes.
354+
// If the container is still in a running state after that,
355+
// there may be an issue with state synchronization.
356+
// We should start backup anyway to avoid data loss.
357+
if !(ws.Status.PodStoppingTime != nil && time.Since(ws.Status.PodStoppingTime.Time) > 5*time.Minute) {
358+
// We should get an event when the condition changes, but requeue
359+
// anyways to make sure we act on it in time.
360+
return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
361+
}
362+
363+
if !ws.IsConditionTrue(workspacev1.WorkspaceConditionForceKilledTask) {
364+
err = wsc.forceKillContainerTask(ctx, ws)
365+
if err != nil {
366+
glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Errorf("failed to force kill task: %v", err)
367+
}
368+
err = retry.RetryOnConflict(retryParams, func() error {
369+
if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
370+
return err
371+
}
372+
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionForceKilledTask())
373+
return wsc.Client.Status().Update(ctx, ws)
374+
})
375+
if err != nil {
376+
return ctrl.Result{}, fmt.Errorf("failed to set force killed task condition: %w", err)
377+
}
378+
return ctrl.Result{Requeue: true, RequeueAfter: 2 * time.Second}, nil
379+
}
380+
381+
if time.Since(wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionForceKilledTask)).LastTransitionTime.Time) < 2*time.Second {
382+
return ctrl.Result{Requeue: true, RequeueAfter: 2 * time.Second}, nil
383+
}
384+
385+
glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Warn("workspace container is still running after 5 minutes of deletion, starting backup anyway")
386+
err = wsc.dumpWorkspaceContainerInfo(ctx, ws)
387+
if err != nil {
388+
glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Errorf("failed to dump container info: %v", err)
389+
}
355390
}
356391

357392
if wsc.latestWorkspace(ctx, ws) != nil {
@@ -442,6 +477,33 @@ func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, sp
442477
return ctrl.Result{}, nil
443478
}
444479

480+
func (wsc *WorkspaceController) dumpWorkspaceContainerInfo(ctx context.Context, ws *workspacev1.Workspace) error {
481+
id, err := wsc.runtime.WaitForContainer(ctx, ws.Name)
482+
if err != nil {
483+
return fmt.Errorf("failed to wait for container: %w", err)
484+
}
485+
task, err := wsc.runtime.GetContainerTaskInfo(ctx, id)
486+
if err != nil {
487+
return fmt.Errorf("failed to get container task info: %w", err)
488+
}
489+
glog.WithFields(ws.OWI()).WithFields(logrus.Fields{
490+
"containerID": id,
491+
"exitStatus": task.ExitStatus,
492+
"pid": task.Pid,
493+
"exitedAt": task.ExitedAt.AsTime(),
494+
"status": task.Status.String(),
495+
}).Info("container task info")
496+
return nil
497+
}
498+
499+
func (wsc *WorkspaceController) forceKillContainerTask(ctx context.Context, ws *workspacev1.Workspace) error {
500+
id, err := wsc.runtime.WaitForContainer(ctx, ws.Name)
501+
if err != nil {
502+
return fmt.Errorf("failed to wait for container: %w", err)
503+
}
504+
return wsc.runtime.ForceKillContainerTask(ctx, id)
505+
}
506+
445507
func (wsc *WorkspaceController) prepareInitializer(ctx context.Context, ws *workspacev1.Workspace) (*csapi.WorkspaceInitializer, error) {
446508
var init csapi.WorkspaceInitializer
447509
err := proto.Unmarshal(ws.Spec.Initializer, &init)

components/ws-manager-api/go/crd/v1/workspace_types.go

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -181,11 +181,17 @@ type WorkspaceImageInfo struct {
181181

182182
// WorkspaceStatus defines the observed state of Workspace
183183
type WorkspaceStatus struct {
184-
PodStarts int `json:"podStarts"`
185-
PodRecreated int `json:"podRecreated"`
184+
PodStarts int `json:"podStarts"`
185+
186+
// +kubebuilder:validation:Optional
187+
PodRecreated int `json:"podRecreated"`
188+
// +kubebuilder:validation:Optional
186189
PodDeletionTime *metav1.Time `json:"podDeletionTime,omitempty"`
187-
URL string `json:"url,omitempty" scrub:"redact"`
188-
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
190+
// +kubebuilder:validation:Optional
191+
PodStoppingTime *metav1.Time `json:"podStoppingTime,omitempty"`
192+
193+
URL string `json:"url,omitempty" scrub:"redact"`
194+
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
189195

190196
// +kubebuilder:default=Unknown
191197
Phase WorkspacePhase `json:"phase,omitempty"`
@@ -285,6 +291,9 @@ const (
285291

286292
// WorkspaceConditionStateWiped is true once all state has successfully been wiped by ws-daemon. This is only set if PodRejected=true, and the rejected workspace has been deleted.
287293
WorkspaceConditionStateWiped WorkspaceCondition = "StateWiped"
294+
295+
// WorkspaceConditionForceKilledTask is true if we send a SIGKILL to the task
296+
WorkspaceConditionForceKilledTask WorkspaceCondition = "ForceKilledTask"
288297
)
289298

290299
func NewWorkspaceConditionDeployed() metav1.Condition {
@@ -439,6 +448,14 @@ func NewWorkspaceConditionContainerRunning(status metav1.ConditionStatus) metav1
439448
}
440449
}
441450

451+
func NewWorkspaceConditionForceKilledTask() metav1.Condition {
452+
return metav1.Condition{
453+
Type: string(WorkspaceConditionForceKilledTask),
454+
LastTransitionTime: metav1.Now(),
455+
Status: metav1.ConditionTrue,
456+
}
457+
}
458+
442459
// +kubebuilder:validation:Enum:=Unknown;Pending;Imagebuild;Creating;Initializing;Running;Stopping;Stopped
443460
type WorkspacePhase string
444461

components/ws-manager-api/go/crd/v1/zz_generated.deepcopy.go

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -548,11 +548,14 @@ spec:
548548
- Stopping
549549
- Stopped
550550
type: string
551-
podStarts:
552-
type: integer
551+
podDeletionTime:
552+
format: date-time
553+
type: string
553554
podRecreated:
554555
type: integer
555-
podDeletionTime:
556+
podStarts:
557+
type: integer
558+
podStoppingTime:
556559
format: date-time
557560
type: string
558561
runtime:

components/ws-manager-mk2/controllers/status.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
5959
defer func() {
6060
if oldPhase != workspace.Status.Phase {
6161
log.Info("workspace phase updated", "oldPhase", oldPhase, "phase", workspace.Status.Phase)
62+
if workspace.Status.Phase == workspacev1.WorkspacePhaseStopping {
63+
t := metav1.Now()
64+
workspace.Status.PodStoppingTime = &t
65+
}
6266
}
6367
}()
6468

0 commit comments

Comments
 (0)