Skip to content

Commit aaa27c2

Browse files
authored
[server] Add /ready probe to smooth rollout of server pods (#20673)
* [server] Move /ready to /startup, and rename code to StartupController (because it's used by the StartupProbe) Tool: gitpod/catfood.gitpod.cloud * [server] Introduce special /ready handler that only returns "false" during the shutdown phase Tool: gitpod/catfood.gitpod.cloud
1 parent fad0801 commit aaa27c2

File tree

6 files changed

+157
-122
lines changed

6 files changed

+157
-122
lines changed

components/server/src/container-module.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ import { WebhookEventGarbageCollector } from "./jobs/webhook-gc";
7171
import { WorkspaceGarbageCollector } from "./jobs/workspace-gc";
7272
import { LinkedInService } from "./linkedin-service";
7373
import { LivenessController } from "./liveness/liveness-controller";
74-
import { ReadinessController } from "./liveness/readiness-controller";
74+
import { StartupController } from "./liveness/startup-controller";
7575
import { RedisSubscriber } from "./messaging/redis-subscriber";
7676
import { MonitoringEndpointsApp } from "./monitoring-endpoints";
7777
import { OAuthController } from "./oauth-server/oauth-controller";
@@ -137,6 +137,7 @@ import { InstallationAdminCleanup } from "./jobs/installation-admin-cleanup";
137137
import { AuditLogService } from "./audit/AuditLogService";
138138
import { AuditLogGarbageCollectorJob } from "./jobs/auditlog-gc";
139139
import { ProbesApp } from "./liveness/probes";
140+
import { ReadinessController } from "./liveness/readiness-controller";
140141

141142
export const productionContainerModule = new ContainerModule(
142143
(bind, unbind, isBound, rebind, unbindAsync, onActivation, onDeactivation) => {
@@ -245,6 +246,7 @@ export const productionContainerModule = new ContainerModule(
245246

246247
bind(ProbesApp).toSelf().inSingletonScope();
247248
bind(LivenessController).toSelf().inSingletonScope();
249+
bind(StartupController).toSelf().inSingletonScope();
248250
bind(ReadinessController).toSelf().inSingletonScope();
249251

250252
bind(OneTimeSecretServer).toSelf().inSingletonScope();

components/server/src/liveness/probes.ts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ import * as http from "http";
88
import express from "express";
99
import { inject, injectable } from "inversify";
1010
import { LivenessController } from "./liveness-controller";
11-
import { ReadinessController } from "./readiness-controller";
11+
import { StartupController } from "./startup-controller";
1212
import { AddressInfo } from "net";
13+
import { ReadinessController } from "./readiness-controller";
1314

1415
@injectable()
1516
export class ProbesApp {
@@ -18,17 +19,17 @@ export class ProbesApp {
1819

1920
constructor(
2021
@inject(LivenessController) protected readonly livenessController: LivenessController,
22+
@inject(StartupController) protected readonly startupController: StartupController,
2123
@inject(ReadinessController) protected readonly readinessController: ReadinessController,
2224
) {
2325
const probesApp = express();
2426
probesApp.use("/live", this.livenessController.apiRouter);
27+
probesApp.use("/startup", this.startupController.apiRouter);
2528
probesApp.use("/ready", this.readinessController.apiRouter);
2629
this.app = probesApp;
2730
}
2831

2932
public async start(port: number): Promise<number> {
30-
await this.readinessController.start();
31-
3233
return new Promise((resolve, reject) => {
3334
const probeServer = this.app.listen(port, () => {
3435
resolve((<AddressInfo>probeServer.address()).port);
@@ -37,8 +38,11 @@ export class ProbesApp {
3738
});
3839
}
3940

41+
public notifyShutdown(): void {
42+
this.readinessController.notifyShutdown();
43+
}
44+
4045
public async stop(): Promise<void> {
4146
this.httpServer?.close();
42-
await this.readinessController.stop();
4347
}
4448
}

components/server/src/liveness/readiness-controller.ts

Lines changed: 19 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -4,138 +4,42 @@
44
* See License.AGPL.txt in the project root for license information.
55
*/
66

7-
import { injectable, inject } from "inversify";
7+
import { injectable } from "inversify";
88
import express from "express";
9-
import { TypeORM } from "@gitpod/gitpod-db/lib";
10-
import { SpiceDBClientProvider } from "../authorization/spicedb";
119
import { log } from "@gitpod/gitpod-protocol/lib/util/logging";
12-
import { v1 } from "@authzed/authzed-node";
13-
import { getExperimentsClientForBackend } from "@gitpod/gitpod-protocol/lib/experiments/configcat-server";
14-
import { Redis } from "ioredis";
15-
import { repeat } from "@gitpod/gitpod-protocol/lib/util/repeat";
16-
import { Disposable, DisposableCollection } from "@gitpod/gitpod-protocol";
1710

11+
/**
12+
* ReadinessController is mimicking the behavior server had in the past: Behave as there is not ready probe - except during shutdown.
13+
*
14+
* Why? In Gitpod, our error strategy has always been "keep it local and retry", instead of "fail loud and have someone else handle it".
15+
* As we don't want to change this now, we keep the same behavior for most of the services lifetime.
16+
*
17+
* Only during shutdown, we want to signal that the service is not ready anymore, to reduce error peaks.
18+
*/
1819
@injectable()
1920
export class ReadinessController {
20-
@inject(TypeORM) protected readonly typeOrm: TypeORM;
21-
@inject(SpiceDBClientProvider) protected readonly spiceDBClientProvider: SpiceDBClientProvider;
22-
@inject(Redis) protected readonly redis: Redis;
23-
24-
private readinessProbeEnabled: boolean = true;
25-
private disposables: DisposableCollection = new DisposableCollection();
21+
private shutdown: boolean = false;
2622

2723
get apiRouter(): express.Router {
2824
const router = express.Router();
2925
this.addReadinessHandler(router);
3026
return router;
3127
}
3228

33-
public async start() {
34-
this.disposables.push(this.startPollingFeatureFlag());
35-
}
36-
37-
public async stop() {
38-
this.disposables.dispose();
29+
public notifyShutdown(): void {
30+
this.shutdown = true;
3931
}
4032

4133
protected addReadinessHandler(router: express.Router) {
4234
router.get("/", async (_, res) => {
43-
try {
44-
if (!this.readinessProbeEnabled) {
45-
log.debug("Readiness check skipped due to feature flag");
46-
res.status(200);
47-
return;
48-
}
49-
50-
// Check database connection
51-
const dbConnection = await this.checkDatabaseConnection();
52-
if (!dbConnection) {
53-
log.warn("Readiness check failed: Database connection failed");
54-
res.status(503).send("Database connection failed");
55-
return;
56-
}
57-
58-
// Check SpiceDB connection
59-
const spiceDBConnection = await this.checkSpiceDBConnection();
60-
if (!spiceDBConnection) {
61-
log.warn("Readiness check failed: SpiceDB connection failed");
62-
res.status(503).send("SpiceDB connection failed");
63-
return;
64-
}
65-
66-
// Check Redis connection
67-
const redisConnection = await this.checkRedisConnection();
68-
if (!redisConnection) {
69-
log.warn("Readiness check failed: Redis connection failed");
70-
res.status(503).send("Redis connection failed");
71-
return;
72-
}
73-
74-
// All connections are good
75-
res.status(200).send("Ready");
76-
log.debug("Readiness check successful");
77-
} catch (error) {
78-
log.error("Readiness check failed", error);
79-
res.status(503).send("Readiness check failed");
35+
if (this.shutdown) {
36+
log.warn("Readiness check failed: Server is shutting down");
37+
res.status(503).send("Server is shutting down");
38+
return;
8039
}
81-
});
82-
}
8340

84-
private async checkDatabaseConnection(): Promise<boolean> {
85-
try {
86-
const connection = await this.typeOrm.getConnection();
87-
// Simple query to verify connection is working
88-
await connection.query("SELECT 1");
89-
log.debug("Database connection check successful");
90-
return true;
91-
} catch (error) {
92-
log.error("Database connection check failed", error);
93-
return false;
94-
}
95-
}
96-
97-
private async checkSpiceDBConnection(): Promise<boolean> {
98-
try {
99-
const client = this.spiceDBClientProvider.getClient();
100-
101-
// Send a request, to verify that the connection works
102-
const req = v1.ReadSchemaRequest.create({});
103-
const response = await client.readSchema(req);
104-
log.debug("SpiceDB connection check successful", { schemaLength: response.schemaText.length });
105-
106-
return true;
107-
} catch (error) {
108-
log.error("SpiceDB connection check failed", error);
109-
return false;
110-
}
111-
}
112-
113-
private async checkRedisConnection(): Promise<boolean> {
114-
try {
115-
// Simple PING command to verify connection is working
116-
const result = await this.redis.ping();
117-
log.debug("Redis connection check successful", { result });
118-
return result === "PONG";
119-
} catch (error) {
120-
log.error("Redis connection check failed", error);
121-
return false;
122-
}
123-
}
124-
125-
private startPollingFeatureFlag(): Disposable {
126-
return repeat(async () => {
127-
// Check feature flag first
128-
const readinessProbeEnabled = await getExperimentsClientForBackend().getValueAsync(
129-
"server_readiness_probe",
130-
true, // Default to readiness probe, skip if false
131-
{},
132-
);
133-
134-
log.debug("Feature flag server_readiness_probe updated", {
135-
readinessProbeEnabled,
136-
oldValue: this.readinessProbeEnabled,
137-
});
138-
this.readinessProbeEnabled = readinessProbeEnabled;
139-
}, 10_000);
41+
res.status(200).send("Ready");
42+
log.debug("Readiness check successful");
43+
});
14044
}
14145
}
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/**
2+
* Copyright (c) 2025 Gitpod GmbH. All rights reserved.
3+
* Licensed under the GNU Affero General Public License (AGPL).
4+
* See License.AGPL.txt in the project root for license information.
5+
*/
6+
7+
import { injectable, inject } from "inversify";
8+
import express from "express";
9+
import { TypeORM } from "@gitpod/gitpod-db/lib";
10+
import { SpiceDBClientProvider } from "../authorization/spicedb";
11+
import { log } from "@gitpod/gitpod-protocol/lib/util/logging";
12+
import { v1 } from "@authzed/authzed-node";
13+
import { Redis } from "ioredis";
14+
15+
@injectable()
16+
export class StartupController {
17+
@inject(TypeORM) protected readonly typeOrm: TypeORM;
18+
@inject(SpiceDBClientProvider) protected readonly spiceDBClientProvider: SpiceDBClientProvider;
19+
@inject(Redis) protected readonly redis: Redis;
20+
21+
get apiRouter(): express.Router {
22+
const router = express.Router();
23+
this.addStartupHandler(router);
24+
return router;
25+
}
26+
27+
protected addStartupHandler(router: express.Router) {
28+
router.get("/", async (_, res) => {
29+
try {
30+
// Check database connection
31+
const dbConnection = await this.checkDatabaseConnection();
32+
if (!dbConnection) {
33+
log.warn("Startup check failed: Database connection failed");
34+
res.status(503).send("Database connection failed");
35+
return;
36+
}
37+
38+
// Check SpiceDB connection
39+
const spiceDBConnection = await this.checkSpiceDBConnection();
40+
if (!spiceDBConnection) {
41+
log.warn("Startup check failed: SpiceDB connection failed");
42+
res.status(503).send("SpiceDB connection failed");
43+
return;
44+
}
45+
46+
// Check Redis connection
47+
const redisConnection = await this.checkRedisConnection();
48+
if (!redisConnection) {
49+
log.warn("Startup check failed: Redis connection failed");
50+
res.status(503).send("Redis connection failed");
51+
return;
52+
}
53+
54+
// All connections are good
55+
res.status(200).send("Ready");
56+
log.debug("Startup check successful");
57+
} catch (error) {
58+
log.error("Startup check failed", error);
59+
res.status(503).send("Startup check failed");
60+
}
61+
});
62+
}
63+
64+
private async checkDatabaseConnection(): Promise<boolean> {
65+
try {
66+
const connection = await this.typeOrm.getConnection();
67+
// Simple query to verify connection is working
68+
await connection.query("SELECT 1");
69+
log.debug("Database connection check successful");
70+
return true;
71+
} catch (error) {
72+
log.error("Database connection check failed", error);
73+
return false;
74+
}
75+
}
76+
77+
private async checkSpiceDBConnection(): Promise<boolean> {
78+
try {
79+
const client = this.spiceDBClientProvider.getClient();
80+
81+
// Send a request, to verify that the connection works
82+
const req = v1.ReadSchemaRequest.create({});
83+
const response = await client.readSchema(req);
84+
log.debug("SpiceDB connection check successful", { schemaLength: response.schemaText.length });
85+
86+
return true;
87+
} catch (error) {
88+
log.error("SpiceDB connection check failed", error);
89+
return false;
90+
}
91+
}
92+
93+
private async checkRedisConnection(): Promise<boolean> {
94+
try {
95+
// Simple PING command to verify connection is working
96+
const result = await this.redis.ping();
97+
log.debug("Redis connection check successful", { result });
98+
return result === "PONG";
99+
} catch (error) {
100+
log.error("Redis connection check failed", error);
101+
return false;
102+
}
103+
}
104+
}

components/server/src/server.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,9 @@ export class Server {
387387
}
388388

389389
public async stop() {
390+
// mark as not-ready
391+
this.probesApp.notifyShutdown();
392+
390393
// run each stop with a timeout of 30s
391394
async function race(workLoad: Promise<any>, task: string, ms: number = 30 * 1000): Promise<void> {
392395
const before = Date.now();
@@ -413,10 +416,13 @@ export class Server {
413416
race(this.stopServer(this.httpServer), "stop httpserver"),
414417
race(this.stopServer(this.privateApiServer), "stop private api server"),
415418
race(this.stopServer(this.publicApiServer), "stop public api server"),
416-
race(this.probesApp.stop(), "stop probe server"),
417419
race((async () => this.disposables.dispose())(), "dispose disposables"),
418420
]);
419421

422+
this.probesApp.stop().catch(() => {
423+
/* ignore */
424+
});
425+
420426
log.info("server stopped.");
421427
}
422428

install/installer/pkg/components/server/deployment.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ func deployment(ctx *common.RenderContext) ([]runtime.Object, error) {
376376
StartupProbe: &corev1.Probe{
377377
ProbeHandler: corev1.ProbeHandler{
378378
HTTPGet: &corev1.HTTPGetAction{
379-
Path: "/ready",
379+
Path: "/startup",
380380
Port: intstr.IntOrString{
381381
Type: intstr.Int,
382382
IntVal: ProbesPort,
@@ -387,6 +387,21 @@ func deployment(ctx *common.RenderContext) ([]runtime.Object, error) {
387387
PeriodSeconds: 10,
388388
FailureThreshold: 18, // try for 180 seconds, then the Pod is restarted
389389
},
390+
// /ready will only return false on shutdown (SIGTERM), always true otherwise
391+
ReadinessProbe: &corev1.Probe{
392+
ProbeHandler: corev1.ProbeHandler{
393+
HTTPGet: &corev1.HTTPGetAction{
394+
Path: "/ready",
395+
Port: intstr.IntOrString{
396+
Type: intstr.Int,
397+
IntVal: ProbesPort,
398+
},
399+
},
400+
},
401+
InitialDelaySeconds: 5,
402+
PeriodSeconds: 5,
403+
FailureThreshold: 1, // mark as "not ready" as quick as possible after receiving SIGTERM
404+
},
390405
SecurityContext: &corev1.SecurityContext{
391406
Privileged: pointer.Bool(false),
392407
AllowPrivilegeEscalation: pointer.Bool(false),

0 commit comments

Comments
 (0)