Skip to content

Commit 4da371e

Browse files
emyller and khvn26 authored
fix(Monitoring): Serve /metrics on a dedicated port (#158)
Co-authored-by: Kim Gustyr <kim.gustyr@flagsmith.com>
1 parent 6cb67b6 commit 4da371e

File tree

9 files changed

+247
-7
lines changed

9 files changed

+247
-7
lines changed

src/common/core/templates/docgen-metrics.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ sidebar_position: 20
66

77
## Prometheus
88

9-
To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`.
9+
To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`.
10+
11+
When enabled, Flagsmith serves the `/metrics` endpoint on port 9100.
1012

1113
The metrics provided by Flagsmith are described below.
1214

src/common/gunicorn/conf.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
It is used to correctly support Prometheus metrics in a multi-process environment.
55
"""
66

7+
import os
78
import typing
89

910
from prometheus_client.multiprocess import mark_process_dead
@@ -13,6 +14,15 @@
1314
from gunicorn.workers.base import Worker # type: ignore[import-untyped]
1415

1516

16-
def worker_exit(server: "Arbiter", worker: "Worker") -> None:
17+
def when_ready(server: "Arbiter") -> None:
    """Launch the standalone Prometheus metrics server once Gunicorn is ready.

    Nothing happens unless the ``PROMETHEUS_ENABLED`` environment variable
    equals ``true`` (compared case-insensitively).

    Args:
        server: The Gunicorn arbiter invoking this hook (unused).
    """
    # Django settings are not available, so the environment is read directly.
    enabled_flag = os.getenv("PROMETHEUS_ENABLED", "").lower()
    if enabled_flag != "true":
        return

    # Imported only when metrics are enabled.
    from common.gunicorn.metrics_server import start_metrics_server

    start_metrics_server()
24+
25+
26+
def child_exit(server: "Arbiter", worker: "Worker") -> None:
    """Mark an exited worker's Prometheus process collector as dead.

    Args:
        server: The Gunicorn arbiter invoking this hook (unused).
        worker: The worker that exited; only its ``pid`` is read.
    """
    exited_pid = worker.pid
    mark_process_dead(exited_pid)  # type: ignore[no-untyped-call]
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""
2+
Standalone Prometheus metrics HTTP server.
3+
4+
This module provides a separate HTTP server for Prometheus metrics,
5+
independent of the main Gunicorn application server. This improves
6+
metrics reliability under high API load.
7+
8+
The server runs in a daemon thread and serves metrics from the shared
9+
PROMETHEUS_MULTIPROC_DIR directory.
10+
"""
11+
12+
import logging
13+
import os
14+
import threading
15+
16+
from prometheus_client import CollectorRegistry, start_http_server
17+
from prometheus_client.multiprocess import MultiProcessCollector
18+
19+
logger: logging.Logger = logging.getLogger(__name__)

# Default port the standalone metrics server listens on.
METRICS_SERVER_PORT: int = 9100

# Guards `_server_started` so concurrent callers start at most one server.
_server_lock = threading.Lock()
# Set to True once the metrics HTTP server has been started successfully.
_server_started: bool = False
25+
26+
27+
def get_multiprocess_registry() -> CollectorRegistry:
    """Return a new registry with a multiprocess collector attached to it."""
    multiprocess_registry = CollectorRegistry()
    # The collector is constructed for its effect on the registry passed in;
    # the instance itself is not needed afterwards.
    MultiProcessCollector(multiprocess_registry)  # type: ignore[no-untyped-call]
    return multiprocess_registry
32+
33+
34+
def start_metrics_server(
    port: int = METRICS_SERVER_PORT,
) -> None:
    """
    Start the standalone Prometheus metrics HTTP server.

    This function is idempotent - once a server has been started
    successfully, further calls return without doing anything.

    Startup is best-effort: every failure mode below is logged and the
    function returns without raising, so a metrics misconfiguration does
    not propagate into the caller (e.g. a Gunicorn server hook):

    - ``PROMETHEUS_MULTIPROC_DIR`` is unset or empty (warning);
    - the multiprocess registry cannot be created, e.g. because
      ``PROMETHEUS_MULTIPROC_DIR`` is not an existing directory (error);
    - the port cannot be bound (error).

    Args:
        port: The port to serve metrics on. Defaults to 9100.
    """
    global _server_started

    with _server_lock:
        if _server_started:
            logger.debug("Metrics server already started")
            return

        prometheus_multiproc_dir = os.environ.get("PROMETHEUS_MULTIPROC_DIR")
        if not prometheus_multiproc_dir:
            logger.warning("PROMETHEUS_MULTIPROC_DIR not set, skipping metrics server")
            return

        try:
            registry = get_multiprocess_registry()
        except ValueError as e:
            # MultiProcessCollector raises ValueError when the configured
            # directory does not exist; treat it like the other startup
            # failures instead of letting it escape.
            logger.error(
                "Failed to create multiprocess metrics registry from %s: %s",
                prometheus_multiproc_dir,
                e,
            )
            return

        try:
            start_http_server(port=port, registry=registry)
        except OSError as e:
            logger.error("Failed to start metrics server on port %d: %s", port, e)
        else:
            # Only flag the server as started when binding succeeded, so a
            # later call can retry after a failure.
            _server_started = True
            logger.info("Prometheus metrics server started on port %d", port)

tests/integration/core/snapshots/test_docgen__metrics__runs_expected.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ sidebar_position: 20
66

77
## Prometheus
88

9-
To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`.
9+
To enable the Prometheus `/metrics` endpoint, set the `PROMETHEUS_ENABLED` environment variable to `true`.
10+
11+
When enabled, Flagsmith serves the `/metrics` endpoint on port 9100.
1012

1113
The metrics provided by Flagsmith are described below.
1214

tests/integration/gunicorn/__init__.py

Whitespace-only changes.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from typing import Generator
2+
3+
import pytest
4+
5+
6+
@pytest.fixture(autouse=True)
def reset_metrics_server_state() -> Generator[None, None, None]:
    """Clear the metrics server's "started" flag before and after each test."""
    from common.gunicorn import metrics_server

    def _mark_not_started() -> None:
        metrics_server._server_started = False

    _mark_not_started()
    yield
    _mark_not_started()
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import socket
2+
import urllib.request
3+
4+
import prometheus_client
5+
import pytest
6+
7+
from common.gunicorn.metrics_server import start_metrics_server
8+
from tests import GetLogsFixture
9+
10+
11+
@pytest.mark.prometheus_multiprocess_mode
def test_start_metrics_server__multiprocess_mode__serves_metrics(
    unused_tcp_port: int,
    test_metric: prometheus_client.Counter,
) -> None:
    """The standalone server exposes metrics recorded in multiprocess mode."""
    # Given
    test_metric.labels(test_name="standalone_server_test").inc()
    metrics_url = f"http://localhost:{unused_tcp_port}/metrics"

    # When
    start_metrics_server(port=unused_tcp_port)

    # Then
    with urllib.request.urlopen(metrics_url) as response:
        status = response.status
        content = response.read().decode()

    assert status == 200
    for expected_fragment in (
        "pytest_tests_run_total",
        'test_name="standalone_server_test"',
    ):
        assert expected_fragment in content
31+
32+
33+
def test_start_metrics_server__multiproc_dir_unset__logs_warning_and_skips(
    get_logs: GetLogsFixture,
) -> None:
    """Without PROMETHEUS_MULTIPROC_DIR the server is skipped with a warning."""
    # Given
    # PROMETHEUS_MULTIPROC_DIR is not set (default state)
    expected_record = (
        "WARNING",
        "PROMETHEUS_MULTIPROC_DIR not set, skipping metrics server",
    )

    # When
    start_metrics_server()

    # Then
    captured = get_logs("common.gunicorn.metrics_server")
    assert expected_record in captured
48+
49+
50+
@pytest.mark.prometheus_multiprocess_mode
def test_start_metrics_server__called_multiple_times__remains_idempotent(
    unused_tcp_port: int,
) -> None:
    """Repeated start calls leave a single, still-reachable server."""
    # Given
    start_metrics_server(port=unused_tcp_port)
    metrics_url = f"http://localhost:{unused_tcp_port}/metrics"

    # When
    for _ in range(2):
        start_metrics_server(port=unused_tcp_port)

    # Then
    with urllib.request.urlopen(metrics_url) as response:
        assert response.status == 200
66+
67+
68+
@pytest.mark.prometheus_multiprocess_mode
def test_start_metrics_server__port_unavailable__logs_error(
    unused_tcp_port: int,
    get_logs: GetLogsFixture,
) -> None:
    """A port that is already bound results in a logged error."""
    # Given
    # Bind to 0.0.0.0 to match prometheus_client's default address
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as blocker:
        blocker.bind(("0.0.0.0", unused_tcp_port))
        blocker.listen(1)

        # When
        start_metrics_server(port=unused_tcp_port)

        # Then
        logs = get_logs("common.gunicorn.metrics_server")
        error_messages = [msg for level, msg in logs if level == "ERROR"]
        assert any(
            "Failed to start metrics server" in msg for msg in error_messages
        )
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from unittest.mock import Mock
2+
3+
import pytest
4+
from pytest_mock import MockerFixture
5+
6+
from common.gunicorn.conf import child_exit, when_ready
7+
8+
9+
def test_child_exit__calls_mark_process_dead_with_worker_pid(
    mocker: MockerFixture,
) -> None:
    """`child_exit` forwards the exited worker's pid to `mark_process_dead`."""
    # Given
    expected_pid = 12345
    mark_process_dead_mock = mocker.patch("common.gunicorn.conf.mark_process_dead")
    server = Mock()
    worker = Mock(pid=expected_pid)

    # When
    child_exit(server, worker)

    # Then
    mark_process_dead_mock.assert_called_once_with(expected_pid)
23+
24+
25+
@pytest.mark.parametrize("prometheus_enabled", ("true", "TRUE"))
def test_when_ready__prometheus_enabled__starts_metrics_server(
    mocker: MockerFixture,
    prometheus_enabled: str,
) -> None:
    """`when_ready` starts the metrics server when Prometheus is enabled."""
    # Given
    environment = {"PROMETHEUS_ENABLED": prometheus_enabled}
    mocker.patch.dict("os.environ", environment)
    start_metrics_server_mock = mocker.patch(
        "common.gunicorn.metrics_server.start_metrics_server"
    )

    # When
    when_ready(Mock())

    # Then
    start_metrics_server_mock.assert_called_once()
42+
43+
44+
@pytest.mark.parametrize("prometheus_enabled", ("", "false"))
def test_when_ready__prometheus_disabled__does_not_start_metrics_server(
    mocker: MockerFixture,
    prometheus_enabled: str,
) -> None:
    """`when_ready` leaves the metrics server alone when Prometheus is off."""
    # Given
    environment = {"PROMETHEUS_ENABLED": prometheus_enabled}
    mocker.patch.dict("os.environ", environment)
    start_metrics_server_mock = mocker.patch(
        "common.gunicorn.metrics_server.start_metrics_server"
    )

    # When
    when_ready(Mock())

    # Then
    start_metrics_server_mock.assert_not_called()

tests/unit/common/gunicorn/test_utils.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ def test_run_server__default_config_file__runs_expected(
4444
# Given
4545
# prevent real forking from Gunicorn
4646
mocker.patch("os.fork").return_value = 0
47-
mark_process_dead_mock = mocker.patch("common.gunicorn.conf.mark_process_dead")
4847

4948
pid = os.getpid()
5049

@@ -58,9 +57,6 @@ def delay_kill(pid: int = pid) -> None:
5857
with pytest.raises(SystemExit):
5958
run_server({"bind": f"0.0.0.0:{unused_tcp_port}"})
6059

61-
# Then
62-
mark_process_dead_mock.assert_called_once_with(pid)
63-
6460

6561
def test_get_route_template__returns_expected__caches_expected(
6662
mocker: MockerFixture,

0 commit comments

Comments
 (0)