Skip to content

Commit 2494ee2

Browse files
pandafynemesifier
andauthored
[fix] Don't evaluate threshold for historical data #666
Fixes #666 Co-authored-by: Federico Capoano <[email protected]>
1 parent 5102868 commit 2494ee2

File tree

5 files changed

+109
-19
lines changed

5 files changed

+109
-19
lines changed

docs/user/device-checks-and-alert-settings.rst

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
Managing Device Checks & Alert Settings
2-
=======================================
1+
Device Checks & Alert Settings
2+
==============================
33

44
We can add checks and define alert settings directly from the **device
55
page**.
@@ -81,3 +81,22 @@ page as shown below:
8181
.. figure:: https://raw.githubusercontent.com/openwisp/openwisp-monitoring/docs/docs/1.1/inline-permissions.png
8282
:target: https://raw.githubusercontent.com/openwisp/openwisp-monitoring/docs/docs/1.1/inline-permissions.png
8383
:align: center
84+
85+
How is Historical Data Handled?
86+
-------------------------------
87+
88+
The :doc:`OpenWrt Monitoring Agent </openwrt-monitoring-agent/index>`
89+
collects and :ref:`temporarily stores monitoring data locally on the
90+
device <monitoring_agent_send_mode>` when it cannot reach OpenWISP, for
91+
example, during network or server outages.
92+
93+
OpenWISP Monitoring supports the submission of historical data, meaning
94+
monitoring information that could not be delivered in real time and is
95+
sent at a later stage. This capability makes both the agent and the server
96+
resilient to occasional disruptions.
97+
98+
However, it's important to note that **historical data does not trigger
99+
alerts** or affect the **health status** of a device. Threshold checks and
100+
health evaluations are only applied to fresh data. This approach prevents
101+
conflicts between outdated information and the device's current state,
102+
which may have changed significantly.

openwisp_monitoring/db/backends/influxdb/tests.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from celery.exceptions import Retry
55
from django.core.exceptions import ValidationError
66
from django.test import TestCase, tag
7-
from django.utils.timezone import now
7+
from django.utils.timezone import make_aware, now
88
from freezegun import freeze_time
99
from influxdb import InfluxDBClient
1010
from influxdb.exceptions import InfluxDBClientError, InfluxDBServerError
@@ -309,8 +309,14 @@ def test_metric_write_microseconds_precision(self):
309309
m = self._create_object_metric(
310310
name="wlan0", key="wlan0", configuration="clients"
311311
)
312-
m.write("00:14:5c:00:00:00", time=datetime(2020, 7, 31, 22, 5, 47, 235142))
313-
m.write("00:23:4a:00:00:00", time=datetime(2020, 7, 31, 22, 5, 47, 235152))
312+
m.write(
313+
"00:14:5c:00:00:00",
314+
time=make_aware(datetime(2020, 7, 31, 22, 5, 47, 235142)),
315+
)
316+
m.write(
317+
"00:23:4a:00:00:00",
318+
time=make_aware(datetime(2020, 7, 31, 22, 5, 47, 235152)),
319+
)
314320
self.assertEqual(len(m.read()), 2)
315321

316322
@patch.object(

openwisp_monitoring/monitoring/base/models.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,12 +376,20 @@ def _set_is_healthy_tolerant(
376376
self._notify_users(notification_type, alert_settings)
377377
return first_time
378378

379+
def _is_historical_data(self, time):
380+
"""
381+
Data older than 5 minutes is considered historical data.
382+
"""
383+
recent_time = timezone.now() - timedelta(minutes=5)
384+
return time < recent_time
385+
379386
def check_threshold(self, value, time=None, retention_policy=None, send_alert=True):
380387
"""Checks if the threshold is crossed and notifies users accordingly"""
381388
try:
382389
alert_settings = self.alertsettings
383390
except ObjectDoesNotExist:
384391
return
392+
385393
is_healthy_changed = self._set_is_healthy(alert_settings, value)
386394
tolerance_healthy_changed_first_time = self._set_is_healthy_tolerant(
387395
alert_settings, value, time, retention_policy, send_alert
@@ -444,7 +452,7 @@ def write(
444452
)
445453
# check can be disabled,
446454
# mostly for automated testing and debugging purposes
447-
if check:
455+
if check and not self._is_historical_data(timestamp):
448456
options["check_threshold_kwargs"] = {
449457
"value": value,
450458
"time": time,

openwisp_monitoring/monitoring/tests/test_models.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from django.db import IntegrityError
99
from django.test import TestCase
1010
from django.utils import timezone
11+
from freezegun import freeze_time
1112
from swapper import load_model
1213

1314
from openwisp_utils.tests import catch_signal
@@ -355,25 +356,33 @@ def test_tolerance(self):
355356
self._create_alert_settings(
356357
metric=m, custom_operator=">", custom_threshold=90, custom_tolerance=5
357358
)
359+
with freeze_time(start_time):
360+
m.write(99)
358361
with self.subTest("within tolerance, no alerts expected"):
359-
m.write(99, time=timezone.now() - timedelta(minutes=2))
362+
with freeze_time(start_time + timedelta(minutes=2)):
363+
m.write(99)
360364
m.refresh_from_db(fields=["is_healthy", "is_healthy_tolerant"])
361365
self.assertEqual(m.is_healthy, False)
362366
self.assertEqual(m.is_healthy_tolerant, True)
363367
self.assertEqual(Notification.objects.count(), 0)
364-
m.write(99, time=timezone.now() - timedelta(minutes=4))
368+
with freeze_time(start_time + timedelta(minutes=4)):
369+
m.write(99)
365370
m.refresh_from_db(fields=["is_healthy", "is_healthy_tolerant"])
366371
self.assertEqual(m.is_healthy, False)
367372
self.assertEqual(m.is_healthy_tolerant, True)
368373
self.assertEqual(Notification.objects.count(), 0)
369374
with self.subTest("tolerance trepassed, alerts expected"):
370-
m.write(99, time=timezone.now() - timedelta(minutes=6))
375+
with freeze_time(start_time + timedelta(minutes=6)):
376+
m.write(99)
371377
m.refresh_from_db(fields=["is_healthy", "is_healthy_tolerant"])
372378
self.assertEqual(m.is_healthy, False)
373379
self.assertEqual(m.is_healthy_tolerant, False)
374380
self.assertEqual(Notification.objects.count(), 1)
375-
with self.subTest("value back to normal, tolerance not considered"):
376-
m.write(71, time=timezone.now() - timedelta(minutes=7))
381+
with self.subTest("value back to normal"):
382+
with freeze_time(start_time + timedelta(minutes=7)):
383+
m.write(71)
384+
with freeze_time(start_time + timedelta(minutes=12)):
385+
m.write(71)
377386
m.refresh_from_db(fields=["is_healthy", "is_healthy_tolerant"])
378387
self.assertEqual(m.is_healthy, True)
379388
self.assertEqual(m.is_healthy_tolerant, True)

openwisp_monitoring/monitoring/tests/test_monitoring_notifications.py

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,10 @@ def test_general_check_threshold_crossed_deferred(self):
7878
self._create_alert_settings(
7979
metric=m, custom_operator=">", custom_threshold=90, custom_tolerance=1
8080
)
81-
m.write(99, time=ten_minutes_ago)
81+
with freeze_time(start_time):
82+
m.write(99)
83+
with freeze_time(ten_minutes_after):
84+
m.write(99)
8285
m.refresh_from_db()
8386
self.assertEqual(m.is_healthy, False)
8487
self.assertEqual(m.is_healthy_tolerant, False)
@@ -122,7 +125,8 @@ def test_general_check_threshold_crossed_for_long_time(self):
122125
)
123126

124127
with self.subTest("Test no notification is generated for healthy status"):
125-
m.write(89, time=ten_minutes_ago)
128+
with freeze_time(ten_minutes_ago):
129+
m.write(89)
126130
m.refresh_from_db()
127131
self.assertEqual(m.is_healthy, True)
128132
self.assertEqual(m.is_healthy_tolerant, True)
@@ -133,7 +137,8 @@ def test_general_check_threshold_crossed_for_long_time(self):
133137
# timeseries database to process the transaction.
134138
self._read_metric(m)
135139
with self.subTest("Test no notification is generated when check=False"):
136-
m.write(91, time=ten_minutes_ago, check=False)
140+
with freeze_time(ten_minutes_ago):
141+
m.write(91, check=False)
137142
self.assertEqual(Notification.objects.count(), 0)
138143

139144
self._read_metric(m)
@@ -241,7 +246,10 @@ def test_object_check_threshold_crossed_deferred(self):
241246
alert_s = self._create_alert_settings(
242247
metric=om, custom_operator=">", custom_threshold=90, custom_tolerance=1
243248
)
244-
om.write(99, time=ten_minutes_ago)
249+
with freeze_time(start_time):
250+
om.write(99)
251+
with freeze_time(ten_minutes_after):
252+
om.write(99)
245253
om.refresh_from_db()
246254
self.assertEqual(om.is_healthy, False)
247255
self.assertEqual(om.is_healthy_tolerant, False)
@@ -271,9 +279,11 @@ def test_object_check_threshold_crossed_for_long_time(self):
271279
alert_s = self._create_alert_settings(
272280
metric=om, custom_operator=">", custom_threshold=90, custom_tolerance=1
273281
)
274-
self._write_metric(om, 89, time=ten_minutes_ago)
282+
with freeze_time(ten_minutes_ago):
283+
self._write_metric(om, 89)
275284
self.assertEqual(Notification.objects.count(), 0)
276-
self._write_metric(om, 91, time=ten_minutes_ago, check=False)
285+
with freeze_time(ten_minutes_ago):
286+
self._write_metric(om, 91, check=False)
277287
self.assertEqual(Notification.objects.count(), 0)
278288
self._write_metric(om, 92)
279289
om.refresh_from_db()
@@ -319,6 +329,38 @@ def test_object_check_threshold_crossed_for_long_time(self):
319329
self.assertEqual(om.is_healthy_tolerant, True)
320330
self.assertEqual(Notification.objects.count(), 2)
321331

332+
@freeze_time(start_time)
333+
def test_object_check_threshold_crossed_historical_data(self):
334+
"""
335+
Do not evaluate threshold crossed for historical data
336+
"""
337+
self._create_admin()
338+
om = self._create_object_metric(name="load")
339+
self._create_alert_settings(
340+
metric=om, custom_operator=">", custom_threshold=90, custom_tolerance=1
341+
)
342+
343+
# We need to write with "time" argument instead of freeze_time to
344+
# simulate historical data
345+
self._write_metric(om, 99, time=start_time - timedelta(minutes=60))
346+
om.refresh_from_db()
347+
self.assertEqual(om.is_healthy, True)
348+
self.assertEqual(om.is_healthy_tolerant, True)
349+
self.assertEqual(Notification.objects.count(), 0)
350+
351+
self._write_metric(om, 99, time=start_time - timedelta(minutes=10))
352+
om.refresh_from_db()
353+
self.assertEqual(om.is_healthy, True)
354+
self.assertEqual(om.is_healthy_tolerant, True)
355+
self.assertEqual(Notification.objects.count(), 0)
356+
357+
# Writing real-time data should enforce the threshold check
358+
self._write_metric(om, 99, time=start_time)
359+
om.refresh_from_db()
360+
self.assertEqual(om.is_healthy, False)
361+
self.assertEqual(om.is_healthy_tolerant, True)
362+
self.assertEqual(Notification.objects.count(), 0)
363+
322364
def test_flapping_metric_with_tolerance(self):
323365
self._create_admin()
324366
om = self._create_object_metric(name="ping")
@@ -403,7 +445,10 @@ def test_alerts_disabled(self):
403445
custom_tolerance=1,
404446
is_active=False,
405447
)
406-
m.write(99, time=ten_minutes_ago)
448+
with freeze_time(start_time):
449+
m.write(99)
450+
with freeze_time(ten_minutes_after):
451+
m.write(99)
407452
m.refresh_from_db()
408453
self.assertEqual(m.is_healthy, False)
409454
self.assertEqual(m.is_healthy_tolerant, False)
@@ -511,7 +556,10 @@ def test_general_check_threshold_with_alert_field_crossed_deferred(self):
511556
self._create_alert_settings(
512557
metric=m, custom_operator=">", custom_threshold=30, custom_tolerance=1
513558
)
514-
m.write(10, time=ten_minutes_ago, extra_values={"test_related_2": 35})
559+
with freeze_time(start_time):
560+
m.write(10, extra_values={"test_related_2": 35})
561+
with freeze_time(ten_minutes_after):
562+
m.write(10, extra_values={"test_related_2": 35})
515563
m.refresh_from_db()
516564
self.assertEqual(m.is_healthy, False)
517565
self.assertEqual(m.is_healthy_tolerant, False)

0 commit comments

Comments
 (0)