Skip to content

[fix] Don't evaluate threshold for historical data #666 #669

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions docs/user/device-checks-and-alert-settings.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Managing Device Checks & Alert Settings
=======================================
Device Checks & Alert Settings
==============================

We can add checks and define alert settings directly from the **device
page**.
Expand Down Expand Up @@ -81,3 +81,22 @@ page as shown below:
.. figure:: https://raw.githubusercontent.com/openwisp/openwisp-monitoring/docs/docs/1.1/inline-permissions.png
:target: https://raw.githubusercontent.com/openwisp/openwisp-monitoring/docs/docs/1.1/inline-permissions.png
:align: center

How is Historical Data Handled?
-------------------------------

The :doc:`OpenWrt Monitoring Agent </openwrt-monitoring-agent/index>`
collects and :ref:`temporarily stores monitoring data locally on the
device <monitoring_agent_send_mode>` when it cannot reach OpenWISP, for
example, during network or server outages.

OpenWISP Monitoring supports the submission of historical data, meaning
monitoring information that could not be delivered in real time and is
sent at a later stage. This capability makes both the agent and the server
resilient to occasional disruptions.

However, it's important to note that **historical data does not trigger
alerts** or affect the **health status** of a device. Threshold checks and
health evaluations are only applied to fresh data. This approach prevents
conflicts between outdated information and the device's current state,
which may have changed significantly.
12 changes: 9 additions & 3 deletions openwisp_monitoring/db/backends/influxdb/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from celery.exceptions import Retry
from django.core.exceptions import ValidationError
from django.test import TestCase, tag
from django.utils.timezone import now
from django.utils.timezone import make_aware, now
from freezegun import freeze_time
from influxdb import InfluxDBClient
from influxdb.exceptions import InfluxDBClientError, InfluxDBServerError
Expand Down Expand Up @@ -309,8 +309,14 @@ def test_metric_write_microseconds_precision(self):
m = self._create_object_metric(
name="wlan0", key="wlan0", configuration="clients"
)
m.write("00:14:5c:00:00:00", time=datetime(2020, 7, 31, 22, 5, 47, 235142))
m.write("00:23:4a:00:00:00", time=datetime(2020, 7, 31, 22, 5, 47, 235152))
m.write(
"00:14:5c:00:00:00",
time=make_aware(datetime(2020, 7, 31, 22, 5, 47, 235142)),
)
m.write(
"00:23:4a:00:00:00",
time=make_aware(datetime(2020, 7, 31, 22, 5, 47, 235152)),
)
self.assertEqual(len(m.read()), 2)

@patch.object(
Expand Down
10 changes: 9 additions & 1 deletion openwisp_monitoring/monitoring/base/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,12 +376,20 @@ def _set_is_healthy_tolerant(
self._notify_users(notification_type, alert_settings)
return first_time

def _is_historical_data(self, time):
"""
Data older than 5 minutes is considered historical data.
"""
recent_time = timezone.now() - timedelta(minutes=5)
return time < recent_time

def check_threshold(self, value, time=None, retention_policy=None, send_alert=True):
"""Checks if the threshold is crossed and notifies users accordingly"""
try:
alert_settings = self.alertsettings
except ObjectDoesNotExist:
return

is_healthy_changed = self._set_is_healthy(alert_settings, value)
tolerance_healthy_changed_first_time = self._set_is_healthy_tolerant(
alert_settings, value, time, retention_policy, send_alert
Expand Down Expand Up @@ -444,7 +452,7 @@ def write(
)
# check can be disabled,
# mostly for automated testing and debugging purposes
if check:
if check and not self._is_historical_data(timestamp):
options["check_threshold_kwargs"] = {
"value": value,
"time": time,
Expand Down
19 changes: 14 additions & 5 deletions openwisp_monitoring/monitoring/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from django.db import IntegrityError
from django.test import TestCase
from django.utils import timezone
from freezegun import freeze_time
from swapper import load_model

from openwisp_utils.tests import catch_signal
Expand Down Expand Up @@ -355,25 +356,33 @@ def test_tolerance(self):
self._create_alert_settings(
metric=m, custom_operator=">", custom_threshold=90, custom_tolerance=5
)
with freeze_time(start_time):
m.write(99)
with self.subTest("within tolerance, no alerts expected"):
m.write(99, time=timezone.now() - timedelta(minutes=2))
with freeze_time(start_time + timedelta(minutes=2)):
m.write(99)
m.refresh_from_db(fields=["is_healthy", "is_healthy_tolerant"])
self.assertEqual(m.is_healthy, False)
self.assertEqual(m.is_healthy_tolerant, True)
self.assertEqual(Notification.objects.count(), 0)
m.write(99, time=timezone.now() - timedelta(minutes=4))
with freeze_time(start_time + timedelta(minutes=4)):
m.write(99)
m.refresh_from_db(fields=["is_healthy", "is_healthy_tolerant"])
self.assertEqual(m.is_healthy, False)
self.assertEqual(m.is_healthy_tolerant, True)
self.assertEqual(Notification.objects.count(), 0)
with self.subTest("tolerance trepassed, alerts expected"):
m.write(99, time=timezone.now() - timedelta(minutes=6))
with freeze_time(start_time + timedelta(minutes=6)):
m.write(99)
m.refresh_from_db(fields=["is_healthy", "is_healthy_tolerant"])
self.assertEqual(m.is_healthy, False)
self.assertEqual(m.is_healthy_tolerant, False)
self.assertEqual(Notification.objects.count(), 1)
with self.subTest("value back to normal, tolerance not considered"):
m.write(71, time=timezone.now() - timedelta(minutes=7))
with self.subTest("value back to normal"):
with freeze_time(start_time + timedelta(minutes=7)):
m.write(71)
with freeze_time(start_time + timedelta(minutes=12)):
m.write(71)
m.refresh_from_db(fields=["is_healthy", "is_healthy_tolerant"])
self.assertEqual(m.is_healthy, True)
self.assertEqual(m.is_healthy_tolerant, True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ def test_general_check_threshold_crossed_deferred(self):
self._create_alert_settings(
metric=m, custom_operator=">", custom_threshold=90, custom_tolerance=1
)
m.write(99, time=ten_minutes_ago)
with freeze_time(start_time):
m.write(99)
with freeze_time(ten_minutes_after):
m.write(99)
m.refresh_from_db()
self.assertEqual(m.is_healthy, False)
self.assertEqual(m.is_healthy_tolerant, False)
Expand Down Expand Up @@ -122,7 +125,8 @@ def test_general_check_threshold_crossed_for_long_time(self):
)

with self.subTest("Test no notification is generated for healthy status"):
m.write(89, time=ten_minutes_ago)
with freeze_time(ten_minutes_ago):
m.write(89)
m.refresh_from_db()
self.assertEqual(m.is_healthy, True)
self.assertEqual(m.is_healthy_tolerant, True)
Expand All @@ -133,7 +137,8 @@ def test_general_check_threshold_crossed_for_long_time(self):
# timeseries database to process the transaction.
self._read_metric(m)
with self.subTest("Test no notification is generated when check=False"):
m.write(91, time=ten_minutes_ago, check=False)
with freeze_time(ten_minutes_ago):
m.write(91, check=False)
self.assertEqual(Notification.objects.count(), 0)

self._read_metric(m)
Expand Down Expand Up @@ -241,7 +246,10 @@ def test_object_check_threshold_crossed_deferred(self):
alert_s = self._create_alert_settings(
metric=om, custom_operator=">", custom_threshold=90, custom_tolerance=1
)
om.write(99, time=ten_minutes_ago)
with freeze_time(start_time):
om.write(99)
with freeze_time(ten_minutes_after):
om.write(99)
om.refresh_from_db()
self.assertEqual(om.is_healthy, False)
self.assertEqual(om.is_healthy_tolerant, False)
Expand Down Expand Up @@ -271,9 +279,11 @@ def test_object_check_threshold_crossed_for_long_time(self):
alert_s = self._create_alert_settings(
metric=om, custom_operator=">", custom_threshold=90, custom_tolerance=1
)
self._write_metric(om, 89, time=ten_minutes_ago)
with freeze_time(ten_minutes_ago):
self._write_metric(om, 89)
self.assertEqual(Notification.objects.count(), 0)
self._write_metric(om, 91, time=ten_minutes_ago, check=False)
with freeze_time(ten_minutes_ago):
self._write_metric(om, 91, check=False)
self.assertEqual(Notification.objects.count(), 0)
self._write_metric(om, 92)
om.refresh_from_db()
Expand Down Expand Up @@ -319,6 +329,38 @@ def test_object_check_threshold_crossed_for_long_time(self):
self.assertEqual(om.is_healthy_tolerant, True)
self.assertEqual(Notification.objects.count(), 2)

@freeze_time(start_time)
def test_object_check_threshold_crossed_historical_data(self):
"""
Do not evaluate threshold crossed for historical data
"""
self._create_admin()
om = self._create_object_metric(name="load")
self._create_alert_settings(
metric=om, custom_operator=">", custom_threshold=90, custom_tolerance=1
)

# We need to write with "time" argument instead of freeze_time to
# simulate historical data
self._write_metric(om, 99, time=start_time - timedelta(minutes=60))
om.refresh_from_db()
self.assertEqual(om.is_healthy, True)
self.assertEqual(om.is_healthy_tolerant, True)
self.assertEqual(Notification.objects.count(), 0)

self._write_metric(om, 99, time=start_time - timedelta(minutes=10))
om.refresh_from_db()
self.assertEqual(om.is_healthy, True)
self.assertEqual(om.is_healthy_tolerant, True)
self.assertEqual(Notification.objects.count(), 0)

# Writing real-time data should enforce the threshold check
self._write_metric(om, 99, time=start_time)
om.refresh_from_db()
self.assertEqual(om.is_healthy, False)
self.assertEqual(om.is_healthy_tolerant, True)
self.assertEqual(Notification.objects.count(), 0)

def test_flapping_metric_with_tolerance(self):
self._create_admin()
om = self._create_object_metric(name="ping")
Expand Down Expand Up @@ -403,7 +445,10 @@ def test_alerts_disabled(self):
custom_tolerance=1,
is_active=False,
)
m.write(99, time=ten_minutes_ago)
with freeze_time(start_time):
m.write(99)
with freeze_time(ten_minutes_after):
m.write(99)
m.refresh_from_db()
self.assertEqual(m.is_healthy, False)
self.assertEqual(m.is_healthy_tolerant, False)
Expand Down Expand Up @@ -511,7 +556,10 @@ def test_general_check_threshold_with_alert_field_crossed_deferred(self):
self._create_alert_settings(
metric=m, custom_operator=">", custom_threshold=30, custom_tolerance=1
)
m.write(10, time=ten_minutes_ago, extra_values={"test_related_2": 35})
with freeze_time(start_time):
m.write(10, extra_values={"test_related_2": 35})
with freeze_time(ten_minutes_after):
m.write(10, extra_values={"test_related_2": 35})
m.refresh_from_db()
self.assertEqual(m.is_healthy, False)
self.assertEqual(m.is_healthy_tolerant, False)
Expand Down
Loading