From 84efd0976d38897b1896ac1067ea49f78c41443a Mon Sep 17 00:00:00 2001
From: Raito Bezarius
Date: Fri, 9 Aug 2024 16:05:19 +0200
Subject: [PATCH] feat(alerts): add a sync failed too often alert

Signed-off-by: Raito Bezarius
---
 services/monitoring/lgtm/alerts/forkos.yaml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 services/monitoring/lgtm/alerts/forkos.yaml

diff --git a/services/monitoring/lgtm/alerts/forkos.yaml b/services/monitoring/lgtm/alerts/forkos.yaml
new file mode 100644
index 0000000..d48318a
--- /dev/null
+++ b/services/monitoring/lgtm/alerts/forkos.yaml
@@ -0,0 +1,15 @@
+groups:
+  - name: ForkOS automation
+    rules:
+      # Fires when the "failed" state series of a unit whose name matches
+      # ows.*\.service changed more than twice within 24h, sustained for 30m.
+      # NOTE(review): changes() counts every value change, i.e. flips both into
+      # and out of "failed" - confirm "> 2" matches "failed more than twice".
+      - alert: SyncFailedTooOften
+        expr: 'changes(node_systemd_unit_state{name=~"ows.*\\.service",state="failed"}[24h]) > 2'
+        for: 30m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Synchronization job {{ $labels.name }} has failed more than twice in the last 24 hours"
+          description: "On {{ $labels.instance }}, the synchronization job has failed more than twice in the last 24 hours, check if there's a conflict or a stdenv change."