infra/services/monitoring/lgtm/alerts/forkos.yaml
raito 84efd0976d feat(alerts): add a sync failed too often alert
Signed-off-by: Raito Bezarius <masterancpp@gmail.com>
2024-08-09 16:25:34 +02:00

12 lines
541 B
YAML

# Prometheus alerting rules for ForkOS automation.
groups:
  - name: ForkOS automation
    rules:
      # Raised when the "failed" state series of a matching systemd unit
      # changed value more than twice over the trailing 24 hours, and that
      # condition has held for 30 minutes.
      - alert: SyncFailedTooOften
        # The dot before "service" is escaped so it only matches the literal
        # ".service" suffix instead of any character. The backslash is doubled
        # because PromQL double-quoted strings treat "\" as an escape
        # character; YAML single quotes pass it through untouched.
        #
        # NOTE(review): changes() counts every change of the sampled value, in
        # either direction (into and out of "failed"), so "> 2" counts state
        # flips rather than distinct failures -- confirm the threshold matches
        # the "more than twice" wording of the annotations.
        expr: 'changes(node_systemd_unit_state{name=~"ows.*\\.service",state="failed"}[24h]) > 2'
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: >-
            Synchronization job {{ $labels.name }} has failed more than twice
            in the last 24 hours
          description: >-
            On {{ $labels.instance }}, the synchronization job has failed more
            than twice in the last 24 hours, check if there's a conflict or a
            stdenv change.