infra/services/monitoring/lgtm/alerts/forkos.yaml

13 lines
574 B
YAML
Raw Normal View History

# Alerting rules for the `forkos` rule namespace.
namespace: forkos
groups:
  - name: ForkOS automation
    rules:
      # Fires when the "failed" state series of a matching systemd unit has
      # changed value more than twice over the last day, and the condition
      # has held for 30 minutes.
      - alert: SyncFailedTooOften
        # The dot before "service" is escaped so it matches a literal "."
        # rather than any character. The backslash is doubled because the
        # matcher is a double-quoted PromQL string; the surrounding YAML
        # single quotes pass both backslashes through unchanged.
        # NOTE(review): changes() counts every value change, so one
        # fail-then-recover cycle presumably counts as two changes; confirm
        # that `> 2` really corresponds to "more than twice".
        expr: 'changes(node_systemd_unit_state{name=~"ows.*\\.service",state="failed"}[1d]) > 2'
        for: 30m
        labels:
          severity: critical
        annotations:
          # Folded scalar: the lines below are joined with single spaces into
          # one line, with no trailing newline.
          description: >-
            On {{ $labels.instance }}, the synchronization job has failed more
            than twice in the last 24 hours, check if there's a conflict or a
            stdenv change.
          summary: 'Synchronization job {{ $labels.name }} has failed more than twice in the last 24 hours'