From d6e22b8c6b21d18319a7c40a23ca86c9e2b81b65 Mon Sep 17 00:00:00 2001
From: Matan Horovitz
Date: Fri, 17 Nov 2023 17:01:38 +0200
Subject: [PATCH] More container alerts

Add the ContainerFailure, ContainerRestartRate and ContainerHighPacketLoss
alert rules.

---
Review notes (this section is ignored by `git am`):
- The hunk's indentation was reconstructed from a whitespace-collapsed copy
  of the patch; check it against config/alerts.yml before applying.
- The rule expressions were corrected during review, so the post-image blob
  id on the `index` line (3b5a4da) no longer matches the patched file.

 config/alerts.yml | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/config/alerts.yml b/config/alerts.yml
index efdadf8..3b5a4da 100755
--- a/config/alerts.yml
+++ b/config/alerts.yml
@@ -150,6 +150,42 @@ groups:
           description: "The container update metric indicates a failure. Check logs for details."
           source: 'https://monitor.pukeko.xyz'
 
+      # container_last_seen (cAdvisor) is a last-seen Unix timestamp, so an
+      # `== 0` test never matches; alert on the timestamp's age instead.
+      - alert: ContainerFailure
+        expr: time() - container_last_seen > 60
+        for: 1h
+        labels:
+          severity: critical
+        annotations:
+          summary: "Container failure on {{ $labels.instance }}"
+          description: "No data received from a container for the last hour on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+
+      # rate() returns a per-second value; scale by 60 so the threshold is in
+      # restarts per minute, the unit the description states.
+      # NOTE(review): confirm which exporter provides container_restart_total.
+      - alert: ContainerRestartRate
+        expr: rate(container_restart_total[5m]) * 60 > 0.2
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High container restart rate on {{ $labels.instance }}"
+          description: "Container restart rate is above 0.2 restarts per minute on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+
+      # rate() returns a per-second value; scale by 60 so the threshold is in
+      # errors per minute, the unit the description states.
+      - alert: ContainerHighPacketLoss
+        expr: rate(container_network_receive_errors_total[5m]) * 60 > 0.1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High packet loss on container network ({{ $labels.name }})"
+          description: "Packet loss rate is above 0.1 errors per minute on {{ $labels.name }}"
+          source: 'https://monitor.pukeko.xyz'
 
   - name: Backups
     rules: