Fixed Prometheus and added bunches of alerts

2023-11-17 16:50:39 +02:00
parent b875c663c9
commit 3d90e52a15
5 changed files with 242 additions and 70 deletions
--- a/config/alerts.yml
+++ b/config/alerts.yml
@@ -1,14 +1,177 @@
 groups:
-  - name: Uptime
+
+  - name: Takahe
    rules:
-      - alert: InstanceDown 
-        expr: up{job="services"} < 1 
-        for: 5m 
-  - name: Usage
+
+      - alert: HostOutOfMemoryWarning  # early warning; fires before HostOutOfMemory (< 10%)
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m  # condition must hold for 2 minutes before the alert fires
+        labels:
+          severity: warning
+        annotations:  # the group_left join above adds nodename to $labels
+          summary: 'Host running low on memory (instance {{ $labels.instance }})'
+          description: "Node memory is filling up (< 20% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: HostOutOfMemory  # available memory is below 10% of total memory
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m  # must stay below the threshold for 2 minutes
+        labels:
+          severity: warning
+        annotations:  # nodename is available in $labels via the group_left join
+          source: 'https://monitor.pukeko.xyz'
+          summary: 'Host out of memory (instance {{ $labels.instance }})'
+          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: HostOomKillDetected  # node_vmstat_oom_kill increased within the last minute
+        expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m  # no hold time: fires on the first evaluation that matches
+        labels:
+          severity: warning
+        annotations:  # nodename is available in $labels via the group_left join
+          source: 'https://monitor.pukeko.xyz'
+          summary: 'Host OOM kill detected (instance {{ $labels.instance }})'
+          description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+      - alert: HostOutOfMemory  # critical tier (< 5%); was a verbatim copy of the 10% rule
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m  # condition must hold for 2 minutes before the alert fires
+        labels:
+          severity: critical  # distinguishes this tier from the 10% rule of the same name
+        annotations:
+          summary: 'Host out of memory (instance {{ $labels.instance }})'
+          description: "Node memory is almost exhausted (< 5% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+
+  - name: Storage
    rules:
-      - alert: HighRootFSDiskUsage
-        expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
-        for: 1m
-      - alert: HighRedVolDiskUsage
-        expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
-        for: 1m
+    - alert: HighFilesystemUsage  # used share of the Btrfs filesystem exceeds 90%
+      expr: 100 * (1 - (btrfs_filesystem_free / btrfs_filesystem_size)) > 90
+      for: 5m  # must stay above the threshold for 5 minutes
+      labels:
+        severity: warning
+      annotations:
+        summary: 'High Btrfs filesystem usage on {{ $labels.instance }}'
+        description: 'Btrfs filesystem usage is above 90% on {{ $labels.instance }}'
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: HighRootFSDiskUsage  # used space on the "/" mount exceeds 80%
+      expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
+      for: 1m  # must stay above the threshold for 1 minute
+      labels:
+        severity: error
+      annotations:
+        summary: 'Low disk space on Takahe'
+        description: 'root disk is filling up on {{ $labels.instance }}'
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: HighRedVolDiskUsage  # used space on the "/Red-Vol" mount exceeds 70%
+      expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
+      for: 1m  # must stay above the threshold for 1 minute
+      labels:
+        severity: error
+      annotations:
+        summary: 'Low disk space on Red-Vol'
+        description: 'Red-Vol is filling up on {{ $labels.instance }}'
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: DegradedBtrfsRAID  # btrfs_raid_status == 1 is reported as "degraded" below
+      expr: btrfs_raid_status == 1
+      for: 5m  # must hold for 5 minutes before the alert fires
+      labels:
+        severity: critical
+      annotations:
+        summary: 'Degraded Btrfs RAID on {{ $labels.instance }}'
+        description: 'The Btrfs RAID array is in a degraded state on {{ $labels.instance }}'
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: BtrfsRAIDScrubFailed  # btrfs_raid_scrub_status == 1 is reported as "failed" below
+      expr: btrfs_raid_scrub_status == 1
+      for: 5m  # must hold for 5 minutes before the alert fires
+      labels:
+        severity: warning
+      annotations:
+        summary: 'Btrfs RAID scrub failed on {{ $labels.instance }}'
+        description: 'The Btrfs RAID scrub failed on {{ $labels.instance }}'
+        source: 'https://monitor.pukeko.xyz'
+
+  - name: Docker
+    rules:
+
+    - alert: ContainerHighCpuUtilization  # 3m CPU rate summed per (instance, name), as a percentage
+      expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
+      for: 2m  # must stay above 80 for 2 minutes
+      labels:
+        severity: warning
+      annotations:  # only instance and name survive the sum, so $labels holds just those
+        summary: 'Container High CPU utilization (instance {{ $labels.instance }})'
+        description: "Container CPU utilization is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: ContainerHighMemoryUsage  # working set as a percentage of the memory limit, per (instance, name)
+      expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
+      for: 2m  # must stay above 80 for 2 minutes
+      labels:
+        severity: warning
+      annotations:  # "> 0" in the divisor drops series whose limit is 0
+        summary: 'Container High Memory usage (instance {{ $labels.instance }})'
+        description: "Container Memory usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: ContainerUpdateStatus  # container_updated rose in the last minute and now equals 1
+      expr: |
+        increase(container_updated[1m]) > 0 and container_updated == 1
+      for: 1m  # must hold for 1 minute before the alert fires
+      labels:
+        severity: info
+      annotations:
+        summary: 'Container update status'
+        description: 'The container was successfully updated.'
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: NewContainerUpdate  # job "takahe" series with a non-empty name whose value is 1
+      expr: |
+        container_updated{job="takahe", name=~".+"} == 1
+      for: 1m  # must hold for 1 minute before the alert fires
+      labels:
+        severity: warning
+      annotations:  # description stays double-quoted because it contains single quotes
+        summary: 'New container update detected'
+        description: "A new container ('{{ $labels.name }}') was successfully updated."
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: ContainerUpdateFailure  # no `for:` clause, so it fires on the first matching evaluation
+      expr: |
+        container_updated == 0 or container_updated == -1
+      labels:
+        severity: critical
+      annotations:  # assumes container_updated carries a `name` label, as NewContainerUpdate selects on one
+        summary: "Container update failed ('{{ $labels.name }}' on {{ $labels.instance }})"
+        description: "container_updated is {{ $value }} for '{{ $labels.name }}', which indicates a failure. Check logs for details."
+        source: 'https://monitor.pukeko.xyz'
+
+
+  - name: Backups
+    rules:
+    - alert: KumonoboruFailure  # NOTE(review): `up` normally has only target labels; confirm unit/state exist on it
+      expr: |
+        up{job="systemd", unit="Kumonoboru", state="failed"} == 1
+      for: 5m  # must hold for 5 minutes before the alert fires
+      labels:
+        severity: warning
+      annotations:  # name the unit the expression selects on, not the scrape instance
+        summary: "Service failure detected in {{ $labels.instance }}"
+        description: "The service '{{ $labels.unit }}' has a failed status on {{ $labels.instance }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: KumonoboruTimerFailure  # NOTE(review): assumes the timer's unit label is "Kumonoboru.timer" - confirm
+      expr: |
+        up{job="systemd", unit="Kumonoboru.timer", state="failed"} == 1
+      for: 5m  # must hold for 5 minutes before the alert fires
+      labels:
+        severity: warning
+      annotations:  # name the unit the expression selects on, not the scrape instance
+        summary: "Timer failure detected in {{ $labels.instance }}"
+        description: "The timer '{{ $labels.unit }}' has a failed status on {{ $labels.instance }}"
+        source: 'https://monitor.pukeko.xyz'
+