Fixed Prometheus and added bunches of alerts
This commit is contained in:
@@ -1,14 +1,177 @@
---
# Prometheus alerting rules.
# Validate after editing with: promtool check rules <file>
groups:
  # NOTE(review): the original declared an empty "Uptime" group (a group
  # name with no rules key) immediately before this one; it has been
  # removed. If InstanceDown was meant to live under "Uptime" instead of
  # "Takahe", rename this group — confirm against the Alertmanager routes.
  - name: Takahe
    rules:
      # Fires when any target of the "services" scrape job stops answering
      # scrapes for 5 minutes.
      - alert: InstanceDown
        expr: up{job="services"} < 1
        for: 5m
        # severity label added for parity with every other rule in this
        # file; Alertmanager routing usually keys on it.
        labels:
          severity: critical
        annotations:
          summary: Instance down (instance {{ $labels.instance }})
          description: "Target {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
          source: 'https://monitor.pukeko.xyz'
- name: Usage
|
||||
|
||||
- alert: HostOutOfMemoryWarning
|
||||
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- name: Storage
|
||||
rules:
|
||||
- alert: HighRootFSDiskUsage
|
||||
expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
|
||||
for: 1m
|
||||
- alert: HighRedVolDiskUsage
|
||||
expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
|
||||
for: 1m
|
||||
- alert: HighFilesystemUsage
|
||||
expr: 100 * (1 - (btrfs_filesystem_free / btrfs_filesystem_size)) > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High Btrfs filesystem usage on {{ $labels.instance }}"
|
||||
description: "Btrfs filesystem usage is above 90% on {{ $labels.instance }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: HighRootFSDiskUsage
|
||||
expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
|
||||
for: 1m
|
||||
labels:
|
||||
severity: error
|
||||
annotations:
|
||||
summary: "Low disk space on Takahe"
|
||||
description: "root disk is filling up on {{ $labels.instance }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: HighRedVolDiskUsage
|
||||
expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
|
||||
for: 1m
|
||||
labels:
|
||||
severity: error
|
||||
annotations:
|
||||
summary: "Low disk space on Red-Vol"
|
||||
description: "Red-Vol is filling up on {{ $labels.instance }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: DegradedBtrfsRAID
|
||||
expr: btrfs_raid_status == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Degraded Btrfs RAID on {{ $labels.instance }}"
|
||||
description: "The Btrfs RAID array is in a degraded state on {{ $labels.instance }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: BtrfsRAIDScrubFailed
|
||||
expr: btrfs_raid_scrub_status == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Btrfs RAID scrub failed on {{ $labels.instance }}"
|
||||
description: "The Btrfs RAID scrub failed on {{ $labels.instance }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- name: Docker
|
||||
rules:
|
||||
|
||||
- alert: ContainerHighCpuUtilization
|
||||
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container High CPU utilization (instance {{ $labels.instance }})
|
||||
description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: ContainerHighMemoryUsage
|
||||
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container High Memory usage (instance {{ $labels.instance }})
|
||||
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: ContainerUpdateStatus
|
||||
expr: |
|
||||
increase(container_updated[1m]) > 0 and container_updated == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: "Container update status"
|
||||
description: "The container was successfully updated."
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: NewContainerUpdate
|
||||
expr: |
|
||||
container_updated{job="takahe", name=~".+"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "New container update detected"
|
||||
description: "A new container ('{{ $labels.name }}') was successfully updated."
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: ContainerUpdateFailure
|
||||
expr: |
|
||||
container_updated == 0 or container_updated == -1
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Container update failed"
|
||||
description: "The container update metric indicates a failure. Check logs for details."
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
|
||||
- name: Backups
|
||||
rules:
|
||||
- alert: KumonoboruFailure
|
||||
expr: |
|
||||
up{job="systemd", unit="Kumonoboru", state="failed"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Service failure detected in {{ $labels.instance }}"
|
||||
description: "The service '{{ $labels.instance }}' has a failed status"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
- alert: KumonoboruTimerFailure
|
||||
expr: |
|
||||
up{job="systemd", unit="Kumonoboru", state="failed"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Timer failure detected in {{ $labels.instance }}"
|
||||
description: "The timer '{{ $labels.instance }}' has a failed status"
|
||||
source: 'https://monitor.pukeko.xyz'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user