---
# Prometheus alerting rules, organised into four groups:
#   Takahe  - host memory alerts
#   Storage - filesystem usage and Btrfs RAID status
#   Docker  - container memory, update, restart and network-error alerts
#   Backups - Kumonoboru unit state and system_backup status codes
#
# Every alert carries a `source` annotation pointing at the monitoring site.
groups:
  - name: Takahe
    rules:
      # NOTE(review): HostOutOfMemoryWarning and HostOutOfMemory below share
      # the same expression, threshold and severity, so they always fire
      # together. Both names are kept in case routing depends on them;
      # consider giving one a distinct threshold/severity or dropping it.
      - alert: HostOutOfMemoryWarning
        # Fires when less than 10% of memory is available; the join with
        # node_uname_info attaches the `nodename` label to the result.
        expr: >-
          (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10)
          * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Host out of memory (instance {{ $labels.instance }})"
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          source: 'https://monitor.pukeko.xyz'

      - alert: HostOutOfMemory
        expr: >-
          (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10)
          * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Host out of memory (instance {{ $labels.instance }})"
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          source: 'https://monitor.pukeko.xyz'

      - alert: HostOomKillDetected
        # `for: 0m` makes this fire on the first evaluation that sees an
        # increase in the OOM-kill counter over the last minute.
        expr: >-
          (increase(node_vmstat_oom_kill[1m]) > 0)
          * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Host OOM kill detected (instance {{ $labels.instance }})"
          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          source: 'https://monitor.pukeko.xyz'

      # A second, byte-for-byte identical HostOutOfMemory rule used to follow
      # here; it was removed because duplicate rules in one group add nothing.

  - name: Storage
    rules:
      # NOTE(review): this group uses `severity: error`, while the other
      # groups only use info/warning/critical - confirm Alertmanager routes
      # handle `error`.
      - alert: HighFilesystemUsage
        # Used percentage derived from free/size; threshold 90%.
        expr: 100 * (1 - (btrfs_filesystem_free / btrfs_filesystem_size)) > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Btrfs filesystem usage on {{ $labels.instance }}"
          description: "Btrfs filesystem usage is above 90% on {{ $labels.instance }}"
          source: 'https://monitor.pukeko.xyz'

      - alert: HighRootFSDiskUsage
        # Used percentage of the filesystem mounted at "/"; threshold 80%.
        expr: >-
          100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100)
          / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "Low disk space on Takahe"
          description: "root disk is filling up on {{ $labels.instance }}"
          source: 'https://monitor.pukeko.xyz'

      - alert: HighRedVolDiskUsage
        # Used percentage of the filesystem mounted at "/Red-Vol"; threshold 70%.
        expr: >-
          100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100)
          / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "Low disk space on Red-Vol"
          description: "Red-Vol is filling up on {{ $labels.instance }}"
          source: 'https://monitor.pukeko.xyz'

      - alert: DegradedBtrfsRAID
        expr: btrfs_raid_status == 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Degraded Btrfs RAID on {{ $labels.instance }}"
          description: "The Btrfs RAID array is in a degraded state on {{ $labels.instance }}"
          source: 'https://monitor.pukeko.xyz'

      - alert: BtrfsRAIDScrubFailed
        expr: btrfs_raid_scrub_status == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Btrfs RAID scrub failed on {{ $labels.instance }}"
          description: "The Btrfs RAID scrub failed on {{ $labels.instance }}"
          source: 'https://monitor.pukeko.xyz'

  - name: Docker
    rules:
      - alert: ContainerHighMemoryUsage
        # Working-set bytes as a percentage of the memory limit, per
        # (instance, name); series with a limit of 0 are filtered out.
        expr: >-
          (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name)
          / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100)
          > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Container High Memory usage (instance {{ $labels.instance }})"
          description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
          source: 'https://monitor.pukeko.xyz'

      - alert: ContainerUpdateStatus
        expr: increase(container_updated[1m]) > 0 and container_updated == 1
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "Container update status"
          description: "The container was successfully updated."
          source: 'https://monitor.pukeko.xyz'

      - alert: NewContainerUpdate
        expr: container_updated{job="takahe", name=~".+"} == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "New container update detected"
          description: "A new container ('{{ $labels.name }}') was successfully updated."
          source: 'https://monitor.pukeko.xyz'

      - alert: ContainerUpdateFailure
        # No `for:` clause, so this fires on the first matching evaluation.
        expr: container_updated == 0 or container_updated == -1
        labels:
          severity: critical
        annotations:
          summary: "Container update failed"
          description: "The container update metric indicates a failure. Check logs for details."
          source: 'https://monitor.pukeko.xyz'

      - alert: ContainerFailure
        # NOTE(review): if container_last_seen is the cAdvisor metric it is a
        # Unix timestamp and will never equal 0; a staleness check such as
        # `time() - container_last_seen > <seconds>` may have been intended.
        # Left unchanged - confirm against the exporter in use.
        expr: container_last_seen == 0
        for: 1h
        labels:
          severity: critical
        annotations:
          summary: "Container failure on {{ $labels.instance }}"
          description: "No data received from a container for the last hour on {{ $labels.instance }}"
          source: 'https://monitor.pukeko.xyz'

      - alert: ContainerRestartRate
        # rate() yields a per-second average over the 5m window, so the
        # threshold is 0.2 restarts per second. Multiply by 60 in the
        # expression if a per-minute threshold is what is wanted.
        expr: rate(container_restart_total[5m]) > 0.2
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High container restart rate on {{ $labels.instance }}"
          description: "Container restart rate is above 0.2 restarts per second on {{ $labels.instance }}"
          source: 'https://monitor.pukeko.xyz'

      - alert: ContainerHighPacketLoss
        # rate() is per second here as well: threshold is 0.1 receive
        # errors per second.
        expr: rate(container_network_receive_errors_total[5m]) > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High packet loss on container network ({{ $labels.name }})"
          description: "Packet loss rate is above 0.1 errors per second on {{ $labels.name }}"
          source: 'https://monitor.pukeko.xyz'

  - name: Backups
    rules:
      # NOTE(review): KumonoboruFailure and KumonoboruTimerFailure use the
      # exact same expression, so they cannot distinguish the service from
      # its timer. `up` also normally only carries target labels - confirm
      # that `unit` and `state` exist on it, or switch to the exporter's
      # unit-state metric with distinct unit names for service and timer.
      - alert: KumonoboruFailure
        expr: up{job="systemd", unit="Kumonoboru", state="failed"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Service failure detected in {{ $labels.instance }}"
          description: "The service '{{ $labels.instance }}' has a failed status"
          source: 'https://monitor.pukeko.xyz'

      - alert: KumonoboruTimerFailure
        expr: up{job="systemd", unit="Kumonoboru", state="failed"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Timer failure detected in {{ $labels.instance }}"
          description: "The timer '{{ $labels.instance }}' has a failed status"
          source: 'https://monitor.pukeko.xyz'

      # The rules below map individual system_backup values to alerts:
      #    1 backup failed        0 backup succeeded
      #   -1 unlock failed
      #   -2 integrity failed     2 integrity check passed
      #   -3 cleanup failed       3 cleanup completed
      # None of them has a `for:` clause, so each fires on the first
      # evaluation that sees the value.
      - alert: SystemBackupFailed
        expr: system_backup == 1
        labels:
          severity: critical
        annotations:
          summary: "System Backup has failed"
          description: "The repository {{ $labels.instance }} has failed the backup process"
          source: 'https://monitor.pukeko.xyz'

      - alert: SystemBackupUnlockFailed
        expr: system_backup == -1
        labels:
          severity: critical
        annotations:
          summary: "System Backup cannot proceed"
          description: "The repository {{ $labels.instance }} has failed to unlock. Backup cannot be performed"
          source: 'https://monitor.pukeko.xyz'

      - alert: SystemBackupIntegrityFailed
        expr: system_backup == -2
        labels:
          severity: critical
        annotations:
          summary: "System Backup contains errors"
          description: "The repository {{ $labels.instance }} has failed an integrity check. Backup data may be corrupted"
          source: 'https://monitor.pukeko.xyz'

      - alert: SystemBackupCleaningFailed
        expr: system_backup == -3
        labels:
          severity: critical
        annotations:
          summary: "System Backup cannot be cleaned"
          description: "The repository {{ $labels.instance }} has failed the cleanup process. Backup may contain excess data"
          source: 'https://monitor.pukeko.xyz'

      - alert: SystemBackupSucceeded
        expr: system_backup == 0
        labels:
          severity: info
        annotations:
          summary: "System Backup has succeeded"
          description: "The repository {{ $labels.instance }} has successfully completed the backup process"
          source: 'https://monitor.pukeko.xyz'

      - alert: SystemBackupIntegritySucceeded
        expr: system_backup == 2
        labels:
          severity: info
        annotations:
          summary: "System Backup has no errors"
          description: "The repository {{ $labels.instance }} passed the integrity check"
          source: 'https://monitor.pukeko.xyz'

      - alert: SystemBackupCleaningSucceeded
        expr: system_backup == 3
        labels:
          severity: info
        annotations:
          summary: "System Backup has been cleaned"
          description: "The repository {{ $labels.instance }} has completed the cleanup process"
          source: 'https://monitor.pukeko.xyz'