# docker/config/alerts.yml

groups:

  # Host memory alerts. Every expression is joined on `instance` with
  # node_uname_info (group_left nodename), so firing alerts also carry the
  # `nodename` label.
  - name: Takahe
    rules:
    # Fires when available memory has been below 10% of total for 2 minutes.
    - alert: HostOutOfMemoryWarning
      expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host out of memory (instance {{ $labels.instance }})
        description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        source: 'https://monitor.pukeko.xyz'

    # NOTE(review): this rule has the same expression, duration and severity as
    # HostOutOfMemoryWarning, so both fire together. Presumably one of them was
    # meant to use a stricter threshold or a higher severity — confirm.
    - alert: HostOutOfMemory
      expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host out of memory (instance {{ $labels.instance }})
        description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        source: 'https://monitor.pukeko.xyz'

    # Fires immediately (for: 0m) when node_vmstat_oom_kill increased within
    # the last minute.
    - alert: HostOomKillDetected
      expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host OOM kill detected (instance {{ $labels.instance }})
        description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
        source: 'https://monitor.pukeko.xyz'

  - name: Storage
    rules:
    # Warns when the used share of the Btrfs filesystem, derived from
    # btrfs_filesystem_free and btrfs_filesystem_size, stays above 90% for
    # 5 minutes.
    - alert: HighFilesystemUsage
      expr: '100 * (1 - (btrfs_filesystem_free / btrfs_filesystem_size)) > 90'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'High Btrfs filesystem usage on {{ $labels.instance }}'
        description: 'Btrfs filesystem usage is above 90% on {{ $labels.instance }}'
        source: "https://monitor.pukeko.xyz"

    # Fires when the filesystem mounted at "/" (fstype other than "rootfs") has
    # had less than 20% of its size available for 1 minute.
    # NOTE(review): "error" is used as a severity only by the disk-usage rules;
    # the other rules use info/warning/critical — confirm routing handles it.
    - alert: HighRootFSDiskUsage
      expr: >-
        100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100)
        / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
      for: 1m
      labels:
        severity: error
      annotations:
        summary: 'Low disk space on Takahe'
        description: 'root disk is filling up on {{ $labels.instance }}'
        source: "https://monitor.pukeko.xyz"

    # Fires when the filesystem mounted at "/Red-Vol" (fstype other than
    # "rootfs") has had less than 30% of its size available for 1 minute.
    # NOTE(review): "error" is used as a severity only by the disk-usage rules;
    # the other rules use info/warning/critical — confirm routing handles it.
    - alert: HighRedVolDiskUsage
      expr: >-
        100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100)
        / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
      for: 1m
      labels:
        severity: error
      annotations:
        summary: 'Low disk space on Red-Vol'
        description: 'Red-Vol is filling up on {{ $labels.instance }}'
        source: "https://monitor.pukeko.xyz"

    # Critical alert raised once btrfs_raid_status has equalled 1 for 5 minutes.
    - alert: DegradedBtrfsRAID
      expr: 'btrfs_raid_status == 1'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'Degraded Btrfs RAID on {{ $labels.instance }}'
        description: 'The Btrfs RAID array is in a degraded state on {{ $labels.instance }}'
        source: "https://monitor.pukeko.xyz"

    # Warning raised once btrfs_raid_scrub_status has equalled 1 for 5 minutes.
    - alert: BtrfsRAIDScrubFailed
      expr: 'btrfs_raid_scrub_status == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Btrfs RAID scrub failed on {{ $labels.instance }}'
        description: 'The Btrfs RAID scrub failed on {{ $labels.instance }}'
        source: "https://monitor.pukeko.xyz"

  - name: Docker
    rules:

    # Per (instance, name): working-set bytes as a percentage of the positive
    # memory limit. Fires after the ratio has stayed above 80% for 2 minutes.
    - alert: ContainerHighMemoryUsage
      expr: >-
        (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name)
        / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'Container High Memory usage (instance {{ $labels.instance }})'
        description: |-
          Container Memory usage is above 80%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
        source: "https://monitor.pukeko.xyz"

    # Informational: container_updated increased within the last minute and
    # currently equals 1; the condition must hold for 1 minute before firing.
    - alert: ContainerUpdateStatus
      expr: |
        increase(container_updated[1m]) > 0 and container_updated == 1
      for: 1m
      labels:
        severity: info
      annotations:
        summary: 'Container update status'
        description: 'The container was successfully updated.'
        source: "https://monitor.pukeko.xyz"

    # Fires when container_updated for job "takahe" (any non-empty `name`
    # label) has equalled 1 for 1 minute; the description names the container.
    - alert: NewContainerUpdate
      expr: |
        container_updated{job="takahe", name=~".+"} == 1
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: 'New container update detected'
        description: "A new container ('{{ $labels.name }}') was successfully updated."
        source: "https://monitor.pukeko.xyz"

    # Critical alert with no `for:` clause: raised whenever container_updated
    # equals 0 or -1.
    # NOTE(review): 0 is treated as a failure value here — confirm the exporter
    # does not also report 0 for "no update attempted".
    - alert: ContainerUpdateFailure
      expr: |
        container_updated == 0 or container_updated == -1
      labels:
        severity: critical
      annotations:
        summary: 'Container update failed'
        description: 'The container update metric indicates a failure. Check logs for details.'
        source: "https://monitor.pukeko.xyz"

    # Critical alert raised once container_last_seen has equalled 0 for 1 hour.
    # NOTE(review): if container_last_seen is cAdvisor's metric it holds a Unix
    # timestamp and would presumably never equal 0 — confirm this rule can fire.
    - alert: ContainerFailure
      expr: 'container_last_seen == 0'
      for: 1h
      labels:
        severity: critical
      annotations:
        summary: 'Container failure on {{ $labels.instance }}'
        description: 'No data received from a container for the last hour on {{ $labels.instance }}'
        source: "https://monitor.pukeko.xyz"

    # Fires when the restart rate, averaged over a 5-minute window, has stayed
    # above 0.2 for 10 minutes. rate() returns a per-second value, so the
    # description states the threshold in restarts per second.
    # NOTE(review): if 0.2 restarts per minute was the intended threshold,
    # multiply the rate by 60 in the expression — confirm.
    - alert: ContainerRestartRate
      expr: rate(container_restart_total[5m]) > 0.2
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: "High container restart rate on {{ $labels.instance }}"
        description: "Container restart rate is above 0.2 restarts per second on {{ $labels.instance }}"
        source: 'https://monitor.pukeko.xyz'

    # Fires when the receive-error rate, averaged over a 5-minute window, has
    # stayed above 0.1 for 10 minutes. rate() returns a per-second value, so the
    # description states the threshold in errors per second.
    # NOTE(review): if 0.1 errors per minute was the intended threshold,
    # multiply the rate by 60 in the expression — confirm.
    - alert: ContainerHighPacketLoss
      expr: rate(container_network_receive_errors_total[5m]) > 0.1
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "High packet loss on container network ({{ $labels.name }})"
        description: "Packet loss rate is above 0.1 errors per second on {{ $labels.name }}"
        source: 'https://monitor.pukeko.xyz'


  - name: Backups
    rules:
    # Fires when the selected series (unit "Kumonoboru", state "failed") has
    # equalled 1 for 5 minutes. The description names the unit via the `unit`
    # label, which the selector guarantees is present, and the host via
    # `instance`.
    # NOTE(review): `up` is normally Prometheus's per-target scrape-health
    # series; confirm this setup really attaches `unit` and `state` labels to
    # it, otherwise the rule cannot match.
    - alert: KumonoboruFailure
      expr: |
        up{job="systemd", unit="Kumonoboru", state="failed"} == 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Service failure detected in {{ $labels.instance }}"
        description: "The service '{{ $labels.unit }}' on {{ $labels.instance }} has a failed status"
        source: 'https://monitor.pukeko.xyz'

    # Fires when the selected series (unit "Kumonoboru", state "failed") has
    # equalled 1 for 5 minutes. The description names the unit via the `unit`
    # label, which the selector guarantees is present, and the host via
    # `instance`.
    # NOTE(review): this expression is identical to KumonoboruFailure's, so it
    # does not distinguish the timer from the service; presumably it should
    # select the timer unit — confirm the correct label value.
    # NOTE(review): `up` is normally Prometheus's per-target scrape-health
    # series; confirm it really carries `unit` and `state` labels here.
    - alert: KumonoboruTimerFailure
      expr: |
        up{job="systemd", unit="Kumonoboru", state="failed"} == 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Timer failure detected in {{ $labels.instance }}"
        description: "The timer '{{ $labels.unit }}' on {{ $labels.instance }} has a failed status"
        source: 'https://monitor.pukeko.xyz'

    # Critical alert with no `for:` clause: raised whenever system_backup
    # equals 1. Carries the same `source` annotation as the rules in the other
    # groups.
    - alert: SystemBackupFailed
      expr: |
        system_backup == 1
      labels:
        severity: critical
      annotations:
        summary: "System Backup has failed"
        description: "The repository {{ $labels.instance }} has failed the backup process"
        source: 'https://monitor.pukeko.xyz'

    # Critical alert with no `for:` clause: raised whenever system_backup
    # equals -1. Carries the same `source` annotation as the rules in the other
    # groups.
    - alert: SystemBackupUnlockFailed
      expr: |
        system_backup == -1
      labels:
        severity: critical
      annotations:
        summary: "System Backup cannot proceed"
        description: "The repository {{ $labels.instance }} has failed to unlock. Backup cannot be performed"
        source: 'https://monitor.pukeko.xyz'

    # Critical alert with no `for:` clause: raised whenever system_backup
    # equals -2. Carries the same `source` annotation as the rules in the other
    # groups.
    - alert: SystemBackupIntegrityFailed
      expr: |
        system_backup == -2
      labels:
        severity: critical
      annotations:
        summary: "System Backup contains errors"
        description: "The repository {{ $labels.instance }} has failed an integrity check. Backup data may be corrupted"
        source: 'https://monitor.pukeko.xyz'

    # Critical alert with no `for:` clause: raised whenever system_backup
    # equals -3. Carries the same `source` annotation as the rules in the other
    # groups.
    - alert: SystemBackupCleaningFailed
      expr: |
        system_backup == -3
      labels:
        severity: critical
      annotations:
        summary: "System Backup cannot be cleaned"
        description: "The repository {{ $labels.instance }} has failed the cleanup process. Backup may contain excess data"
        source: 'https://monitor.pukeko.xyz'

    # Informational alert with no `for:` clause: raised whenever system_backup
    # equals 0. Carries the same `source` annotation as the rules in the other
    # groups.
    - alert: SystemBackupSucceeded
      expr: |
        system_backup == 0
      labels:
        severity: info
      annotations:
        summary: "System Backup has succeeded"
        description: "The repository {{ $labels.instance }} has successfully completed the backup process"
        source: 'https://monitor.pukeko.xyz'

    # Informational alert with no `for:` clause: raised whenever system_backup
    # equals 2. Carries the same `source` annotation as the rules in the other
    # groups.
    - alert: SystemBackupIntegritySucceeded
      expr: |
        system_backup == 2
      labels:
        severity: info
      annotations:
        summary: "System Backup has no errors"
        description: "The repository {{ $labels.instance }} passed the integrity check"
        source: 'https://monitor.pukeko.xyz'

    # Informational alert with no `for:` clause: raised whenever system_backup
    # equals 3. Carries the same `source` annotation as the rules in the other
    # groups.
    - alert: SystemBackupCleaningSucceeded
      expr: |
        system_backup == 3
      labels:
        severity: info
      annotations:
        summary: "System Backup has been cleaned"
        description: "The repository {{ $labels.instance }} has completed the cleanup process"
        source: 'https://monitor.pukeko.xyz'