Fixed Prometheus and added a bunch of alerts

2023-11-17 16:50:39 +02:00
parent b875c663c9
commit 3d90e52a15
5 changed files with 242 additions and 70 deletions

config/alertmanager.yml Normal file → Executable file

@@ -7,6 +7,7 @@ route:
group_interval: 10s
repeat_interval: 24h
receiver: 'email'
receivers:
- name: 'email'
email_configs:
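
The delivery settings for the 'email' receiver sit below this hunk and are not shown in the diff. For orientation only, a minimal email_configs block usually looks like the sketch below; every value here is a placeholder, not the repository's actual configuration.

receivers:
  - name: 'email'
    email_configs:
      - to: 'admin@example.org'            # placeholder recipient
        from: 'alertmanager@example.org'   # placeholder sender
        smarthost: 'smtp.example.org:587'  # placeholder SMTP relay
        auth_username: 'alertmanager@example.org'
        auth_password: 'changeme'          # placeholder credential
        send_resolved: true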

config/alerts.yml

@@ -1,14 +1,177 @@
groups:
- name: Uptime
- name: Takahe
rules:
- alert: InstanceDown
expr: up{job="services"} < 1
- alert: HostOutOfMemoryWarning
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
source: 'https://monitor.pukeko.xyz'
- alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
source: 'https://monitor.pukeko.xyz'
- alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
source: 'https://monitor.pukeko.xyz'
- name: Usage
- name: Storage
rules:
- alert: HighFilesystemUsage
expr: 100 * (1 - (btrfs_filesystem_free / btrfs_filesystem_size)) > 90
for: 5m
labels:
severity: warning
annotations:
summary: "High Btrfs filesystem usage on {{ $labels.instance }}"
description: "Btrfs filesystem usage is above 90% on {{ $labels.instance }}"
source: 'https://monitor.pukeko.xyz'
- alert: HighRootFSDiskUsage
expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
for: 1m
labels:
severity: error
annotations:
summary: "Low disk space on Takahe"
description: "root disk is filling up on {{ $labels.instance }}"
source: 'https://monitor.pukeko.xyz'
- alert: HighRedVolDiskUsage
expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
for: 1m
labels:
severity: error
annotations:
summary: "Low disk space on Red-Vol"
description: "Red-Vol is filling up on {{ $labels.instance }}"
source: 'https://monitor.pukeko.xyz'
- alert: DegradedBtrfsRAID
expr: btrfs_raid_status == 1
for: 5m
labels:
severity: critical
annotations:
summary: "Degraded Btrfs RAID on {{ $labels.instance }}"
description: "The Btrfs RAID array is in a degraded state on {{ $labels.instance }}"
source: 'https://monitor.pukeko.xyz'
- alert: BtrfsRAIDScrubFailed
expr: btrfs_raid_scrub_status == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Btrfs RAID scrub failed on {{ $labels.instance }}"
description: "The Btrfs RAID scrub failed on {{ $labels.instance }}"
source: 'https://monitor.pukeko.xyz'
- name: Docker
rules:
- alert: ContainerHighCpuUtilization
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container High CPU utilization (instance {{ $labels.instance }})
description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
source: 'https://monitor.pukeko.xyz'
- alert: ContainerHighMemoryUsage
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container High Memory usage (instance {{ $labels.instance }})
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
source: 'https://monitor.pukeko.xyz'
- alert: ContainerUpdateStatus
expr: |
increase(container_updated[1m]) > 0 and container_updated == 1
for: 1m
labels:
severity: info
annotations:
summary: "Container update status"
description: "The container was successfully updated."
source: 'https://monitor.pukeko.xyz'
- alert: NewContainerUpdate
expr: |
container_updated{job="takahe", name=~".+"} == 1
for: 1m
labels:
severity: warning
annotations:
summary: "New container update detected"
description: "A new container ('{{ $labels.name }}') was successfully updated."
source: 'https://monitor.pukeko.xyz'
- alert: ContainerUpdateFailure
expr: |
container_updated == 0 or container_updated == -1
labels:
severity: critical
annotations:
summary: "Container update failed"
description: "The container update metric indicates a failure. Check logs for details."
source: 'https://monitor.pukeko.xyz'
- name: Backups
rules:
- alert: KumonoboruFailure
expr: |
up{job="systemd", unit="Kumonoboru", state="failed"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Service failure detected in {{ $labels.instance }}"
description: "The unit '{{ $labels.unit }}' on {{ $labels.instance }} has a failed status"
source: 'https://monitor.pukeko.xyz'
- alert: KumonoboruTimerFailure
expr: |
up{job="systemd", unit="Kumonoboru", state="failed"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: "Timer failure detected in {{ $labels.instance }}"
description: "The timer unit '{{ $labels.unit }}' on {{ $labels.instance }} has a failed status"
source: 'https://monitor.pukeko.xyz'
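
The rules above attach severity labels (info, warning, error, critical) that Alertmanager can route on; the current route in config/alertmanager.yml sends everything to the 'email' receiver. If critical alerts should escalate differently, a child route along these lines could be added. This is only a sketch, and the matchers syntax assumes Alertmanager v0.22 or newer:

route:
  receiver: 'email'
  routes:
    - matchers:
        - severity = "critical"
      receiver: 'email'        # swap in a dedicated pager/chat receiver if one exists
      group_wait: 0s
      repeat_interval: 1h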

config/prometheus.yml

@@ -13,11 +13,12 @@ scrape_configs:
- job_name: prometheus
static_configs:
- targets: ['prometheus:9090']
- job_name: takahe
static_configs:
- targets: ['192.168.0.66:9100']
- targets: ['node-exporter:9100']
- job_name: cadvisor
scrape_interval: 5s
static_configs:
- targets:
- cadvisor:8080
- targets: ['cadvisor:8080']
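
This hunk starts at line 13 of config/prometheus.yml, so the rule and Alertmanager wiring presumably lives above it. For orientation, the pieces that connect these scrape jobs to the new alert rules typically look like the sketch below, assuming the container paths and ports from docker-compose.yml (./config/ mounted at /etc/prometheus/, Alertmanager listening on 9093):

rule_files:
  - /etc/prometheus/alerts.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']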

docker-compose.yml

@@ -1,5 +1,5 @@
version: '3.2'
services:
prometheus:
image: prom/prometheus:latest
container_name: prometheus
@@ -7,9 +7,9 @@ services:
- 9090:9090
command:
- --config.file=/etc/prometheus/prometheus.yml
- --web.external-url=https://monitor.pukeko.xyz
volumes:
- ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./config/alerts.yml:/etc/prometheus/alerts.yml
- ./config/:/etc/prometheus/
depends_on:
- cadvisor
restart: unless-stopped
@@ -19,52 +19,69 @@ services:
labels:
- "traefik.enable=true"
- "traefik.docker.network=prometheus_network"
- "traefik.http.routers.prometheus.entrypoints=websecure"
- "traefik.http.routers.prometheus.entrypoints=pukekos"
- "traefik.http.routers.prometheus.rule=Host(`monitor.pukeko.xyz`)"
- "traefik.http.routers.prometheus.tls.certresolver=pukekoresolver"
- "traefik.http.routers.prometheus.tls.certresolver=takaheresolver"
- "traefik.http.routers.prometheus.middlewares=authelia@docker"
node-exporter:
image: prom/node-exporter
container_name: node-exporter
restart: unless-stopped
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
- ./data/:/etc/node-exporter/textfile_collector/
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--path.rootfs=/rootfs'
- '--collector.textfile.directory=/etc/node-exporter/textfile_collector'
networks:
- internal
alertmanager:
container_name: alertmanager
image: prom/alertmanager
privileged: true
image: prom/alertmanager:latest
command:
- --cluster.advertise-address=192.168.0.66:9093
- --config.file=/etc/prometheus/alertmanager.yml
- --web.external-url=https://monitor.pukeko.xyz
volumes:
- ./config/alerts.yml:/etc/prometheus/alerts.yml
- ./config/alertmanager.yml:/etc/prometheus/alertmanager.yml
ports:
- '9093:9093'
restart: unless-stopped
networks:
- internal
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
image: gcr.io/cadvisor/cadvisor
container_name: cadvisor
ports:
- 1010:8080
volumes:
- /:/rootfs:ro
- /var/run:/var/run:rw
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
depends_on:
- redis
- /sys/fs/cgroup:/sys/fs/cgroup:ro
command: ["--port=8080"]
restart: unless-stopped
networks:
- internal
redis:
image: redis:alpine
container_name: redis
ports:
- 6379:6379
restart: unless-stopped
networks:
- internal
grafana:
container_name: grafana
image: grafana/grafana
depends_on:
- prometheus
ports:
- '1000:3000'
volumes:
- './grafana/data:/var/lib/grafana'
- './grafana/provisioning/:/etc/grafana/provisioning/'
@@ -72,18 +89,21 @@ services:
restart: unless-stopped
user: '1000'
networks:
- traefik_network
- network
- internal
labels:
- "traefik.enable=true"
- "traefik.docker.network=prometheus_network"
- "traefik.http.routers.grafana.entrypoints=websecure"
- "traefik.http.routers.grafana.entrypoints=pukekos"
- "traefik.http.services.grafana.loadbalancer.server.port=3000"
- "traefik.http.routers.grafana.rule=Host(`flight.pukeko.xyz`)"
- "traefik.http.routers.grafana.tls.certresolver=pukekoresolver"
- "traefik.http.routers.grafana.tls.certresolver=takaheresolver"
- "traefik.http.routers.grafana.middlewares=authelia@docker"
networks:
network:
driver: bridge
internal:
driver: bridge
traefik_network:
external: true
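
Grafana mounts ./grafana/provisioning/, so the Prometheus data source can be provisioned from a file instead of being configured in the UI. A minimal sketch, assuming a hypothetical file at ./grafana/provisioning/datasources/prometheus.yml and the in-network service name used above:

apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090   # service name on the internal network
    isDefault: true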

grafana.ini

@@ -45,7 +45,7 @@ domain = localhost
enforce_domain = false
# The full public facing url
root_url = %(protocol)s://%(domain)s:%(http_port)s/
root_url = https://flight.pukeko.xyz
# Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons.
serve_from_sub_path = false
@@ -507,34 +507,21 @@ role_attribute_strict = false
#################################### Generic OAuth #######################
[auth.generic_oauth]
name = OAuth
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = user:email
empty_scopes = false
email_attribute_name = email:primary
email_attribute_path =
login_attribute_path =
name_attribute_path =
role_attribute_path =
role_attribute_strict = false
groups_attribute_path =
id_token_attribute_name =
team_ids_attribute_path =
auth_url =
token_url =
api_url =
teams_url =
allowed_domains =
team_ids =
allowed_organizations =
tls_skip_verify_insecure = false
tls_client_cert =
tls_client_key =
tls_client_ca =
use_pkce = false
enabled = true
name = Authelia
icon = signin
client_id = grafana
client_secret = P6x3vpNvZcLCZnmwts7E3sEYmtnLVx2cmjPafyFjNRHRsJmcBajaGYzdYjEB4iZemmCTK5H5QAxqg8fSmjMkydKkYcynDgbCciR3tdz3XbcKgRX3LpDVFHqejEKLPz7n
scopes = openid profile groups email
empty_scopes = false
auth_url = https://auth.pukeko.xyz/api/oidc/authorize
token_url = https://auth.pukeko.xyz/api/oidc/token
api_url = https://auth.pukeko.xyz/api/oidc/userinfo
login_attribute_path = preferred_username
groups_attribute_path = groups
name_attribute_path = name
use_pkce = true
#################################### Basic Auth ##########################
[auth.basic]
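
The generic OAuth block above points Grafana at Authelia's OIDC endpoints, but the matching client entry on the Authelia side is not part of this commit. A minimal sketch of what that client registration typically looks like is below; the field names follow older Authelia releases (newer versions rename id/secret to client_id/client_secret), and the redirect URI is Grafana's standard /login/generic_oauth callback:

identity_providers:
  oidc:
    clients:
      - id: grafana
        secret: '<secret matching client_secret above>'
        authorization_policy: two_factor
        redirect_uris:
          - https://flight.pukeko.xyz/login/generic_oauth
        scopes:
          - openid
          - profile
          - groups
          - email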