Fixed Prometheus and added a bunch of alerts

2023-11-17 16:50:39 +02:00
parent b875c663c9
commit 3d90e52a15
5 changed files with 242 additions and 70 deletions

config/alertmanager.yml Normal file → Executable file

@@ -7,6 +7,7 @@ route:
   group_interval: 10s
   repeat_interval: 24h
   receiver: 'email'
 receivers:
   - name: 'email'
     email_configs:
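
For reference, a minimal Alertmanager email receiver has roughly the following shape; the addresses, SMTP host and credentials are placeholders, not values taken from this repository:

    receivers:
      - name: 'email'
        email_configs:
          - to: 'admin@example.org'              # placeholder recipient
            from: 'alertmanager@example.org'     # placeholder sender
            smarthost: 'smtp.example.org:587'    # placeholder SMTP relay
            auth_username: 'alertmanager@example.org'
            auth_password: 'changeme'            # placeholder credential
            require_tls: true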

config/alerts.yml

@@ -1,14 +1,177 @@
 groups:
-  - name: Uptime
+  - name: Takahe
     rules:
-      - alert: InstanceDown
-        expr: up{job="services"} < 1
-        for: 5m
+      - alert: HostOutOfMemoryWarning
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of memory (instance {{ $labels.instance }})
+          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+      - alert: HostOutOfMemory
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of memory (instance {{ $labels.instance }})
+          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+      - alert: HostOomKillDetected
+        expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host OOM kill detected (instance {{ $labels.instance }})
+          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
-  - name: Usage
+  - name: Storage
     rules:
-      - alert: HighRootFSDiskUsage
-        expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
-        for: 1m
-      - alert: HighRedVolDiskUsage
-        expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
-        for: 1m
+      - alert: HighFilesystemUsage
+        expr: 100 * (1 - (btrfs_filesystem_free / btrfs_filesystem_size)) > 90
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High Btrfs filesystem usage on {{ $labels.instance }}"
+          description: "Btrfs filesystem usage is above 90% on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+      - alert: HighRootFSDiskUsage
+        expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
+        for: 1m
+        labels:
+          severity: error
+        annotations:
+          summary: "Low disk space on Takahe"
+          description: "root disk is filling up on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+      - alert: HighRedVolDiskUsage
+        expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
+        for: 1m
+        labels:
+          severity: error
+        annotations:
+          summary: "Low disk space on Red-Vol"
+          description: "Red-Vol is filling up on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+      - alert: DegradedBtrfsRAID
+        expr: btrfs_raid_status == 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Degraded Btrfs RAID on {{ $labels.instance }}"
+          description: "The Btrfs RAID array is in a degraded state on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+      - alert: BtrfsRAIDScrubFailed
+        expr: btrfs_raid_scrub_status == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Btrfs RAID scrub failed on {{ $labels.instance }}"
+          description: "The Btrfs RAID scrub failed on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+  - name: Docker
+    rules:
+      - alert: ContainerHighCpuUtilization
+        expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Container High CPU utilization (instance {{ $labels.instance }})
+          description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+      - alert: ContainerHighMemoryUsage
+        expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Container High Memory usage (instance {{ $labels.instance }})
+          description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+      - alert: ContainerUpdateStatus
+        expr: |
+          increase(container_updated[1m]) > 0 and container_updated == 1
+        for: 1m
+        labels:
+          severity: info
+        annotations:
+          summary: "Container update status"
+          description: "The container was successfully updated."
+          source: 'https://monitor.pukeko.xyz'
+      - alert: NewContainerUpdate
+        expr: |
+          container_updated{job="takahe", name=~".+"} == 1
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "New container update detected"
+          description: "A new container ('{{ $labels.name }}') was successfully updated."
+          source: 'https://monitor.pukeko.xyz'
+      - alert: ContainerUpdateFailure
+        expr: |
+          container_updated == 0 or container_updated == -1
+        labels:
+          severity: critical
+        annotations:
+          summary: "Container update failed"
+          description: "The container update metric indicates a failure. Check logs for details."
+          source: 'https://monitor.pukeko.xyz'
+  - name: Backups
+    rules:
+      - alert: KumonoboruFailure
+        expr: |
+          up{job="systemd", unit="Kumonoboru", state="failed"} == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Service failure detected in {{ $labels.instance }}"
+          description: "The service '{{ $labels.instance }}' has a failed status"
+          source: 'https://monitor.pukeko.xyz'
+      - alert: KumonoboruTimerFailure
+        expr: |
+          up{job="systemd", unit="Kumonoboru", state="failed"} == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Timer failure detected in {{ $labels.instance }}"
+          description: "The timer '{{ $labels.instance }}' has a failed status"
+          source: 'https://monitor.pukeko.xyz'
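
The btrfs_filesystem_free, btrfs_raid_status, btrfs_raid_scrub_status and container_updated series used above are not standard node-exporter or cAdvisor metrics; presumably they are written as *.prom files into the ./data/ directory that docker-compose.yml maps to the node-exporter textfile collector. The rules themselves can be sanity-checked before deployment with promtool's rule unit tests. A minimal sketch for BtrfsRAIDScrubFailed, where the instance label "takahe" is purely illustrative; save it as tests.yml next to alerts.yml and run "promtool test rules tests.yml":

    # tests.yml -- exercises one of the new Storage rules
    rule_files:
      - alerts.yml
    evaluation_interval: 1m
    tests:
      - interval: 1m
        input_series:
          # scrub reported as failed for seven consecutive samples
          - series: 'btrfs_raid_scrub_status{instance="takahe"}'
            values: '1 1 1 1 1 1 1'
        alert_rule_test:
          - eval_time: 6m        # past the 5m 'for' duration, so the alert is firing
            alertname: BtrfsRAIDScrubFailed
            exp_alerts:
              - exp_labels:
                  severity: warning
                  instance: takahe
                exp_annotations:
                  summary: "Btrfs RAID scrub failed on takahe"
                  description: "The Btrfs RAID scrub failed on takahe"
                  source: 'https://monitor.pukeko.xyz'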

config/prometheus.yml

@@ -1,11 +1,11 @@
 global:
   scrape_interval: 15s
   evaluation_interval: 15s
 alerting:
   alertmanagers:
     - static_configs:
         - targets: ['alertmanager:9093']
 rule_files:
   - alerts.yml
@@ -13,11 +13,12 @@ scrape_configs:
   - job_name: prometheus
     static_configs:
       - targets: ['prometheus:9090']
   - job_name: takahe
     static_configs:
-      - targets: ['192.168.0.66:9100']
+      - targets: ['node-exporter:9100']
   - job_name: cadvisor
     scrape_interval: 5s
     static_configs:
-      - targets:
-          - cadvisor:8080
+      - targets: ['cadvisor:8080']

docker-compose.yml

@@ -1,5 +1,5 @@
-version: '3.2'
 services:
   prometheus:
     image: prom/prometheus:latest
     container_name: prometheus
@@ -7,9 +7,9 @@ services:
       - 9090:9090
     command:
       - --config.file=/etc/prometheus/prometheus.yml
+      - --web.external-url=https://monitor.pukeko.xyz
     volumes:
-      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
-      - ./config/alerts.yml:/etc/prometheus/alerts.yml
+      - ./config/:/etc/prometheus/
     depends_on:
       - cadvisor
     restart: unless-stopped
@@ -19,52 +19,69 @@ services:
     labels:
       - "traefik.enable=true"
       - "traefik.docker.network=prometheus_network"
-      - "traefik.http.routers.prometheus.entrypoints=websecure"
+      - "traefik.http.routers.prometheus.entrypoints=pukekos"
       - "traefik.http.routers.prometheus.rule=Host(`monitor.pukeko.xyz`)"
-      - "traefik.http.routers.prometheus.tls.certresolver=pukekoresolver"
+      - "traefik.http.routers.prometheus.tls.certresolver=takaheresolver"
       - "traefik.http.routers.prometheus.middlewares=authelia@docker"
+  node-exporter:
+    image: prom/node-exporter
+    container_name: node-exporter
+    restart: unless-stopped
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+      - ./data/:/etc/node-exporter/textfile_collector/
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--path.rootfs=/rootfs'
+      - '--collector.textfile.directory=/etc/node-exporter/textfile_collector'
+    networks:
+      - internal
   alertmanager:
     container_name: alertmanager
-    image: prom/alertmanager
+    privileged: true
+    image: prom/alertmanager:latest
+    command:
+      - --cluster.advertise-address=192.168.0.66:9093
+      - --config.file=/etc/prometheus/alertmanager.yml
+      - --web.external-url=https://monitor.pukeko.xyz
     volumes:
-      - ./config/alerts.yml:/etc/prometheus/alerts.yml
       - ./config/alertmanager.yml:/etc/prometheus/alertmanager.yml
-    ports:
-      - '9093:9093'
     restart: unless-stopped
     networks:
       - internal
   cadvisor:
-    image: gcr.io/cadvisor/cadvisor:latest
+    image: gcr.io/cadvisor/cadvisor
     container_name: cadvisor
-    ports:
-      - 1010:8080
     volumes:
       - /:/rootfs:ro
       - /var/run:/var/run:rw
       - /sys:/sys:ro
       - /var/lib/docker/:/var/lib/docker:ro
-    depends_on:
-      - redis
+      - /sys/fs/cgroup:/sys/fs/cgroup:ro
+    command: ["--port=8080"]
     restart: unless-stopped
     networks:
       - internal
   redis:
     image: redis:alpine
     container_name: redis
-    ports:
-      - 6379:6379
     restart: unless-stopped
     networks:
       - internal
   grafana:
     container_name: grafana
     image: grafana/grafana
     depends_on:
       - prometheus
-    ports:
-      - '1000:3000'
     volumes:
       - './grafana/data:/var/lib/grafana'
       - './grafana/provisioning/:/etc/grafana/provisioning/'
@@ -72,18 +89,21 @@ services:
     restart: unless-stopped
     user: '1000'
     networks:
+      - traefik_network
       - network
       - internal
     labels:
       - "traefik.enable=true"
       - "traefik.docker.network=prometheus_network"
-      - "traefik.http.routers.grafana.entrypoints=websecure"
+      - "traefik.http.routers.grafana.entrypoints=pukekos"
       - "traefik.http.services.grafana.loadbalancer.server.port=3000"
       - "traefik.http.routers.grafana.rule=Host(`flight.pukeko.xyz`)"
-      - "traefik.http.routers.grafana.tls.certresolver=pukekoresolver"
+      - "traefik.http.routers.grafana.tls.certresolver=takaheresolver"
       - "traefik.http.routers.grafana.middlewares=authelia@docker"
 networks:
   network:
     driver: bridge
   internal:
     driver: bridge
+  traefik_network:
+    external: true
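
Grafana, Prometheus and Alertmanager all sit on the shared internal network, so they can reach each other by service name. The provisioning files mounted from ./grafana/provisioning/ are not part of this commit; a minimal sketch of what a datasource definition there could look like (the file name and path below are assumptions):

    # e.g. ./grafana/provisioning/datasources/prometheus.yml (hypothetical path)
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus:9090   # resolved over the shared 'internal' Docker network
        isDefault: true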

grafana.ini

@@ -45,7 +45,7 @@ domain = localhost
 enforce_domain = false
 # The full public facing url
-root_url = %(protocol)s://%(domain)s:%(http_port)s/
+root_url = https://flight.pukeko.xyz
 # Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons.
 serve_from_sub_path = false
@@ -507,34 +507,21 @@ role_attribute_strict = false
 #################################### Generic OAuth #######################
 [auth.generic_oauth]
-name = OAuth
-enabled = false
-allow_sign_up = true
-client_id = some_id
-client_secret =
-scopes = user:email
+enabled = true
+name = Authelia
+icon = signin
+client_id = grafana
+client_secret = P6x3vpNvZcLCZnmwts7E3sEYmtnLVx2cmjPafyFjNRHRsJmcBajaGYzdYjEB4iZemmCTK5H5QAxqg8fSmjMkydKkYcynDgbCciR3tdz3XbcKgRX3LpDVFHqejEKLPz7n
+scopes = openid profile groups email
 empty_scopes = false
-email_attribute_name = email:primary
-email_attribute_path =
-login_attribute_path =
-name_attribute_path =
-role_attribute_path =
-role_attribute_strict = false
-groups_attribute_path =
-id_token_attribute_name =
-team_ids_attribute_path =
-auth_url =
-token_url =
-api_url =
-teams_url =
-allowed_domains =
-team_ids =
-allowed_organizations =
-tls_skip_verify_insecure = false
-tls_client_cert =
-tls_client_key =
-tls_client_ca =
-use_pkce = false
+auth_url = https://auth.pukeko.xyz/api/oidc/authorize
+token_url = https://auth.pukeko.xyz/api/oidc/token
+api_url = https://auth.pukeko.xyz/api/oidc/userinfo
+login_attribute_path = preferred_username
+groups_attribute_path = groups
+name_attribute_path = name
+use_pkce = true
 #################################### Basic Auth ##########################
 [auth.basic]
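
The [auth.generic_oauth] settings above point Grafana at Authelia's OIDC endpoints, so Authelia needs a matching client registration on its side. A rough sketch, not taken from this repository: the key names follow recent Authelia releases (older versions use id/secret instead of client_id/client_secret), the policy is a placeholder, and the redirect URI assumes Grafana's standard /login/generic_oauth callback:

    identity_providers:
      oidc:
        clients:
          - client_id: grafana
            client_secret: '<pbkdf2 or argon2id digest of the secret in grafana.ini>'
            public: false
            authorization_policy: one_factor   # placeholder; could equally be two_factor
            redirect_uris:
              - https://flight.pukeko.xyz/login/generic_oauth
            scopes:
              - openid
              - profile
              - groups
              - email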