From 3d90e52a1534de1fbef0a152b041dee280d5c3b8 Mon Sep 17 00:00:00 2001
From: Matan Horovitz
Date: Fri, 17 Nov 2023 16:50:39 +0200
Subject: [PATCH] Fixed Prometheus and added a bunch of alerts

---
 config/alertmanager.yml    |   1 +
 config/alerts.yml          | 185 ++++++++++++++++++++++++++++++++++---
 config/prometheus.yml      |  13 +--
 docker-compose.yml         |  68 +++++++++-----
 grafana/config/grafana.ini |  45 ++++-----
 5 files changed, 242 insertions(+), 70 deletions(-)
 mode change 100644 => 100755 config/alertmanager.yml

diff --git a/config/alertmanager.yml b/config/alertmanager.yml
old mode 100644
new mode 100755
index 62cde79..b02058d
--- a/config/alertmanager.yml
+++ b/config/alertmanager.yml
@@ -7,6 +7,7 @@ route:
   group_interval: 10s
   repeat_interval: 24h
   receiver: 'email'
+
 receivers:
 - name: 'email'
   email_configs:
diff --git a/config/alerts.yml b/config/alerts.yml
index fb4c6b0..efdadf8 100755
--- a/config/alerts.yml
+++ b/config/alerts.yml
@@ -1,14 +1,177 @@
 groups:
-  - name: Uptime
+
+  - name: Takahe
     rules:
-    - alert: InstanceDown
-      expr: up{job="services"} < 1
-      for: 5m
-  - name: Usage
+
+    - alert: HostOutOfMemoryWarning
+      expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host out of memory (instance {{ $labels.instance }})
+        description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: HostOutOfMemory
+      expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host out of memory (instance {{ $labels.instance }})
+        description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: HostOomKillDetected
+      expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host OOM kill detected (instance {{ $labels.instance }})
+        description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: HostOutOfMemory
+      expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Host out of memory (instance {{ $labels.instance }})
+        description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+        source: 'https://monitor.pukeko.xyz'
+
+  - name: Storage
     rules:
-    - alert: HighRootFSDiskUsage
-      expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
-      for: 1m
-    - alert: HighRedVolDiskUsage
-      expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
-      for: 1m
+    - alert: HighFilesystemUsage
+      expr: 100 * (1 - (btrfs_filesystem_free / btrfs_filesystem_size)) > 90
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High Btrfs filesystem usage on {{ $labels.instance }}"
+        description: "Btrfs filesystem usage is above 90% on {{ $labels.instance }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: HighRootFSDiskUsage
+      expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
+      for: 1m
+      labels:
+        severity: error
+      annotations:
+        summary: "Low disk space on Takahe"
+        description: "root disk is filling up on {{ $labels.instance }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: HighRedVolDiskUsage
+      expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
+      for: 1m
+      labels:
+        severity: error
+      annotations:
+        summary: "Low disk space on Red-Vol"
+        description: "Red-Vol is filling up on {{ $labels.instance }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: DegradedBtrfsRAID
+      expr: btrfs_raid_status == 1
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Degraded Btrfs RAID on {{ $labels.instance }}"
+        description: "The Btrfs RAID array is in a degraded state on {{ $labels.instance }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: BtrfsRAIDScrubFailed
+      expr: btrfs_raid_scrub_status == 1
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Btrfs RAID scrub failed on {{ $labels.instance }}"
+        description: "The Btrfs RAID scrub failed on {{ $labels.instance }}"
+        source: 'https://monitor.pukeko.xyz'
+
+  - name: Docker
+    rules:
+
+    - alert: ContainerHighCpuUtilization
+      expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Container High CPU utilization (instance {{ $labels.instance }})
+        description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: ContainerHighMemoryUsage
+      expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Container High Memory usage (instance {{ $labels.instance }})
+        description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: ContainerUpdateStatus
+      expr: |
+        increase(container_updated[1m]) > 0 and container_updated == 1
+      for: 1m
+      labels:
+        severity: info
+      annotations:
+        summary: "Container update status"
+        description: "The container was successfully updated."
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: NewContainerUpdate
+      expr: |
+        container_updated{job="takahe", name=~".+"} == 1
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "New container update detected"
+        description: "A new container ('{{ $labels.name }}') was successfully updated."
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: ContainerUpdateFailure
+      expr: |
+        container_updated == 0 or container_updated == -1
+      labels:
+        severity: critical
+      annotations:
+        summary: "Container update failed"
+        description: "The container update metric indicates a failure. Check logs for details."
+        source: 'https://monitor.pukeko.xyz'
+
+  - name: Backups
+    rules:
+    - alert: KumonoboruFailure
+      expr: |
+        up{job="systemd", unit="Kumonoboru", state="failed"} == 1
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Service failure detected in {{ $labels.instance }}"
+        description: "The service '{{ $labels.instance }}' has a failed status"
+        source: 'https://monitor.pukeko.xyz'
+
+    - alert: KumonoboruTimerFailure
+      expr: |
+        up{job="systemd", unit="Kumonoboru", state="failed"} == 1
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Timer failure detected in {{ $labels.instance }}"
+        description: "The timer '{{ $labels.instance }}' has a failed status"
+        source: 'https://monitor.pukeko.xyz'
+
diff --git a/config/prometheus.yml b/config/prometheus.yml
index 6532a60..f2ad7aa 100755
--- a/config/prometheus.yml
+++ b/config/prometheus.yml
@@ -1,11 +1,11 @@
 global:
-    scrape_interval: 15s
+  scrape_interval: 15s
   evaluation_interval: 15s
 
 alerting:
   alertmanagers:
-  - static_configs:
-    - targets: ['alertmanager:9093']
+    - static_configs:
+        - targets: ['alertmanager:9093']
 
 rule_files:
   - alerts.yml
@@ -13,11 +13,12 @@ scrape_configs:
   - job_name: prometheus
     static_configs:
       - targets: ['prometheus:9090']
+
   - job_name: takahe
     static_configs:
-      - targets: ['192.168.0.66:9100']
+      - targets: ['node-exporter:9100']
+
   - job_name: cadvisor
     scrape_interval: 5s
     static_configs:
-      - targets:
-          - cadvisor:8080
+      - targets: ['cadvisor:8080']
diff --git a/docker-compose.yml b/docker-compose.yml
index 64b7c1a..af46b97 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,5 @@
-version: '3.2'
 services:
+
   prometheus:
     image: prom/prometheus:latest
     container_name: prometheus
@@ -7,9 +7,9 @@ services:
     - 9090:9090
     command:
     - --config.file=/etc/prometheus/prometheus.yml
+    - --web.external-url=https://monitor.pukeko.xyz
     volumes:
-    - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
-    - ./config/alerts.yml:/etc/prometheus/alerts.yml
+    - ./config/:/etc/prometheus/
    depends_on:
     - cadvisor
     restart: unless-stopped
@@ -19,52 +19,69 @@ services:
     labels:
     - "traefik.enable=true"
     - "traefik.docker.network=prometheus_network"
-    - "traefik.http.routers.prometheus.entrypoints=websecure"
+    - "traefik.http.routers.prometheus.entrypoints=pukekos"
     - "traefik.http.routers.prometheus.rule=Host(`monitor.pukeko.xyz`)"
-    - "traefik.http.routers.prometheus.tls.certresolver=pukekoresolver"
+    - "traefik.http.routers.prometheus.tls.certresolver=takaheresolver"
     - "traefik.http.routers.prometheus.middlewares=authelia@docker"
 
+  node-exporter:
+    image: prom/node-exporter
+    container_name: node-exporter
+    restart: unless-stopped
+    volumes:
+    - /proc:/host/proc:ro
+    - /sys:/host/sys:ro
+    - /:/rootfs:ro
+    - ./data/:/etc/node-exporter/textfile_collector/
+    command:
+    - '--path.procfs=/host/proc'
+    - '--path.sysfs=/host/sys'
+    - '--path.rootfs=/rootfs'
+    - '--collector.textfile.directory=/etc/node-exporter/textfile_collector'
+    networks:
+    - internal
+
   alertmanager:
     container_name: alertmanager
-    image: prom/alertmanager
+    privileged: true
+    image: prom/alertmanager:latest
+    command:
+    - --cluster.advertise-address=192.168.0.66:9093
+    - --config.file=/etc/prometheus/alertmanager.yml
+    - --web.external-url=https://monitor.pukeko.xyz
     volumes:
-    - ./config/alerts.yml:/etc/prometheus/alerts.yml
     - ./config/alertmanager.yml:/etc/prometheus/alertmanager.yml
-    ports:
-    - '9093:9093'
     restart: unless-stopped
     networks:
     - internal
+
   cadvisor:
-    image: gcr.io/cadvisor/cadvisor:latest
+    image: gcr.io/cadvisor/cadvisor
     container_name: cadvisor
-    ports:
-    - 1010:8080
     volumes:
-    - /:/rootfs:ro
-    - /var/run:/var/run:rw
-    - /sys:/sys:ro
-    - /var/lib/docker/:/var/lib/docker:ro
-    depends_on:
-    - redis
+    - /:/rootfs:ro
+    - /var/run:/var/run:rw
+    - /sys:/sys:ro
+    - /var/lib/docker/:/var/lib/docker:ro
+    - /sys/fs/cgroup:/sys/fs/cgroup:ro
+    command: ["--port=8080"]
     restart: unless-stopped
     networks:
     - internal
+
   redis:
     image: redis:alpine
     container_name: redis
-    ports:
-    - 6379:6379
     restart: unless-stopped
     networks:
     - internal
+
   grafana:
     container_name: grafana
     image: grafana/grafana
     depends_on:
     - prometheus
-    ports:
-    - '1000:3000'
     volumes:
     - './grafana/data:/var/lib/grafana'
     - './grafana/provisioning/:/etc/grafana/provisioning/'
@@ -72,18 +89,21 @@
     restart: unless-stopped
     user: '1000'
     networks:
+    - traefik_network
     - network
     - internal
     labels:
     - "traefik.enable=true"
     - "traefik.docker.network=prometheus_network"
-    - "traefik.http.routers.grafana.entrypoints=websecure"
+    - "traefik.http.routers.grafana.entrypoints=pukekos"
     - "traefik.http.services.grafana.loadbalancer.server.port=3000"
     - "traefik.http.routers.grafana.rule=Host(`flight.pukeko.xyz`)"
-    - "traefik.http.routers.grafana.tls.certresolver=pukekoresolver"
+    - "traefik.http.routers.grafana.tls.certresolver=takaheresolver"
     - "traefik.http.routers.grafana.middlewares=authelia@docker"
 networks:
   network:
     driver: bridge
   internal:
     driver: bridge
+  traefik_network:
+    external: true
diff --git a/grafana/config/grafana.ini b/grafana/config/grafana.ini
index c4a445b..72705e9 100644
--- a/grafana/config/grafana.ini
+++ b/grafana/config/grafana.ini
@@ -45,7 +45,7 @@ domain = localhost
 enforce_domain = false
 
 # The full public facing url
-root_url = %(protocol)s://%(domain)s:%(http_port)s/
+root_url = https://flight.pukeko.xyz
 
 # Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons.
 serve_from_sub_path = false
@@ -507,34 +507,21 @@ role_attribute_strict = false
 
 #################################### Generic OAuth #######################
 [auth.generic_oauth]
-name = OAuth
-enabled = false
-allow_sign_up = true
-client_id = some_id
-client_secret =
-scopes = user:email
-empty_scopes = false
-email_attribute_name = email:primary
-email_attribute_path =
-login_attribute_path =
-name_attribute_path =
-role_attribute_path =
-role_attribute_strict = false
-groups_attribute_path =
-id_token_attribute_name =
-team_ids_attribute_path =
-auth_url =
-token_url =
-api_url =
-teams_url =
-allowed_domains =
-team_ids =
-allowed_organizations =
-tls_skip_verify_insecure = false
-tls_client_cert =
-tls_client_key =
-tls_client_ca =
-use_pkce = false
+enabled = true
+name = Authelia
+icon = signin
+client_id = grafana
+client_secret = P6x3vpNvZcLCZnmwts7E3sEYmtnLVx2cmjPafyFjNRHRsJmcBajaGYzdYjEB4iZemmCTK5H5QAxqg8fSmjMkydKkYcynDgbCciR3tdz3XbcKgRX3LpDVFHqejEKLPz7n
+scopes = openid profile groups email
+empty_scopes = false
+auth_url = https://auth.pukeko.xyz/api/oidc/authorize
+token_url = https://auth.pukeko.xyz/api/oidc/token
+api_url = https://auth.pukeko.xyz/api/oidc/userinfo
+login_attribute_path = preferred_username
+groups_attribute_path = groups
+name_attribute_path = name
+use_pkce = true
+
 
 #################################### Basic Auth ##########################
 [auth.basic]
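
Validation note (not part of the applied diff): a minimal way to sanity-check the new rule, Alertmanager, and Compose files before deploying, assuming promtool and amtool are available on the host (both also ship inside the prom/prometheus and prom/alertmanager images) and the commands are run from the repository root:

    # Lint the alert rules and the Prometheus config that loads them
    promtool check rules config/alerts.yml
    promtool check config config/prometheus.yml

    # Lint the Alertmanager routing/receiver config
    amtool check-config config/alertmanager.yml

    # Render and validate the Compose file without starting anything (Compose v2)
    docker compose config --quiet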