Fixed Prometheus and added a bunch of alerts
config/alertmanager.yml (1 change, Normal file → Executable file)
@@ -7,6 +7,7 @@ route:
   group_interval: 10s
   repeat_interval: 24h
   receiver: 'email'

 receivers:
   - name: 'email'
     email_configs:
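The hunk cuts off at the receiver's email_configs: block. For context, a minimal sketch of the fields that usually live under it in Alertmanager — the addresses and SMTP host here are placeholders, not values from this repo:

email_configs:
  - to: 'admin@example.com'
    from: 'alertmanager@example.com'
    smarthost: 'smtp.example.com:587'
    auth_username: 'alertmanager@example.com'
    auth_password: 'changeme'
    send_resolved: true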
config/alerts.yml
@@ -1,14 +1,177 @@
 groups:
-  - name: Uptime
+  - name: Takahe
     rules:
-      - alert: InstanceDown
-        expr: up{job="services"} < 1
+      - alert: HostOutOfMemoryWarning
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of memory (instance {{ $labels.instance }})
+          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: HostOutOfMemory
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of memory (instance {{ $labels.instance }})
+          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: HostOomKillDetected
+        expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host OOM kill detected (instance {{ $labels.instance }})
+          description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: HostOutOfMemory
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of memory (instance {{ $labels.instance }})
+          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+
+  - name: Storage
+    rules:
+      - alert: HighFilesystemUsage
+        expr: 100 * (1 - (btrfs_filesystem_free / btrfs_filesystem_size)) > 90
         for: 5m
-  - name: Usage
-    rules:
+        labels:
+          severity: warning
+        annotations:
+          summary: "High Btrfs filesystem usage on {{ $labels.instance }}"
+          description: "Btrfs filesystem usage is above 90% on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+
       - alert: HighRootFSDiskUsage
         expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
         for: 1m
+        labels:
+          severity: error
+        annotations:
+          summary: "Low disk space on Takahe"
+          description: "root disk is filling up on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+
       - alert: HighRedVolDiskUsage
         expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
         for: 1m
+        labels:
+          severity: error
+        annotations:
+          summary: "Low disk space on Red-Vol"
+          description: "Red-Vol is filling up on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: DegradedBtrfsRAID
+        expr: btrfs_raid_status == 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Degraded Btrfs RAID on {{ $labels.instance }}"
+          description: "The Btrfs RAID array is in a degraded state on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: BtrfsRAIDScrubFailed
+        expr: btrfs_raid_scrub_status == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Btrfs RAID scrub failed on {{ $labels.instance }}"
+          description: "The Btrfs RAID scrub failed on {{ $labels.instance }}"
+          source: 'https://monitor.pukeko.xyz'
+
+  - name: Docker
+    rules:
+      - alert: ContainerHighCpuUtilization
+        expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Container High CPU utilization (instance {{ $labels.instance }})
+          description: "Container CPU utilization is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: ContainerHighMemoryUsage
+        expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Container High Memory usage (instance {{ $labels.instance }})
+          description: "Container Memory usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: ContainerUpdateStatus
+        expr: |
+          increase(container_updated[1m]) > 0 and container_updated == 1
+        for: 1m
+        labels:
+          severity: info
+        annotations:
+          summary: "Container update status"
+          description: "The container was successfully updated."
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: NewContainerUpdate
+        expr: |
+          container_updated{job="takahe", name=~".+"} == 1
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "New container update detected"
+          description: "A new container ('{{ $labels.name }}') was successfully updated."
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: ContainerUpdateFailure
+        expr: |
+          container_updated == 0 or container_updated == -1
+        labels:
+          severity: critical
+        annotations:
+          summary: "Container update failed"
+          description: "The container update metric indicates a failure. Check logs for details."
+          source: 'https://monitor.pukeko.xyz'
+
+  - name: Backups
+    rules:
+      - alert: KumonoboruFailure
+        expr: |
+          up{job="systemd", unit="Kumonoboru", state="failed"} == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Service failure detected in {{ $labels.instance }}"
+          description: "The service '{{ $labels.instance }}' has a failed status"
+          source: 'https://monitor.pukeko.xyz'
+
+      - alert: KumonoboruTimerFailure
+        expr: |
+          up{job="systemd", unit="Kumonoboru", state="failed"} == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Timer failure detected in {{ $labels.instance }}"
+          description: "The timer '{{ $labels.instance }}' has a failed status"
+          source: 'https://monitor.pukeko.xyz'
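These rules can be validated before reloading Prometheus with promtool (promtool check rules config/alerts.yml) and unit-tested with promtool test rules. A hypothetical test file for the NewContainerUpdate rule, assuming the custom container_updated metric is published as a gauge by the textfile collector configured further down and scraped under the takahe job — the file name and series values are made up for illustration:

# alerts_test.yml (hypothetical, not part of this commit), run with: promtool test rules alerts_test.yml
rule_files:
  - alerts.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # assumed shape of the textfile-collector metric
      - series: 'container_updated{job="takahe", name="grafana"}'
        values: '1 1 1'
    alert_rule_test:
      - eval_time: 2m
        alertname: NewContainerUpdate
        exp_alerts:
          - exp_labels:
              severity: warning
              job: takahe
              name: grafana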
config/prometheus.yml
@@ -13,11 +13,12 @@ scrape_configs:
   - job_name: prometheus
     static_configs:
       - targets: ['prometheus:9090']

   - job_name: takahe
     static_configs:
-      - targets: ['192.168.0.66:9100']
+      - targets: ['node-exporter:9100']

   - job_name: cadvisor
     scrape_interval: 5s
     static_configs:
-      - targets:
-        - cadvisor:8080
+      - targets: ['cadvisor:8080']
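This hunk only covers scrape_configs. For the new alert rules to fire and be routed to Alertmanager, prometheus.yml presumably also carries sections along these lines (a sketch, not visible in this diff; the targets match the service names and mount paths used in docker-compose.yml below):

rule_files:
  - /etc/prometheus/alerts.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']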
docker-compose.yml
@@ -1,5 +1,5 @@
-version: '3.2'
 services:

   prometheus:
     image: prom/prometheus:latest
     container_name: prometheus
@@ -7,9 +7,9 @@ services:
       - 9090:9090
     command:
       - --config.file=/etc/prometheus/prometheus.yml
+      - --web.external-url=https://monitor.pukeko.xyz
     volumes:
-      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
-      - ./config/alerts.yml:/etc/prometheus/alerts.yml
+      - ./config/:/etc/prometheus/
     depends_on:
       - cadvisor
     restart: unless-stopped
@@ -19,52 +19,69 @@ services:
     labels:
       - "traefik.enable=true"
       - "traefik.docker.network=prometheus_network"
-      - "traefik.http.routers.prometheus.entrypoints=websecure"
+      - "traefik.http.routers.prometheus.entrypoints=pukekos"
       - "traefik.http.routers.prometheus.rule=Host(`monitor.pukeko.xyz`)"
-      - "traefik.http.routers.prometheus.tls.certresolver=pukekoresolver"
+      - "traefik.http.routers.prometheus.tls.certresolver=takaheresolver"
       - "traefik.http.routers.prometheus.middlewares=authelia@docker"

+  node-exporter:
+    image: prom/node-exporter
+    container_name: node-exporter
+    restart: unless-stopped
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+      - ./data/:/etc/node-exporter/textfile_collector/
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--path.rootfs=/rootfs'
+      - '--collector.textfile.directory=/etc/node-exporter/textfile_collector'
+    networks:
+      - internal
+
   alertmanager:
     container_name: alertmanager
-    image: prom/alertmanager
+    privileged: true
+    image: prom/alertmanager:latest
+    command:
+      - --cluster.advertise-address=192.168.0.66:9093
+      - --config.file=/etc/prometheus/alertmanager.yml
+      - --web.external-url=https://monitor.pukeko.xyz
     volumes:
-      - ./config/alerts.yml:/etc/prometheus/alerts.yml
       - ./config/alertmanager.yml:/etc/prometheus/alertmanager.yml
-    ports:
-      - '9093:9093'
     restart: unless-stopped
     networks:
       - internal

   cadvisor:
-    image: gcr.io/cadvisor/cadvisor:latest
+    image: gcr.io/cadvisor/cadvisor
     container_name: cadvisor
-    ports:
-      - 1010:8080
     volumes:
       - /:/rootfs:ro
       - /var/run:/var/run:rw
       - /sys:/sys:ro
       - /var/lib/docker/:/var/lib/docker:ro
-    depends_on:
-      - redis
+      - /sys/fs/cgroup:/sys/fs/cgroup:ro
+    command: ["--port=8080"]
     restart: unless-stopped
     networks:
       - internal

   redis:
     image: redis:alpine
     container_name: redis
-    ports:
-      - 6379:6379
     restart: unless-stopped
     networks:
       - internal

   grafana:
     container_name: grafana
     image: grafana/grafana
     depends_on:
       - prometheus
-    ports:
-      - '1000:3000'
     volumes:
       - './grafana/data:/var/lib/grafana'
       - './grafana/provisioning/:/etc/grafana/provisioning/'
@@ -72,18 +89,21 @@ services:
     restart: unless-stopped
     user: '1000'
     networks:
+      - traefik_network
       - network
       - internal
     labels:
       - "traefik.enable=true"
       - "traefik.docker.network=prometheus_network"
-      - "traefik.http.routers.grafana.entrypoints=websecure"
+      - "traefik.http.routers.grafana.entrypoints=pukekos"
       - "traefik.http.services.grafana.loadbalancer.server.port=3000"
       - "traefik.http.routers.grafana.rule=Host(`flight.pukeko.xyz`)"
-      - "traefik.http.routers.grafana.tls.certresolver=pukekoresolver"
+      - "traefik.http.routers.grafana.tls.certresolver=takaheresolver"
       - "traefik.http.routers.grafana.middlewares=authelia@docker"

 networks:
   network:
     driver: bridge
   internal:
     driver: bridge
+  traefik_network:
+    external: true
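Grafana mounts ./grafana/provisioning/, which suggests its Prometheus datasource is file-provisioned rather than created by hand. A minimal sketch of such a provisioning file (hypothetical path grafana/provisioning/datasources/prometheus.yml, not part of this diff), pointing at the prometheus service over the shared internal network:

apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true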
grafana.ini
@@ -45,7 +45,7 @@ domain = localhost
 enforce_domain = false

 # The full public facing url
-root_url = %(protocol)s://%(domain)s:%(http_port)s/
+root_url = https://flight.pukeko.xyz

 # Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons.
 serve_from_sub_path = false
@@ -507,34 +507,21 @@ role_attribute_strict = false

 #################################### Generic OAuth #######################
 [auth.generic_oauth]
-name = OAuth
-enabled = false
-allow_sign_up = true
-client_id = some_id
-client_secret =
-scopes = user:email
-empty_scopes = false
-email_attribute_name = email:primary
-email_attribute_path =
-login_attribute_path =
-name_attribute_path =
-role_attribute_path =
-role_attribute_strict = false
-groups_attribute_path =
-id_token_attribute_name =
-team_ids_attribute_path =
-auth_url =
-token_url =
-api_url =
-teams_url =
-allowed_domains =
-team_ids =
-allowed_organizations =
-tls_skip_verify_insecure = false
-tls_client_cert =
-tls_client_key =
-tls_client_ca =
-use_pkce = false
+enabled = true
+name = Authelia
+icon = signin
+client_id = grafana
+client_secret = P6x3vpNvZcLCZnmwts7E3sEYmtnLVx2cmjPafyFjNRHRsJmcBajaGYzdYjEB4iZemmCTK5H5QAxqg8fSmjMkydKkYcynDgbCciR3tdz3XbcKgRX3LpDVFHqejEKLPz7n
+scopes = openid profile groups email
+empty_scopes = false
+auth_url = https://auth.pukeko.xyz/api/oidc/authorize
+token_url = https://auth.pukeko.xyz/api/oidc/token
+api_url = https://auth.pukeko.xyz/api/oidc/userinfo
+login_attribute_path = preferred_username
+groups_attribute_path = groups
+name_attribute_path = name
+use_pkce = true

 #################################### Basic Auth ##########################
 [auth.basic]
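These generic_oauth settings assume a matching OIDC client registered on the Authelia side. A sketch of what that client block could look like in Authelia's configuration.yml — field names vary between Authelia versions, and newer releases expect the secret as a hashed digest rather than plain text:

identity_providers:
  oidc:
    clients:
      - id: grafana
        description: Grafana
        secret: '<secret matching client_secret above>'
        authorization_policy: two_factor
        redirect_uris:
          - https://flight.pukeko.xyz/login/generic_oauth
        scopes:
          - openid
          - profile
          - groups
          - email
        userinfo_signing_algorithm: none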