Fixed Prometheus and added bunches of alerts
config/alertmanager.yml (Normal file → Executable file)
@@ -7,6 +7,7 @@ route:
   group_interval: 10s
   repeat_interval: 24h
   receiver: 'email'

 receivers:
   - name: 'email'
     email_configs:
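The hunk cuts off at the email_configs: line, so the actual mail settings are not visible here. For reference only, a minimal Alertmanager email receiver usually looks like the sketch below; every address and SMTP value in it is a placeholder, not something taken from this repository.

    receivers:
      - name: 'email'
        email_configs:
          - to: 'alerts@example.org'              # placeholder recipient
            from: 'alertmanager@example.org'      # placeholder sender
            smarthost: 'smtp.example.org:587'     # placeholder SMTP relay
            auth_username: 'alertmanager@example.org'
            auth_password: 'changeme'
            send_resolved: true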

config/alerts.yml
@@ -1,14 +1,177 @@
 groups:
-- name: Uptime
+- name: Takahe
   rules:
   - alert: InstanceDown
     expr: up{job="services"} < 1
     for: 5m

+- name: Usage
+  rules:
+  - alert: HostOutOfMemoryWarning
+    expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of memory (instance {{ $labels.instance }})
+      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      source: 'https://monitor.pukeko.xyz'

+  - alert: HostOutOfMemory
+    expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of memory (instance {{ $labels.instance }})
+      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      source: 'https://monitor.pukeko.xyz'

+  - alert: HostOomKillDetected
+    expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host OOM kill detected (instance {{ $labels.instance }})
+      description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      source: 'https://monitor.pukeko.xyz'

 - name: Storage
   rules:
-  - alert: HighRootFSDiskUsage
-    expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
-    for: 1m
-  - alert: HighRedVolDiskUsage
-    expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
-    for: 1m
+  - alert: HighFilesystemUsage
+    expr: 100 * (1 - (btrfs_filesystem_free / btrfs_filesystem_size)) > 90
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "High Btrfs filesystem usage on {{ $labels.instance }}"
+      description: "Btrfs filesystem usage is above 90% on {{ $labels.instance }}"
+      source: 'https://monitor.pukeko.xyz'

+  - alert: HighRootFSDiskUsage
+    expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"}) > 80
+    for: 1m
+    labels:
+      severity: error
+    annotations:
+      summary: "Low disk space on Takahe"
+      description: "root disk is filling up on {{ $labels.instance }}"
+      source: 'https://monitor.pukeko.xyz'

+  - alert: HighRedVolDiskUsage
+    expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/Red-Vol",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/Red-Vol",fstype!="rootfs"}) > 70
+    for: 1m
+    labels:
+      severity: error
+    annotations:
+      summary: "Low disk space on Red-Vol"
+      description: "Red-Vol is filling up on {{ $labels.instance }}"
+      source: 'https://monitor.pukeko.xyz'

+  - alert: DegradedBtrfsRAID
+    expr: btrfs_raid_status == 1
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "Degraded Btrfs RAID on {{ $labels.instance }}"
+      description: "The Btrfs RAID array is in a degraded state on {{ $labels.instance }}"
+      source: 'https://monitor.pukeko.xyz'

+  - alert: BtrfsRAIDScrubFailed
+    expr: btrfs_raid_scrub_status == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Btrfs RAID scrub failed on {{ $labels.instance }}"
+      description: "The Btrfs RAID scrub failed on {{ $labels.instance }}"
+      source: 'https://monitor.pukeko.xyz'

+- name: Docker
+  rules:

+  - alert: ContainerHighCpuUtilization
+    expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Container High CPU utilization (instance {{ $labels.instance }})
+      description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      source: 'https://monitor.pukeko.xyz'

+  - alert: ContainerHighMemoryUsage
+    expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Container High Memory usage (instance {{ $labels.instance }})
+      description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      source: 'https://monitor.pukeko.xyz'

+  - alert: ContainerUpdateStatus
+    expr: |
+      increase(container_updated[1m]) > 0 and container_updated == 1
+    for: 1m
+    labels:
+      severity: info
+    annotations:
+      summary: "Container update status"
+      description: "The container was successfully updated."
+      source: 'https://monitor.pukeko.xyz'

+  - alert: NewContainerUpdate
+    expr: |
+      container_updated{job="takahe", name=~".+"} == 1
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "New container update detected"
+      description: "A new container ('{{ $labels.name }}') was successfully updated."
+      source: 'https://monitor.pukeko.xyz'

+  - alert: ContainerUpdateFailure
+    expr: |
+      container_updated == 0 or container_updated == -1
+    labels:
+      severity: critical
+    annotations:
+      summary: "Container update failed"
+      description: "The container update metric indicates a failure. Check logs for details."
+      source: 'https://monitor.pukeko.xyz'

+- name: Backups
+  rules:
+  - alert: KumonoboruFailure
+    expr: |
+      up{job="systemd", unit="Kumonoboru", state="failed"} == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Service failure detected in {{ $labels.instance }}"
+      description: "The service '{{ $labels.instance }}' has a failed status"
+      source: 'https://monitor.pukeko.xyz'

+  - alert: KumonoboruTimerFailure
+    expr: |
+      up{job="systemd", unit="Kumonoboru", state="failed"} == 1
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Timer failure detected in {{ $labels.instance }}"
+      description: "The timer '{{ $labels.instance }}' has a failed status"
+      source: 'https://monitor.pukeko.xyz'
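Not part of the commit, but the rule file can be validated with promtool, which ships in the prom/prometheus image; the container path below assumes the ./config/ to /etc/prometheus/ bind mount from this repository's compose file.

    # validate the alert rules before (re)starting Prometheus
    docker compose exec prometheus promtool check rules /etc/prometheus/alerts.yml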

config/prometheus.yml
@@ -1,11 +1,11 @@
 global:
-  scrape_interval: 15s
+  scrape_interval: 15s
   evaluation_interval: 15s

 alerting:
   alertmanagers:
-    - static_configs:
-        - targets: ['alertmanager:9093']
+    - static_configs:
+        - targets: ['alertmanager:9093']
 rule_files:
   - alerts.yml

@@ -13,11 +13,12 @@ scrape_configs:
   - job_name: prometheus
     static_configs:
      - targets: ['prometheus:9090']

   - job_name: takahe
     static_configs:
-      - targets: ['192.168.0.66:9100']
+      - targets: ['node-exporter:9100']

   - job_name: cadvisor
     scrape_interval: 5s
     static_configs:
-      - targets:
-          - cadvisor:8080
+      - targets: ['cadvisor:8080']
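The full scrape configuration can be checked the same way, and once the stack is running the targets can be verified over the Prometheus HTTP API; the hostname and port below assume the 9090:9090 mapping from the compose file.

    # validate prometheus.yml (also loads the rule_files it references)
    docker compose exec prometheus promtool check config /etc/prometheus/prometheus.yml
    # every configured job should report up == 1
    curl -s 'http://localhost:9090/api/v1/query?query=up'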

docker-compose.yml
@@ -1,5 +1,5 @@
 version: '3.2'
 services:

   prometheus:
     image: prom/prometheus:latest
     container_name: prometheus
@@ -7,9 +7,9 @@ services:
       - 9090:9090
     command:
       - --config.file=/etc/prometheus/prometheus.yml
+      - --web.external-url=https://monitor.pukeko.xyz
     volumes:
-      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
-      - ./config/alerts.yml:/etc/prometheus/alerts.yml
+      - ./config/:/etc/prometheus/
     depends_on:
       - cadvisor
     restart: unless-stopped
@@ -19,52 +19,69 @@ services:
     labels:
       - "traefik.enable=true"
       - "traefik.docker.network=prometheus_network"
-      - "traefik.http.routers.prometheus.entrypoints=websecure"
+      - "traefik.http.routers.prometheus.entrypoints=pukekos"
       - "traefik.http.routers.prometheus.rule=Host(`monitor.pukeko.xyz`)"
-      - "traefik.http.routers.prometheus.tls.certresolver=pukekoresolver"
+      - "traefik.http.routers.prometheus.tls.certresolver=takaheresolver"
       - "traefik.http.routers.prometheus.middlewares=authelia@docker"

+  node-exporter:
+    image: prom/node-exporter
+    container_name: node-exporter
+    restart: unless-stopped
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+      - ./data/:/etc/node-exporter/textfile_collector/
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--path.rootfs=/rootfs'
+      - '--collector.textfile.directory=/etc/node-exporter/textfile_collector'
+    networks:
+      - internal
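The ./data/ mount feeds node-exporter's textfile collector, which is presumably where the custom container_updated metric used by the Docker alerts comes from; the script that writes it is not part of this diff. As a rough illustration only, a file dropped into that directory would use the Prometheus text exposition format, for example:

    # ./data/container_updates.prom  (hypothetical file and container name)
    # TYPE container_updated gauge
    container_updated{name="takahe-web"} 1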

   alertmanager:
     container_name: alertmanager
-    image: prom/alertmanager
-    privileged: true
+    image: prom/alertmanager:latest
     command:
       - --cluster.advertise-address=192.168.0.66:9093
       - --config.file=/etc/prometheus/alertmanager.yml
+      - --web.external-url=https://monitor.pukeko.xyz
     volumes:
       - ./config/alerts.yml:/etc/prometheus/alerts.yml
       - ./config/alertmanager.yml:/etc/prometheus/alertmanager.yml
     ports:
       - '9093:9093'
     restart: unless-stopped
+    networks:
+      - internal

   cadvisor:
-    image: gcr.io/cadvisor/cadvisor:latest
+    image: gcr.io/cadvisor/cadvisor
     container_name: cadvisor
     ports:
       - 1010:8080
     volumes:
-      - /:/rootfs:ro
-      - /var/run:/var/run:rw
-      - /sys:/sys:ro
-      - /var/lib/docker/:/var/lib/docker:ro
-    depends_on:
-      - redis
+      - /:/rootfs:ro
+      - /var/run:/var/run:rw
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /sys/fs/cgroup:/sys/fs/cgroup:ro
+    command: ["--port=8080"]
     restart: unless-stopped
+    networks:
+      - internal

   redis:
     image: redis:alpine
     container_name: redis
     ports:
       - 6379:6379
     restart: unless-stopped
+    networks:
+      - internal

   grafana:
     container_name: grafana
     image: grafana/grafana
     depends_on:
       - prometheus
     ports:
       - '1000:3000'
     volumes:
       - './grafana/data:/var/lib/grafana'
       - './grafana/provisioning/:/etc/grafana/provisioning/'
@@ -72,18 +89,21 @@ services:
     restart: unless-stopped
     user: '1000'
     networks:
       - traefik_network
-      - network
+      - internal
     labels:
       - "traefik.enable=true"
       - "traefik.docker.network=prometheus_network"
-      - "traefik.http.routers.grafana.entrypoints=websecure"
+      - "traefik.http.routers.grafana.entrypoints=pukekos"
       - "traefik.http.services.grafana.loadbalancer.server.port=3000"
       - "traefik.http.routers.grafana.rule=Host(`flight.pukeko.xyz`)"
-      - "traefik.http.routers.grafana.tls.certresolver=pukekoresolver"
+      - "traefik.http.routers.grafana.tls.certresolver=takaheresolver"
       - "traefik.http.routers.grafana.middlewares=authelia@docker"

 networks:
   network:
     driver: bridge
+  internal:
+    driver: bridge
   traefik_network:
     external: true
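A quick way to confirm the edited compose file still parses is docker compose config, which renders the resolved configuration and fails on syntax errors (assuming Docker Compose v2 is installed):

    docker compose config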

grafana.ini
@@ -45,7 +45,7 @@ domain = localhost
 enforce_domain = false

 # The full public facing url
-root_url = %(protocol)s://%(domain)s:%(http_port)s/
+root_url = https://flight.pukeko.xyz

 # Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons.
 serve_from_sub_path = false
@@ -507,34 +507,21 @@ role_attribute_strict = false

 #################################### Generic OAuth #######################
 [auth.generic_oauth]
-name = OAuth
-enabled = false
-allow_sign_up = true
-client_id = some_id
-client_secret =
-scopes = user:email
-empty_scopes = false
-email_attribute_name = email:primary
-email_attribute_path =
-login_attribute_path =
-name_attribute_path =
-role_attribute_path =
-role_attribute_strict = false
-groups_attribute_path =
-id_token_attribute_name =
-team_ids_attribute_path =
-auth_url =
-token_url =
-api_url =
-teams_url =
-allowed_domains =
-team_ids =
-allowed_organizations =
-tls_skip_verify_insecure = false
-tls_client_cert =
-tls_client_key =
-tls_client_ca =
-use_pkce = false
+enabled = true
+name = Authelia
+icon = signin
+client_id = grafana
+client_secret = P6x3vpNvZcLCZnmwts7E3sEYmtnLVx2cmjPafyFjNRHRsJmcBajaGYzdYjEB4iZemmCTK5H5QAxqg8fSmjMkydKkYcynDgbCciR3tdz3XbcKgRX3LpDVFHqejEKLPz7n
+scopes = openid profile groups email
+empty_scopes = false
+auth_url = https://auth.pukeko.xyz/api/oidc/authorize
+token_url = https://auth.pukeko.xyz/api/oidc/token
+api_url = https://auth.pukeko.xyz/api/oidc/userinfo
+login_attribute_path = preferred_username
+groups_attribute_path = groups
+name_attribute_path = name
+use_pkce = true

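For the Authelia side of this handshake, which is not shown in the commit, the matching OIDC client entry in Authelia's configuration would look roughly like the sketch below. The redirect URI and authorization policy are assumptions based on the Grafana URL above, the secret must match the client_secret set here, and the exact key names vary between Authelia versions.

    identity_providers:
      oidc:
        clients:
          - id: grafana
            description: Grafana                     # free-form label
            secret: '<same value as client_secret>'  # placeholder, keep out of version control
            authorization_policy: two_factor         # assumption
            scopes: [openid, profile, groups, email]
            redirect_uris:
              - https://flight.pukeko.xyz/login/generic_oauth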

 #################################### Basic Auth ##########################
 [auth.basic]