refactor: provision alerts instead of putting them in the /var/lib folder #16

Merged
p4u1 merged 1 commit from provision-alerts into main 2026-03-20 14:10:10 +00:00
4 changed files with 140 additions and 6 deletions

View File

@ -5,13 +5,13 @@ export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v2
export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v2
export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v2
export GRAFANA_BACKUP_DASHBOARD_JSON_VERSION=v1
export GRAFANA_ALERTS_JSON_VERSION=v3
export GRAFANA_CUSTOM_INI_VERSION=v4
export PROMTAIL_YML_VERSION=v3
export LOKI_YML_VERSION=v2
export PROMETHEUS_YML_VERSION=v2
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
export GRAFANA_ALERTS_NODE_VERSION=v1c
# creates a default prometheus scrape config for a given node
add_node(){

131
alerts/node.yml.tmpl Normal file
View File

@ -0,0 +1,131 @@
# Grafana alert-rule provisioning for node-level alerts (disk space, memory).
#
# This file is a Go template: each rule is toggled by an environment variable
# read with `env`. When a toggle is "true" the rule is emitted under `groups`;
# otherwise its UID is listed under `deleteRules` so a previously provisioned
# copy is removed.
#
#   ALERT_NODE_DISK_SPACE_ENABLED    -> rule bds8bhxu97pxca (Node Disk Space)
#   ALERT_NODE_MEMORY_USAGE_ENABLED  -> rule ads8cswmly96oa (Node Memory Usage)
#
# NOTE(review): datasourceUid PBFA97CFB590B2093 is presumably the Prometheus
# datasource of this stack - confirm against the datasource provisioning.
apiVersion: 1

# List of alert rule UIDs that should be deleted
deleteRules:
  {{ if ne (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
  - orgId: 1
    uid: bds8bhxu97pxca
  {{ end }}
  {{ if ne (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
  - orgId: 1
    uid: ads8cswmly96oa
  {{ end }}

groups:
  - orgId: 1
    name: node
    folder: node
    interval: 5m
    rules:
      {{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
      # Fires when an ext4 filesystem has had less than 10% free space for 5m.
      - uid: bds8bhxu97pxca
        title: Node Disk Space
        condition: C
        data:
          # A: percentage of free space per ext4 filesystem.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: (node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"}) * 100
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          # C: threshold expression over A - alert when A < 10.
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 10
                    type: lt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          description: ""
          runbook_url: ""
          # The backtick-quoted segments are emitted verbatim by this template
          # so that Grafana (not the config templating) expands them at alert time.
          summary: Less than 10% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% left)
        isPaused: false
      {{ end }}
      {{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
      # Fires when memory usage has been above 85% for 5m.
      - uid: ads8cswmly96oa
        title: Node Memory Usage
        condition: C
        data:
          # A: percentage of memory in USE. MemAvailable/MemTotal is the share
          # that is still available, so it is inverted here; comparing the
          # available share against "> 85" would fire on idle nodes instead of
          # nodes under memory pressure.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          # C: threshold expression over A - alert when A > 85.
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 85
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: Memory usage is above 85% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
        isPaused: false
      {{ end }}

View File

@ -2,6 +2,7 @@ version: '3.8'
services:
grafana:
secrets:
- grafana_oidc_client_secret
environment:
- OIDC_API_URL

View File

@ -22,8 +22,8 @@ services:
target: /var/lib/grafana/dashboards/traefik.json
- source: grafana_backup_dashboard_json
target: /var/lib/grafana/dashboards/backup.json
- source: grafana_alerts_json
target: /var/lib/grafana/alerts/alerts.json
- source: gf_alerts_node
target: /etc/grafana/provisioning/alerting/node.yml
networks:
- proxy
- internal
@ -32,6 +32,8 @@ services:
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
- GF_SECURITY_ALLOW_EMBEDDING
- GF_INSTALL_PLUGINS
- ALERT_NODE_DISK_SPACE_ENABLED
- ALERT_NODE_MEMORY_USAGE_ENABLED
deploy:
labels:
- "traefik.enable=true"
@ -71,10 +73,10 @@ configs:
grafana_backup_dashboard_json:
name: ${STACK_NAME}_g_backup_dashboard_json_${GRAFANA_BACKUP_DASHBOARD_JSON_VERSION}
file: grafana-backup-dashboard.json
grafana_alerts_json:
gf_alerts_node:
template_driver: golang
name: ${STACK_NAME}_g_alerts_json_${GRAFANA_ALERTS_JSON_VERSION}
file: grafana-alerts.json.tmpl
name: ${STACK_NAME}_gf_alerts_node_${GRAFANA_ALERTS_NODE_VERSION}
file: alerts/node.yml.tmpl
volumes:
grafana-data: