refactor: provision alerts instead of putting them in the /var/lib folder #16
2
abra.sh
2
abra.sh
@ -5,13 +5,13 @@ export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v2
|
||||
export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v2
|
||||
export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v2
|
||||
export GRAFANA_BACKUP_DASHBOARD_JSON_VERSION=v1
|
||||
export GRAFANA_ALERTS_JSON_VERSION=v3
|
||||
export GRAFANA_CUSTOM_INI_VERSION=v4
|
||||
export PROMTAIL_YML_VERSION=v3
|
||||
export LOKI_YML_VERSION=v2
|
||||
export PROMETHEUS_YML_VERSION=v2
|
||||
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
|
||||
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
|
||||
export GRAFANA_ALERTS_NODE_VERSION=v1c
|
||||
|
||||
# creates a default prometheus scrape config for a given node
|
||||
add_node(){
|
||||
|
||||
131
alerts/node.yml.tmpl
Normal file
131
alerts/node.yml.tmpl
Normal file
@ -0,0 +1,131 @@
|
||||
# Grafana alert-rule provisioning file for node-level alerts.
#
# This file is a Go template: it is rendered by the config template driver
# before Grafana reads it from /etc/grafana/provisioning/alerting/node.yml.
# Each rule is toggled by an environment variable of the grafana service:
#   ALERT_NODE_DISK_SPACE_ENABLED    = "true" -> provision "Node Disk Space"
#   ALERT_NODE_MEMORY_USAGE_ENABLED  = "true" -> provision "Node Memory Usage"
# Any other value (including unset) puts the rule's UID into deleteRules
# instead, so a previously provisioned rule is removed again.
#
# Annotation templates meant for Grafana are wrapped in backtick-quoted
# template actions so the first rendering pass emits them literally.
apiVersion: 1

# List of alert rule UIDs that should be deleted
deleteRules:
{{ if ne (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
  - orgId: 1
    uid: bds8bhxu97pxca
{{ end }}
{{ if ne (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
  - orgId: 1
    uid: ads8cswmly96oa
{{ end }}

groups:
  - orgId: 1
    name: node
    folder: node
    interval: 5m
    rules:
{{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
      # Fires when the free-space percentage of an ext4 filesystem is below 10.
      - uid: bds8bhxu97pxca
        title: Node Disk Space
        condition: C
        data:
          # Query A: percentage of free space per ext4 filesystem.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            # NOTE(review): presumably the UID of the provisioned Prometheus
            # datasource - confirm against the datasource provisioning.
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: (node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"}) * 100
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          # Expression C: threshold on A (A < 10).
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 10
                    type: lt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          description: ""
          runbook_url: ""
          # Value is formatted to two decimals, matching the memory alert.
          summary: Less than 10% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% left)
        # NOTE(review): empty label key/value kept from the original; looks
        # like an export artifact - confirm whether it can be dropped.
        labels:
          "": ""
        isPaused: false
{{ end }}
{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
      # Fires when the used-memory percentage of a node is above 85.
      - uid: ads8cswmly96oa
        title: Node Memory Usage
        condition: C
        data:
          # Query A: percentage of memory in use. MemAvailable/MemTotal is the
          # available share, so it is subtracted from 1 to get usage; comparing
          # the available share against "> 85" would alert on idle hosts.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            # NOTE(review): presumably the UID of the provisioned Prometheus
            # datasource - confirm against the datasource provisioning.
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          # Expression C: threshold on A (A > 85).
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 85
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: Memory usage is above 85% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
        isPaused: false
{{ end }}
|
||||
@ -2,6 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
grafana:
|
||||
secrets:
|
||||
- grafana_oidc_client_secret
|
||||
environment:
|
||||
- OIDC_API_URL
|
||||
|
||||
@ -22,8 +22,8 @@ services:
|
||||
target: /var/lib/grafana/dashboards/traefik.json
|
||||
- source: grafana_backup_dashboard_json
|
||||
target: /var/lib/grafana/dashboards/backup.json
|
||||
- source: grafana_alerts_json
|
||||
target: /var/lib/grafana/alerts/alerts.json
|
||||
- source: gf_alerts_node
|
||||
target: /etc/grafana/provisioning/alerting/node.yml
|
||||
networks:
|
||||
- proxy
|
||||
- internal
|
||||
@ -32,6 +32,8 @@ services:
|
||||
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
|
||||
- GF_SECURITY_ALLOW_EMBEDDING
|
||||
- GF_INSTALL_PLUGINS
|
||||
- ALERT_NODE_DISK_SPACE_ENABLED
|
||||
- ALERT_NODE_MEMORY_USAGE_ENABLED
|
||||
deploy:
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
@ -71,10 +73,10 @@ configs:
|
||||
grafana_backup_dashboard_json:
|
||||
name: ${STACK_NAME}_g_backup_dashboard_json_${GRAFANA_BACKUP_DASHBOARD_JSON_VERSION}
|
||||
file: grafana-backup-dashboard.json
|
||||
grafana_alerts_json:
|
||||
gf_alerts_node:
|
||||
template_driver: golang
|
||||
name: ${STACK_NAME}_g_alerts_json_${GRAFANA_ALERTS_JSON_VERSION}
|
||||
file: grafana-alerts.json.tmpl
|
||||
name: ${STACK_NAME}_gf_alerts_node_${GRAFANA_ALERTS_NODE_VERSION}
|
||||
file: alerts/node.yml.tmpl
|
||||
|
||||
volumes:
|
||||
grafana-data:
|
||||
|
||||
Reference in New Issue
Block a user