Provision Grafana node alerts from a Go-templated YAML file.

* abra.sh: drops GRAFANA_ALERTS_JSON_VERSION and adds
  GRAFANA_ALERTS_NODE_VERSION, which is used in the name of the new config.
* alerts/node.yml.tmpl (new): two alert rules in group "node", "Node Disk Space"
  and "Node Memory Usage". Each rule is emitted only when its environment
  variable (ALERT_NODE_DISK_SPACE_ENABLED / ALERT_NODE_MEMORY_USAGE_ENABLED)
  equals "true"; otherwise the rule's uid is listed under deleteRules instead.
* compose.grafana-oidc.yml: adds the "secrets:" key above the
  grafana_oidc_client_secret list item.
* compose.grafana.yml: replaces the grafana_alerts_json config with
  gf_alerts_node, mounted at /etc/grafana/provisioning/alerting/node.yml, and
  passes the two ALERT_NODE_* variables to the grafana service.

Review fix: the memory rule previously evaluated MemAvailable / MemTotal (the
percentage of memory still available) against "gt 85" while its summary
reports usage above 85%. The expression now computes the used percentage.

NOTE(review): leading whitespace inside the hunks was reconstructed from a
whitespace-collapsed copy of this patch. If context lines do not match the
target tree, apply with whitespace-insensitive matching. The blob ids on the
"index" lines are those of the original commit.

diff --git a/abra.sh b/abra.sh
index fa5dcc4..bf3a0dc 100644
--- a/abra.sh
+++ b/abra.sh
@@ -5,13 +5,13 @@ export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v2
 export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v2
 export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v2
 export GRAFANA_BACKUP_DASHBOARD_JSON_VERSION=v1
-export GRAFANA_ALERTS_JSON_VERSION=v3
 export GRAFANA_CUSTOM_INI_VERSION=v4
 export PROMTAIL_YML_VERSION=v3
 export LOKI_YML_VERSION=v2
 export PROMETHEUS_YML_VERSION=v2
 export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
 export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
+export GRAFANA_ALERTS_NODE_VERSION=v1c
 
 # creates a default prometheus scrape config for a given node
 add_node(){
diff --git a/alerts/node.yml.tmpl b/alerts/node.yml.tmpl
new file mode 100644
index 0000000..d984fc7
--- /dev/null
+++ b/alerts/node.yml.tmpl
@@ -0,0 +1,132 @@
+apiVersion: 1
+
+# List of alert rule UIDs that should be deleted
+deleteRules:
+  {{ if ne (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
+  - orgId: 1
+    uid: bds8bhxu97pxca
+  {{ end }}
+  {{ if ne (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
+  - orgId: 1
+    uid: ads8cswmly96oa
+  {{ end }}
+
+groups:
+- orgId: 1
+  name: node
+  folder: node
+  interval: 5m
+  rules:
+  {{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
+  - uid: bds8bhxu97pxca
+    title: Node Disk Space
+    condition: C
+    data:
+    - refId: A
+      relativeTimeRange:
+        from: 600
+        to: 0
+      datasourceUid: PBFA97CFB590B2093
+      model:
+        editorMode: code
+        expr: (node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"}) * 100
+        instant: true
+        intervalMs: 1000
+        legendFormat: __auto
+        maxDataPoints: 43200
+        range: false
+        refId: A
+    - refId: C
+      relativeTimeRange:
+        from: 600
+        to: 0
+      datasourceUid: __expr__
+      model:
+        conditions:
+        - evaluator:
+            params:
+            - 10
+            type: lt
+          operator:
+            type: and
+          query:
+            params:
+            - C
+          reducer:
+            params: []
+            type: last
+          type: query
+        datasource:
+          type: __expr__
+          uid: __expr__
+        expression: A
+        intervalMs: 1000
+        maxDataPoints: 43200
+        refId: C
+        type: threshold
+    noDataState: NoData
+    execErrState: Error
+    for: 5m
+    annotations:
+      description: ""
+      runbook_url: ""
+      summary: Less than 10% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
+    labels:
+      "": ""
+    isPaused: false
+  {{ end }}
+  {{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
+  - uid: ads8cswmly96oa
+    title: Node Memory Usage
+    condition: C
+    data:
+    - refId: A
+      relativeTimeRange:
+        from: 600
+        to: 0
+      datasourceUid: PBFA97CFB590B2093
+      model:
+        editorMode: code
+        # MemAvailable / MemTotal is the available fraction; subtract it from 1 so the threshold below applies to usage.
+        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
+        instant: true
+        intervalMs: 1000
+        legendFormat: __auto
+        maxDataPoints: 43200
+        range: false
+        refId: A
+    - refId: C
+      relativeTimeRange:
+        from: 600
+        to: 0
+      datasourceUid: __expr__
+      model:
+        conditions:
+        - evaluator:
+            params:
+            - 85
+            type: gt
+          operator:
+            type: and
+          query:
+            params:
+            - C
+          reducer:
+            params: []
+            type: last
+          type: query
+        datasource:
+          type: __expr__
+          uid: __expr__
+        expression: A
+        intervalMs: 1000
+        maxDataPoints: 43200
+        refId: C
+        type: threshold
+    noDataState: NoData
+    execErrState: Error
+    for: 5m
+    annotations:
+      summary: Memory usage is above 85% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
+    isPaused: false
+  {{ end }}
diff --git a/compose.grafana-oidc.yml b/compose.grafana-oidc.yml
index 8561020..a4a4bc8 100644
--- a/compose.grafana-oidc.yml
+++ b/compose.grafana-oidc.yml
@@ -2,6 +2,7 @@ version: '3.8'
 
 services:
   grafana:
+    secrets:
       - grafana_oidc_client_secret
     environment:
       - OIDC_API_URL
diff --git a/compose.grafana.yml b/compose.grafana.yml
index 87bd038..ab6dd70 100644
--- a/compose.grafana.yml
+++ b/compose.grafana.yml
@@ -22,8 +22,8 @@ services:
         target: /var/lib/grafana/dashboards/traefik.json
       - source: grafana_backup_dashboard_json
         target: /var/lib/grafana/dashboards/backup.json
-      - source: grafana_alerts_json
-        target: /var/lib/grafana/alerts/alerts.json
+      - source: gf_alerts_node
+        target: /etc/grafana/provisioning/alerting/node.yml
     networks:
       - proxy
       - internal
@@ -32,6 +32,8 @@ services:
       - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
       - GF_SECURITY_ALLOW_EMBEDDING
       - GF_INSTALL_PLUGINS
+      - ALERT_NODE_DISK_SPACE_ENABLED
+      - ALERT_NODE_MEMORY_USAGE_ENABLED
     deploy:
       labels:
         - "traefik.enable=true"
@@ -71,10 +73,10 @@ configs:
   grafana_backup_dashboard_json:
     name: ${STACK_NAME}_g_backup_dashboard_json_${GRAFANA_BACKUP_DASHBOARD_JSON_VERSION}
     file: grafana-backup-dashboard.json
-  grafana_alerts_json:
+  gf_alerts_node:
     template_driver: golang
-    name: ${STACK_NAME}_g_alerts_json_${GRAFANA_ALERTS_JSON_VERSION}
-    file: grafana-alerts.json.tmpl
+    name: ${STACK_NAME}_gf_alerts_node_${GRAFANA_ALERTS_NODE_VERSION}
+    file: alerts/node.yml.tmpl
 
 volumes:
   grafana-data: