From 0352a393de77d58c8462a69cf805f5cc7e3c8156 Mon Sep 17 00:00:00 2001 From: p4u1 Date: Mon, 30 Dec 2024 14:43:30 +0100 Subject: [PATCH] feat: Adds dashboard and alerts for backupbot --- .env.sample | 7 + README.md | 9 + abra.sh | 4 +- compose.grafana.yml | 11 ++ grafana-alerts.json.tmpl | 315 ++++++++++++++++++++++++++++++++++ grafana-backup-dashboard.json | 228 ++++++++++++++++++++++++ grafana-dashboards.yml | 10 ++ 7 files changed, 583 insertions(+), 1 deletion(-) create mode 100644 grafana-alerts.json.tmpl create mode 100644 grafana-backup-dashboard.json diff --git a/.env.sample b/.env.sample index f8215d3..25d0586 100644 --- a/.env.sample +++ b/.env.sample @@ -64,3 +64,10 @@ ENABLE_BACKUPS=true # GF_SMTP_SKIP_VERIFY=false # SECRET_GRAFANA_SMTP_PASSWORD_VERSION=v1 # + +# Alerts +#ALERT_BACKUP_FAILED_ENABLED=true +#ALERT_BACKUP_MISSING_ENABLED=true +#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true +#ALERT_NODE_DISK_SPACE_ENABLED=true +#ALERT_NODE_MEMORY_USAGE_ENABLED=true diff --git a/README.md b/README.md index dc43924..36166ca 100644 --- a/README.md +++ b/README.md @@ -129,3 +129,12 @@ After that you need to add the `pushgateway.${DOMAIN}` to the scare config. 
--- THX to the previous work of @decentral1se @knooflok @3wc @cellarspoon @mirsal + +## Alerts + +It is possible to enable the following alerts by setting the corresponding environment variable to `true`: +- backupbot failed: `ALERT_BACKUP_FAILED_ENABLED` +- backupbot missing: `ALERT_BACKUP_MISSING_ENABLED` +- backupbot not successful (backup did not finish within 20 minutes): `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED` +- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED` +- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED` diff --git a/abra.sh b/abra.sh index 73d63ae..85dc51d 100644 --- a/abra.sh +++ b/abra.sh @@ -1,9 +1,11 @@ export ENTRYPOINT_VERSION=v1 export GRAFANA_DATASOURCES_YML_VERSION=v1 -export GRAFANA_DASHBOARDS_YML_VERSION=v1 +export GRAFANA_DASHBOARDS_YML_VERSION=v2 export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v2 export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v2 export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v2 +export GRAFANA_BACKUP_DASHBOARD_JSON_VERSION=v1 +export GRAFANA_ALERTS_JSON_VERSION=v3 export GRAFANA_CUSTOM_INI_VERSION=v4 export PROMTAIL_YML_VERSION=v3 export LOKI_YML_VERSION=v2 diff --git a/compose.grafana.yml b/compose.grafana.yml index f5495e7..36c605a 100644 --- a/compose.grafana.yml +++ b/compose.grafana.yml @@ -22,6 +22,10 @@ services: target: /var/lib/grafana/dashboards/docker-swarm-stacks.json - source: grafana_traefik_dashboard_json target: /var/lib/grafana/dashboards/traefik.json + - source: grafana_backup_dashboard_json + target: /var/lib/grafana/dashboards/backup.json + - source: grafana_alerts_json + target: /var/lib/grafana/alerts/alerts.json networks: - proxy - internal @@ -76,6 +80,13 @@ configs: grafana_traefik_dashboard_json: name: ${STACK_NAME}_g_traefik_dashboard_json_${GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION} file: grafana-traefik-dashboard.json + grafana_backup_dashboard_json: + name: ${STACK_NAME}_g_backup_dashboard_json_${GRAFANA_BACKUP_DASHBOARD_JSON_VERSION} + file: grafana-backup-dashboard.json + grafana_alerts_json: + template_driver: golang + name: 
${STACK_NAME}_g_alerts_json_${GRAFANA_ALERTS_JSON_VERSION} + file: grafana-alerts.json.tmpl volumes: grafana-data: diff --git a/grafana-alerts.json.tmpl b/grafana-alerts.json.tmpl new file mode 100644 index 0000000..c7b8cab --- /dev/null +++ b/grafana-alerts.json.tmpl @@ -0,0 +1,315 @@ +{ + "apiVersion": 1, + "groups": [ + { + "orgId": 1, + "name": "backupbot", + "folder": "node", + "interval": "1m", + "rules": [ + {{ if eq (env "ALERT_BACKUP_FAILED_ENABLED") "true" }} + { + "uid": "de8e5xxup7t34a", + "title": "Backup Failed", + "condition": "C", + "data": [ + { + "refId": "A", + "relativeTimeRange": { "from": 600, "to": 0 }, + "datasourceUid": "PBFA97CFB590B2093", + "model": { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "backup", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "intervalMs": 1000, + "legendFormat": "__auto", + "maxDataPoints": 43200, + "range": false, + "refId": "A", + "useBackend": false + } + }, + { + "refId": "C", + "relativeTimeRange": { "from": 600, "to": 0 }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { "params": [0], "type": "lt" }, + "operator": { "type": "and" }, + "query": { "params": ["C"] }, + "reducer": { "params": [], "type": "last" }, + "type": "query" + } + ], + "datasource": { "type": "__expr__", "uid": "__expr__" }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "NoData", + "execErrState": "Error", + "for": "1m", + "isPaused": false + }, + {{ end }} + {{ if eq (env "ALERT_BACKUP_MISSING_ENABLED") "true" }} + { + "uid": "ce8e65uddcwe8d", + "title": "Backup Missing", + "condition": "B", + "data": [ + { + "refId": "A", + "relativeTimeRange": { "from": 600, "to": 0 }, + "datasourceUid": "PBFA97CFB590B2093", + "model": { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate(backup[24h])", + "fullMetaSearch": false, + "includeNullMetadata": 
true, + "instant": true, + "intervalMs": 1000, + "legendFormat": "__auto", + "maxDataPoints": 43200, + "range": false, + "refId": "A", + "useBackend": false + } + }, + { + "refId": "B", + "relativeTimeRange": { "from": 600, "to": 0 }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { "params": [0, 0], "type": "within_range" }, + "operator": { "type": "and" }, + "query": { "params": ["C"] }, + "reducer": { "params": [], "type": "last" }, + "type": "query" + } + ], + "datasource": { "type": "__expr__", "uid": "__expr__" }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "B", + "type": "threshold" + } + } + ], + "noDataState": "NoData", + "execErrState": "Error", + "for": "5m", + "isPaused": false + }, + {{ end }} + {{ if eq (env "ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED") "true" }} + { + "uid": "de8e6bc92a8lcc", + "title": "Backup Not Successfull", + "condition": "B", + "data": [ + { + "refId": "A", + "relativeTimeRange": { + "from": 60, + "to": 0 + }, + "datasourceUid": "PBFA97CFB590B2093", + "model": { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "backup", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "intervalMs": 1000, + "legendFormat": "__auto", + "maxDataPoints": 43200, + "range": false, + "refId": "A", + "useBackend": false + } + }, + { + "refId": "B", + "relativeTimeRange": { + "from": 60, + "to": 0 + }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 0 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "B", + "type": "threshold" + } + } + ], + "noDataState": "NoData", + "execErrState": "Error", + "for": "20m", + 
"annotations": { + "summary": "Backup did not finish within 20 minutes" + }, + "labels": {}, + "isPaused": false + } + {{ end }} + ] + }, + { + "orgId": 1, + "name": "node", + "folder": "node", + "interval": "5m", + "rules": [ + {{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }} + { + "uid": "bds8bhxu97pxca", + "title": "Node Disk Space", + "condition": "C", + "data": [ + { + "refId": "A", + "relativeTimeRange": { "from": 600, "to": 0 }, + "datasourceUid": "PBFA97CFB590B2093", + "model": { + "editorMode": "code", + "expr": "(node_filesystem_free_bytes{fstype=\"ext4\",mountpoint=~\"(/$)|(/media.*)\"} / node_filesystem_size_bytes{fstype=\"ext4\",mountpoint=~\"(/$)|(/media.*)\"}) * 100", + "instant": true, + "intervalMs": 1000, + "legendFormat": "__auto", + "maxDataPoints": 43200, + "range": false, + "refId": "A" + } + }, + { + "refId": "C", + "relativeTimeRange": { "from": 600, "to": 0 }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { "params": [10], "type": "lt" }, + "operator": { "type": "and" }, + "query": { "params": ["C"] }, + "reducer": { "params": [], "type": "last" }, + "type": "query" + } + ], + "datasource": { "type": "__expr__", "uid": "__expr__" }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "NoData", + "execErrState": "Error", + "for": "5m", + "annotations": {}, + "labels": {}, + "isPaused": false + }, + {{ end }} + {{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }} + { + "uid": "ads8cswmly96oa", + "title": "Node Memory Usage", + "condition": "C", + "data": [ + { + "refId": "A", + "relativeTimeRange": { "from": 600, "to": 0 }, + "datasourceUid": "PBFA97CFB590B2093", + "model": { + "editorMode": "code", + "expr": "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", + "instant": true, + "intervalMs": 1000, + "legendFormat": "__auto", + "maxDataPoints": 43200, + "range": false, + "refId": "A" + } + 
}, + { + "refId": "C", + "relativeTimeRange": { "from": 600, "to": 0 }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { "params": [10], "type": "lt" }, {{/* query A is the percentage of memory still AVAILABLE, so memory usage above 90% means A below 10 (same shape as the Node Disk Space rule) */}} + "operator": { "type": "and" }, + "query": { "params": ["C"] }, + "reducer": { "params": [], "type": "last" }, + "type": "query" + } + ], + "datasource": { "type": "__expr__", "uid": "__expr__" }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "C", + "type": "threshold" + } + } + ], + "noDataState": "NoData", + "execErrState": "Error", + "for": "5m", + "annotations": {}, + "labels": {}, + "isPaused": false + } + {{ end }} + ] + } + ] +} + diff --git a/grafana-backup-dashboard.json b/grafana-backup-dashboard.json new file mode 100644 index 0000000..64f02e0 --- /dev/null +++ b/grafana-backup-dashboard.json @@ -0,0 +1,228 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 6, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 2, + "axisSoftMin": -2, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": 
{ + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "0": { + "color": "dark-green", + "index": 0 + }, + "1": { + "color": "dark-yellow", + "index": 1, + "text": "Running" + }, + "-1": { + "index": 2, + "text": "Fail" + } + }, + "type": "value" + } + ], + "max": 1, + "min": -1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "string" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "backup", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Backup Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "gridPos": { + "h": 11, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 2, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "editorMode": "builder", + "expr": "{service_name=\"$ServiceName\"} |= ``", + "queryType": "range", + "refId": "A" + } + ], + "title": "Backupbot Logs", + "type": "logs" + } + ], + "refresh": "auto", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "backup_marx_klasse-methode_it_app", + "value": "backup_marx_klasse-methode_it_app" + }, + 
"datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Backupbot Service", + "multi": false, + "name": "ServiceName", + "options": [], + "query": { + "label": "service_name", + "refId": "LokiVariableQueryEditor-VariableQuery", + "stream": "", + "type": 1 + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "backupbot-two", + "uid": "be8e2xeofw4xsa", + "version": 3, + "weekStart": "" +} diff --git a/grafana-dashboards.yml b/grafana-dashboards.yml index 8411cca..799f4a1 100644 --- a/grafana-dashboards.yml +++ b/grafana-dashboards.yml @@ -11,3 +11,13 @@ providers: options: path: /var/lib/grafana/dashboards foldersFromFilesStructure: true + - name: 'default-alert-provider' + orgId: 1 + folder: 'default-alerts' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/alerts + foldersFromFilesStructure: true