8 Commits

Author SHA1 Message Date
0352a393de feat: Adds dashboard and alerts for backupbot 2024-12-30 14:43:30 +01:00
92e7bbc730 chore: publish 1.3.0+v1.8.1 release 2024-12-28 13:55:05 +01:00
5bf3d31c0f docs: Improves documentation for pushgateway 2024-12-28 13:53:31 +01:00
a14cb575a2 fix: shorten names for dashboard files 2024-12-21 14:37:10 +01:00
1a59dfac7f add pushgateway 2024-12-21 14:23:50 +01:00
a9b76dff65 update backupbot label 2024-10-24 17:32:29 +02:00
0401de1d16 chore: publish 1.2.0+v1.8.1 release 2024-07-17 13:36:04 +02:00
aa133fcfea add backup label 2024-04-17 16:02:38 +02:00
12 changed files with 638 additions and 15 deletions

View File

@ -3,6 +3,7 @@ LETS_ENCRYPT_ENV=production
COMPOSE_FILE=compose.yml
DOMAIN=monitoring.example.com
TIMEOUT=120
ENABLE_BACKUPS=true
## Enable this secret for Promtail / Prometheus
# SECRET_BASIC_AUTH_VERSION=v1
@ -16,6 +17,10 @@ TIMEOUT=120
## Prometheus
# COMPOSE_FILE="$COMPOSE_FILE:compose.prometheus.yml"
# PROMETHEUS_RETENTION_TIME=1y
#
## Prometheus Pushgateway
# COMPOSE_FILE="$COMPOSE_FILE:compose.pushgateway.yml"
#
## Loki
# Loki Server
#
@ -59,3 +64,10 @@ TIMEOUT=120
# GF_SMTP_SKIP_VERIFY=false
# SECRET_GRAFANA_SMTP_PASSWORD_VERSION=v1
#
# Alerts
#ALERT_BACKUP_FAILED_ENABLED=true
#ALERT_BACKUP_MISSING_ENABLED=true
#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true
#ALERT_NODE_DISK_SPACE_ENABLED=true
#ALERT_NODE_MEMORY_USAGE_ENABLED=true

View File

@ -54,7 +54,7 @@ Where gathering.org is the node you want to gather metrics from.
- monitoring.example.org
- prometheus.monitoring.example.org
- loki.monitoring.example.org
1. Setup monitoring stack
2. Setup monitoring stack
- `abra app new monitoring-ng`
- `abra app config monitoring.example.org`
Uncomment all the stuff
@ -62,7 +62,7 @@ Where gathering.org is the node you want to gather metrics from.
this needs the plaintext traefik basic-auth secret, not the hashed one!
- `abra app secret ls monitoring.example.org`
- `abra app deploy monitoring.example.org`
1. add scrape config to prometheus
3. Add scrape config to prometheus
- `abra app cmd monitoring.example.org prometheus gathering.org`
- or manually
```
@ -101,8 +101,18 @@ $ echo '{
$ systemctl restart docker.service
```
## Setup Push Gateway
1. Enable it in the env file by uncommenting the following lines:
```
## Prometheus Pushgateway
# COMPOSE_FILE="$COMPOSE_FILE:compose.pushgateway.yml"
```
2. `abra app deploy monitoring.example.org`
This will expose the pushgateway at `https://pushgateway.${DOMAIN}`.
It is secured behind the same basic auth as the other services.
After that you need to add `pushgateway.${DOMAIN}` to the scrape config.
## Post-setup guide
@ -119,3 +129,12 @@ $ systemctl restart docker.service
---
THX to the previous work of @decentral1se @knooflok @3wc @cellarspoon @mirsal
## Alerts
It is possible to enable the following alerts, by setting the corresponding env variable to `true`:
- backupbot failed: `ALERT_BACKUP_FAILED_ENABLED`
- backupbot missing: `ALERT_BACKUP_MISSING_ENABLED`
- backupbot not successful: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED`
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`

View File

@ -1,9 +1,11 @@
export ENTRYPOINT_VERSION=v1
export GRAFANA_DATASOURCES_YML_VERSION=v1
export GRAFANA_DASHBOARDS_YML_VERSION=v1
export GRAFANA_DASHBOARDS_YML_VERSION=v2
export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v2
export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v2
export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v2
export GRAFANA_BACKUP_DASHBOARD_JSON_VERSION=v1
export GRAFANA_ALERTS_JSON_VERSION=v3
export GRAFANA_CUSTOM_INI_VERSION=v4
export PROMTAIL_YML_VERSION=v3
export LOKI_YML_VERSION=v2

View File

@ -2,7 +2,7 @@ version: '3.8'
services:
grafana:
image: grafana/grafana:10.4.2
image: grafana/grafana:10.4.14
volumes:
- grafana-data:/var/lib/grafana:rw
secrets:
@ -22,6 +22,10 @@ services:
target: /var/lib/grafana/dashboards/docker-swarm-stacks.json
- source: grafana_traefik_dashboard_json
target: /var/lib/grafana/dashboards/traefik.json
- source: grafana_backup_dashboard_json
target: /var/lib/grafana/dashboards/backup.json
- source: grafana_alerts_json
target: /var/lib/grafana/alerts/alerts.json
networks:
- proxy
- internal
@ -62,20 +66,27 @@ configs:
name: ${STACK_NAME}_grafana_custom_ini_${GRAFANA_CUSTOM_INI_VERSION}
file: grafana_custom.ini
grafana_datasources_yml:
name: ${STACK_NAME}_grafana_datasources_yml_${GRAFANA_DATASOURCES_YML_VERSION}
name: ${STACK_NAME}_g_datasources_yml_${GRAFANA_DATASOURCES_YML_VERSION}
file: grafana-datasources.yml
grafana_dashboards_yml:
name: ${STACK_NAME}_grafana_dashboards_yml_${GRAFANA_DASHBOARDS_YML_VERSION}
name: ${STACK_NAME}_g_dashboards_yml_${GRAFANA_DASHBOARDS_YML_VERSION}
file: grafana-dashboards.yml
grafana_swarm_dashboard_json:
name: ${STACK_NAME}_grafana_swarm_dashboard_json_${GRAFANA_SWARM_DASHBOARD_JSON_VERSION}
name: ${STACK_NAME}_g_swarm_dashboard_json_${GRAFANA_SWARM_DASHBOARD_JSON_VERSION}
file: grafana-swarm-dashboard.json
grafana_stacks_dashboard_json:
name: ${STACK_NAME}_grafana_stacks_dashboard_json_${GRAFANA_STACKS_DASHBOARD_JSON_VERSION}
name: ${STACK_NAME}_g_stacks_dashboard_json_${GRAFANA_STACKS_DASHBOARD_JSON_VERSION}
file: grafana-stacks-dashboard.json
grafana_traefik_dashboard_json:
name: ${STACK_NAME}_grafana_traefik_dashboard_json_${GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION}
name: ${STACK_NAME}_g_traefik_dashboard_json_${GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION}
file: grafana-traefik-dashboard.json
grafana_backup_dashboard_json:
name: ${STACK_NAME}_g_backup_dashboard_json_${GRAFANA_BACKUP_DASHBOARD_JSON_VERSION}
file: grafana-backup-dashboard.json
grafana_alerts_json:
template_driver: golang
name: ${STACK_NAME}_g_alerts_json_${GRAFANA_ALERTS_JSON_VERSION}
file: grafana-alerts.json.tmpl
volumes:
grafana-data:

View File

@ -2,7 +2,7 @@ version: '3.8'
services:
loki:
image: grafana/loki:2.9.7
image: grafana/loki:2.9.11
command: -config.file=/etc/loki/local-config.yaml
networks:
- proxy

View File

@ -2,7 +2,7 @@ version: '3.8'
services:
prometheus:
image: prom/prometheus:v2.51.2
image: prom/prometheus:v2.55.1
secrets:
- basic_auth
volumes:

View File

@ -2,7 +2,7 @@ version: "3.8"
services:
promtail:
image: grafana/promtail:2.9.7
image: grafana/promtail:2.9.11
volumes:
- /var/log:/var/log:ro
- /var/run/docker.sock:/var/run/docker.sock

25
compose.pushgateway.yml Normal file
View File

@ -0,0 +1,25 @@
# Optional Prometheus Pushgateway service. It is layered onto the base stack by
# appending this file to COMPOSE_FILE; the `internal` and `proxy` networks are
# expected to be declared by the base compose file.
version: '3.8'

services:
  pushgateway:
    image: prom/pushgateway:v1.10.0
    command:
      - '--web.listen-address=:9191'
      - '--push.disable-consistency-check'
      # NOTE(review): no `--persistence.file` is passed here, so this interval
      # presumably has nothing to persist to — confirm whether pushed metrics
      # are meant to survive a restart.
      - '--persistence.interval=5m'
    ports:
      # Quoted so the HOST:CONTAINER mapping is always read as a string.
      # NOTE(review): publishing the port on the host makes the gateway
      # reachable on :9191 without passing through the Traefik router below,
      # i.e. presumably without the basic-auth middleware. Confirm this is
      # intended, or drop the mapping and rely on the Traefik route only.
      - '9191:9191'
    networks:
      - internal
      - proxy
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        # Route https://pushgateway.${DOMAIN} to container port 9191 via
        # Traefik, with TLS and the shared basic-auth middleware.
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-pushgateway.loadbalancer.server.port=9191"
        - "traefik.http.routers.${STACK_NAME}-pushgateway.rule=Host(`pushgateway.${DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-pushgateway.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-pushgateway.tls=true"
        - "traefik.http.routers.${STACK_NAME}-pushgateway.tls.certresolver=${LETS_ENCRYPT_ENV}"
        - "traefik.http.routers.${STACK_NAME}-pushgateway.middlewares=basicauth@file"

View File

@ -3,7 +3,7 @@ version: "3.8"
services:
app:
image: prom/node-exporter:v1.7.0
image: prom/node-exporter:v1.8.1
user: root
environment:
- NODE_ID={{.Node.ID}}
@ -30,6 +30,7 @@ services:
restart_policy:
condition: on-failure
labels:
- "backupbot.backup=${ENABLE_BACKUPS:-true}"
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
@ -37,11 +38,11 @@ services:
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
- "coop-cloud.${STACK_NAME}.version=1.1.0+v1.7.0"
- "coop-cloud.${STACK_NAME}.version=1.3.0+v1.8.1"
- "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT:-120}"
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.49.1
image: gcr.io/cadvisor/cadvisor:v0.49.2
command:
- "-logtostderr"
- "--enable_metrics=cpu,cpuLoad,disk,diskIO,process,memory,network"

315
grafana-alerts.json.tmpl Normal file
View File

@ -0,0 +1,315 @@
{{/*
  Grafana alert-rule provisioning file, rendered as a Go template.

  Each rule is emitted only when its ALERT_*_ENABLED environment variable
  equals "true". The comma after a rule is emitted only when a later rule in
  the same group is enabled as well, so every combination of switches renders
  valid JSON (no trailing comma before the closing bracket).
*/}}
{
  "apiVersion": 1,
  "groups": [
    {
      "orgId": 1,
      "name": "backupbot",
      "folder": "node",
      "interval": "1m",
      "rules": [
        {{/* Fires when the last value of the `backup` metric is below 0. */}}
        {{ if eq (env "ALERT_BACKUP_FAILED_ENABLED") "true" }}
        {
          "uid": "de8e5xxup7t34a",
          "title": "Backup Failed",
          "condition": "C",
          "data": [
            {
              "refId": "A",
              "relativeTimeRange": { "from": 600, "to": 0 },
              "datasourceUid": "PBFA97CFB590B2093",
              "model": {
                "disableTextWrap": false,
                "editorMode": "builder",
                "expr": "backup",
                "fullMetaSearch": false,
                "includeNullMetadata": true,
                "instant": true,
                "intervalMs": 1000,
                "legendFormat": "__auto",
                "maxDataPoints": 43200,
                "range": false,
                "refId": "A",
                "useBackend": false
              }
            },
            {
              "refId": "C",
              "relativeTimeRange": { "from": 600, "to": 0 },
              "datasourceUid": "__expr__",
              "model": {
                "conditions": [
                  {
                    "evaluator": { "params": [0], "type": "lt" },
                    "operator": { "type": "and" },
                    "query": { "params": ["C"] },
                    "reducer": { "params": [], "type": "last" },
                    "type": "query"
                  }
                ],
                "datasource": { "type": "__expr__", "uid": "__expr__" },
                "expression": "A",
                "intervalMs": 1000,
                "maxDataPoints": 43200,
                "refId": "C",
                "type": "threshold"
              }
            }
          ],
          "noDataState": "NoData",
          "execErrState": "Error",
          "for": "1m",
          "isPaused": false
        }
        {{ if or (eq (env "ALERT_BACKUP_MISSING_ENABLED") "true") (eq (env "ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED") "true") }},{{ end }}
        {{ end }}
        {{/* Evaluates rate(backup[24h]) against the range [0, 0]. */}}
        {{ if eq (env "ALERT_BACKUP_MISSING_ENABLED") "true" }}
        {
          "uid": "ce8e65uddcwe8d",
          "title": "Backup Missing",
          "condition": "B",
          "data": [
            {
              "refId": "A",
              "relativeTimeRange": { "from": 600, "to": 0 },
              "datasourceUid": "PBFA97CFB590B2093",
              "model": {
                "disableTextWrap": false,
                "editorMode": "builder",
                "expr": "rate(backup[24h])",
                "fullMetaSearch": false,
                "includeNullMetadata": true,
                "instant": true,
                "intervalMs": 1000,
                "legendFormat": "__auto",
                "maxDataPoints": 43200,
                "range": false,
                "refId": "A",
                "useBackend": false
              }
            },
            {
              "refId": "B",
              "relativeTimeRange": { "from": 600, "to": 0 },
              "datasourceUid": "__expr__",
              "model": {
                "conditions": [
                  {
                    "evaluator": { "params": [0, 0], "type": "within_range" },
                    "operator": { "type": "and" },
                    "query": { "params": ["C"] },
                    "reducer": { "params": [], "type": "last" },
                    "type": "query"
                  }
                ],
                "datasource": { "type": "__expr__", "uid": "__expr__" },
                "expression": "A",
                "intervalMs": 1000,
                "maxDataPoints": 43200,
                "refId": "B",
                "type": "threshold"
              }
            }
          ],
          "noDataState": "NoData",
          "execErrState": "Error",
          "for": "5m",
          "isPaused": false
        }
        {{ if eq (env "ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED") "true" }},{{ end }}
        {{ end }}
        {{/* Fires when `backup` has stayed above 0 for 20 minutes. */}}
        {{ if eq (env "ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED") "true" }}
        {
          "uid": "de8e6bc92a8lcc",
          "title": "Backup Not Successfull",
          "condition": "B",
          "data": [
            {
              "refId": "A",
              "relativeTimeRange": {
                "from": 60,
                "to": 0
              },
              "datasourceUid": "PBFA97CFB590B2093",
              "model": {
                "disableTextWrap": false,
                "editorMode": "builder",
                "expr": "backup",
                "fullMetaSearch": false,
                "includeNullMetadata": true,
                "instant": true,
                "intervalMs": 1000,
                "legendFormat": "__auto",
                "maxDataPoints": 43200,
                "range": false,
                "refId": "A",
                "useBackend": false
              }
            },
            {
              "refId": "B",
              "relativeTimeRange": {
                "from": 60,
                "to": 0
              },
              "datasourceUid": "__expr__",
              "model": {
                "conditions": [
                  {
                    "evaluator": {
                      "params": [
                        0
                      ],
                      "type": "gt"
                    },
                    "operator": {
                      "type": "and"
                    },
                    "query": {
                      "params": [
                        "C"
                      ]
                    },
                    "reducer": {
                      "params": [],
                      "type": "last"
                    },
                    "type": "query"
                  }
                ],
                "datasource": {
                  "type": "__expr__",
                  "uid": "__expr__"
                },
                "expression": "A",
                "intervalMs": 1000,
                "maxDataPoints": 43200,
                "refId": "B",
                "type": "threshold"
              }
            }
          ],
          "noDataState": "NoData",
          "execErrState": "Error",
          "for": "20m",
          "annotations": {
            "summary": "Backup did not finish within 20 minutes"
          },
          "labels": {},
          "isPaused": false
        }
        {{ end }}
      ]
    },
    {
      "orgId": 1,
      "name": "node",
      "folder": "node",
      "interval": "5m",
      "rules": [
        {{/* Fires when free space on the matched ext4 mounts drops below 10%. */}}
        {{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
        {
          "uid": "bds8bhxu97pxca",
          "title": "Node Disk Space",
          "condition": "C",
          "data": [
            {
              "refId": "A",
              "relativeTimeRange": { "from": 600, "to": 0 },
              "datasourceUid": "PBFA97CFB590B2093",
              "model": {
                "editorMode": "code",
                "expr": "(node_filesystem_free_bytes{fstype=\"ext4\",mountpoint=~\"(/$)|(/media.*)\"} / node_filesystem_size_bytes{fstype=\"ext4\",mountpoint=~\"(/$)|(/media.*)\"}) * 100",
                "instant": true,
                "intervalMs": 1000,
                "legendFormat": "__auto",
                "maxDataPoints": 43200,
                "range": false,
                "refId": "A"
              }
            },
            {
              "refId": "C",
              "relativeTimeRange": { "from": 600, "to": 0 },
              "datasourceUid": "__expr__",
              "model": {
                "conditions": [
                  {
                    "evaluator": { "params": [10], "type": "lt" },
                    "operator": { "type": "and" },
                    "query": { "params": ["C"] },
                    "reducer": { "params": [], "type": "last" },
                    "type": "query"
                  }
                ],
                "datasource": { "type": "__expr__", "uid": "__expr__" },
                "expression": "A",
                "intervalMs": 1000,
                "maxDataPoints": 43200,
                "refId": "C",
                "type": "threshold"
              }
            }
          ],
          "noDataState": "NoData",
          "execErrState": "Error",
          "for": "5m",
          "annotations": {},
          "labels": {},
          "isPaused": false
        }
        {{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }},{{ end }}
        {{ end }}
        {{/*
          Fires when used memory exceeds 90%. The query yields the USED
          percentage (1 - available/total); the previous available/total form
          combined with "gt 90" fired when memory was almost entirely free.
        */}}
        {{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
        {
          "uid": "ads8cswmly96oa",
          "title": "Node Memory Usage",
          "condition": "C",
          "data": [
            {
              "refId": "A",
              "relativeTimeRange": { "from": 600, "to": 0 },
              "datasourceUid": "PBFA97CFB590B2093",
              "model": {
                "editorMode": "code",
                "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
                "instant": true,
                "intervalMs": 1000,
                "legendFormat": "__auto",
                "maxDataPoints": 43200,
                "range": false,
                "refId": "A"
              }
            },
            {
              "refId": "C",
              "relativeTimeRange": { "from": 600, "to": 0 },
              "datasourceUid": "__expr__",
              "model": {
                "conditions": [
                  {
                    "evaluator": { "params": [90], "type": "gt" },
                    "operator": { "type": "and" },
                    "query": { "params": ["C"] },
                    "reducer": { "params": [], "type": "last" },
                    "type": "query"
                  }
                ],
                "datasource": { "type": "__expr__", "uid": "__expr__" },
                "expression": "A",
                "intervalMs": 1000,
                "maxDataPoints": 43200,
                "refId": "C",
                "type": "threshold"
              }
            }
          ],
          "noDataState": "NoData",
          "execErrState": "Error",
          "for": "5m",
          "annotations": {},
          "labels": {},
          "isPaused": false
        }
        {{ end }}
      ]
    }
  ]
}

View File

@ -0,0 +1,228 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 6,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"axisSoftMax": 2,
"axisSoftMin": -2,
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [
{
"options": {
"0": {
"color": "dark-green",
"index": 0
},
"1": {
"color": "dark-yellow",
"index": 1,
"text": "Running"
},
"-1": {
"index": 2,
"text": "Fail"
}
},
"type": "value"
}
],
"max": 1,
"min": -1,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "string"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"disableTextWrap": false,
"editorMode": "builder",
"exemplar": false,
"expr": "backup",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Backup Status",
"type": "timeseries"
},
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"gridPos": {
"h": 11,
"w": 24,
"x": 0,
"y": 7
},
"id": 2,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"editorMode": "builder",
"expr": "{service_name=\"$ServiceName\"} |= ``",
"queryType": "range",
"refId": "A"
}
],
"title": "Backupbot Logs",
"type": "logs"
}
],
"refresh": "auto",
"schemaVersion": 39,
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "backup_marx_klasse-methode_it_app",
"value": "backup_marx_klasse-methode_it_app"
},
"datasource": {
"type": "loki",
"uid": "P8E80F9AEF21F6940"
},
"definition": "",
"hide": 0,
"includeAll": false,
"label": "Backupbot Service",
"multi": false,
"name": "ServiceName",
"options": [],
"query": {
"label": "service_name",
"refId": "LokiVariableQueryEditor-VariableQuery",
"stream": "",
"type": 1
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-24h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "backupbot-two",
"uid": "be8e2xeofw4xsa",
"version": 3,
"weekStart": ""
}

View File

@ -11,3 +11,13 @@ providers:
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: true
# Second file provider entry, pointed at the directory where the rendered
# alerts file is mounted.
# NOTE(review): this entry has the same shape as a dashboard provider, but
# /var/lib/grafana/alerts holds an alert-rule file ("apiVersion"/"groups"),
# not dashboard JSON. Grafana's alert-rule file provisioning is normally read
# from the provisioning/alerting directory — confirm the rules are actually
# loaded from this path and not rejected as an invalid dashboard.
- name: 'default-alert-provider'
orgId: 1
folder: 'default-alerts'
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/alerts
foldersFromFilesStructure: true