diff --git a/.env.sample b/.env.sample index cb379a9..c9ac4e0 100644 --- a/.env.sample +++ b/.env.sample @@ -6,7 +6,8 @@ DOMAIN=monitoring-ng.example.com ENABLE_BACKUPS=true ## Enable this secret for Promtail / Prometheus -# SECRET_BASIC_AUTH_VERSION=v1 +#COMPOSE_FILE="$COMPOSE_FILE:compose.basic-auth.yml" +#SECRET_BASIC_AUTH_VERSION=v1 # # Promtail (Gathering Logs) # COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml" @@ -79,9 +80,10 @@ ENABLE_BACKUPS=true #GF_MATRIX_ROOM_ID="" #GF_MATRIX_HOMESERVER_URL="" -# ALerts -#ALERT_BACKUP_FAILED_ENABLED=true -#ALERT_BACKUP_MISSING_ENABLED=true -#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true -#ALERT_NODE_DISK_SPACE_ENABLED=true -#ALERT_NODE_MEMORY_USAGE_ENABLED=true +## Alerts + +# Node disk space alert triggers when the free disk space drops below the given percentage +#ALERT_NODE_DISK_SPACE_LEFT=10 + +# Node memory usage alert triggers when memory usage rises above the given percentage +#ALERT_NODE_MEMORY_USAGE=85 diff --git a/README.md b/README.md index 03d891e..c631367 100644 --- a/README.md +++ b/README.md @@ -156,13 +156,9 @@ GF_MATRIX_HOME_SERVER_URL= ``` 4. 
Configure Alertmanager webhook and set the url to `http://matrix-alertmanager-receiver:12345/alerts/` -## alerts - -It is possible to enable the following alerts, by setting the corresponding env variable to `true`: -- backupbot failed: `ALERT_BACKUP_FAILED_ENABLED` -- backupbot missing: `ALERT_BACKUP_MISSING_ENABLED` -- backupbot not successfull: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED` -- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED` -- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED` +## Alerts +The following alerts can be enabled by uncommenting the corresponding env variable and setting it to the desired threshold in percent: +- node disk space: `ALERT_NODE_DISK_SPACE_LEFT` +- node memory usage: `ALERT_NODE_MEMORY_USAGE` diff --git a/abra.sh b/abra.sh index 2651477..7b62aa5 100644 --- a/abra.sh +++ b/abra.sh @@ -9,9 +9,9 @@ export GRAFANA_CUSTOM_INI_VERSION=v4 export PROMTAIL_YML_VERSION=v3 export LOKI_YML_VERSION=v2 export PROMETHEUS_YML_VERSION=v2 -export MATRIX_ALERTMANAGER_CONFIG_VERSION=e -export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a -export GRAFANA_ALERTS_NODE_VERSION=v1c +export MATRIX_ALERTMANAGER_CONFIG_VERSION=v1 +export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=v1 +export GRAFANA_ALERTS_NODE_VERSION=v2 # creates a default prometheus scrape config for a given node add_node(){ diff --git a/alerts/node.yml.tmpl b/alerts/node.yml.tmpl index d984fc7..f19152d 100644 --- a/alerts/node.yml.tmpl +++ b/alerts/node.yml.tmpl @@ -2,13 +2,13 @@ apiVersion: 1 # List of alert rule UIDs that should be deleted deleteRules: - {{ if ne (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }} + {{ if not (env "ALERT_NODE_DISK_SPACE_LEFT") }} - orgId: 1 - uid: bds8bhxu97pxca + uid: coopcloud_node_disk_space_left {{ end }} - {{ if ne (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }} + {{ if not (env "ALERT_NODE_MEMORY_USAGE") }} - orgId: 1 - uid: ads8cswmly96oa + uid: coopcloud_node_memory_usage {{ end }} groups: @@ -17,8 +17,8 @@ groups: folder: node interval: 5m rules: - {{ if eq (env 
"ALERT_NODE_DISK_SPACE_ENABLED") "true" }} - - uid: bds8bhxu97pxca + {{ if (env "ALERT_NODE_DISK_SPACE_LEFT") }} + - uid: coopcloud_node_disk_space_left title: Node Disk Space condition: C data: @@ -45,7 +45,7 @@ groups: conditions: - evaluator: params: - - 10 + - {{ env "ALERT_NODE_DISK_SPACE_LEFT" }} type: lt operator: type: and @@ -70,13 +70,13 @@ groups: annotations: description: "" runbook_url: "" - summary: Less than 10% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left) + summary: Less than {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left) labels: "": "" isPaused: false {{ end }} - {{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }} - - uid: ads8cswmly96oa + {{ if (env "ALERT_NODE_MEMORY_USAGE") }} + - uid: coopcloud_node_memory_usage title: Node Memory Usage condition: C data: @@ -103,7 +103,7 @@ groups: conditions: - evaluator: params: - - 85 + - {{ env "ALERT_NODE_MEMORY_USAGE" }} type: gt operator: type: and @@ -126,6 +126,6 @@ groups: execErrState: Error for: 5m annotations: - summary: Memory usage is above 85% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage) + summary: Memory usage is above {{ env "ALERT_NODE_MEMORY_USAGE" }}% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage) isPaused: false {{ end }} diff --git a/compose.basic-auth.yml b/compose.basic-auth.yml new file mode 100644 index 0000000..228f9a2 --- /dev/null +++ b/compose.basic-auth.yml @@ -0,0 +1,7 @@ +--- +version: "3.8" + +secrets: + basic_auth: + external: true + name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION} diff --git a/compose.grafana.yml b/compose.grafana.yml index ab6dd70..e4c1bc6 100644 --- a/compose.grafana.yml +++ b/compose.grafana.yml @@ -32,8 +32,8 @@ services: - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password - 
GF_SECURITY_ALLOW_EMBEDDING - GF_INSTALL_PLUGINS - - ALERT_NODE_DISK_SPACE_ENABLED - - ALERT_NODE_MEMORY_USAGE_ENABLED + - ALERT_NODE_DISK_SPACE_LEFT + - ALERT_NODE_MEMORY_USAGE deploy: labels: - "traefik.enable=true" diff --git a/compose.promtail.yml b/compose.promtail.yml index 7cf6cf8..ad3abcf 100644 --- a/compose.promtail.yml +++ b/compose.promtail.yml @@ -23,8 +23,3 @@ configs: name: ${STACK_NAME}_promtail_yml_${PROMTAIL_YML_VERSION} file: promtail.yml.tmpl template_driver: golang - -secrets: - basic_auth: - external: true - name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}