feat: make alerts configurable #19

Open
p4u1 wants to merge 1 commit from configurable-alerts into main
7 changed files with 37 additions and 37 deletions

View File

@ -6,7 +6,8 @@ DOMAIN=monitoring-ng.example.com
ENABLE_BACKUPS=true
## Enable this secret for Promtail / Prometheus
# SECRET_BASIC_AUTH_VERSION=v1
#COMPOSE_FILE="$COMPOSE_FILE:compose.basic-auth.yml"
#SECRET_BASIC_AUTH_VERSION=v1
#
# Promtail (Gathering Logs)
# COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml"
@ -79,9 +80,10 @@ ENABLE_BACKUPS=true
#GF_MATRIX_ROOM_ID="<room-id>"
#GF_MATRIX_HOMESERVER_URL="<homeserver-url>"
# ALerts
#ALERT_BACKUP_FAILED_ENABLED=true
Review

I've seen mentions that these backup alerts don't work, but is that the case for everyone? Or could someone still be relying on this?

And if these envs are removed, then grafana-alerts.json.tmpl can also be removed?

I've seen mentions that these backup alerts don't work, but is that the case for everyone? Or could someone still be relying on this? And if these envs are removed, then grafana-alerts.json.tmpl can also be removed?
#ALERT_BACKUP_MISSING_ENABLED=true
#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true
#ALERT_NODE_DISK_SPACE_ENABLED=true
#ALERT_NODE_MEMORY_USAGE_ENABLED=true
## Alerts
# The node disk space alert triggers when the free disk space falls below the given percentage
Review

Good enough for now, but 10% of a 20GB disk is quite different from 10% of a 2TB disk. I can imagine we will want more fine-grained control depending on disk size and usage.

Good enough for now, but 10% of a 20GB disk is quite different from 10% of a 2TB disk. I can imagine we will want more fine-grained control depending on disk size and usage.
#ALERT_NODE_DISK_SPACE_LEFT=10
# The node memory usage alert triggers when memory usage rises above the given percentage
#ALERT_NODE_MEMORY_USAGE=85

View File

@ -156,13 +156,9 @@ GF_MATRIX_HOME_SERVER_URL=
```
4. Configure Alertmanager webhook and set the url to `http://matrix-alertmanager-receiver:12345/alerts/<room-id>`
## alerts
It is possible to enable the following alerts, by setting the corresponding env variable to `true`:
- backupbot failed: `ALERT_BACKUP_FAILED_ENABLED`
- backupbot missing: `ALERT_BACKUP_MISSING_ENABLED`
- backupbot not successfull: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED`
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`
## Alerts
The following alerts can be enabled by uncommenting the corresponding env variable:
- node disk space: `ALERT_NODE_DISK_SPACE_LEFT`
- node memory usage: `ALERT_NODE_MEMORY_USAGE`
Review

nit: we should probably remove the options from here since they are explained in the env file; otherwise we would have to remember that they need to be updated in two places

nit: we should probably remove the options from here since they are explained in the env file; otherwise we would have to remember that they need to be updated in two places
Review

Ah yes, good point!

Ah yes, good point!

View File

@ -9,9 +9,9 @@ export GRAFANA_CUSTOM_INI_VERSION=v4
export PROMTAIL_YML_VERSION=v3
export LOKI_YML_VERSION=v2
export PROMETHEUS_YML_VERSION=v2
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
export GRAFANA_ALERTS_NODE_VERSION=v1c
export MATRIX_ALERTMANAGER_CONFIG_VERSION=v1
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=v1
export GRAFANA_ALERTS_NODE_VERSION=v2
# creates a default prometheus scrape config for a given node
add_node(){

View File

@ -2,13 +2,13 @@ apiVersion: 1
# List of alert rule UIDs that should be deleted
deleteRules:
{{ if ne (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
{{ if not (env "ALERT_NODE_DISK_SPACE_LEFT") }}
- orgId: 1
uid: bds8bhxu97pxca
uid: coopcloud_node_disk_space_left
{{ end }}
{{ if ne (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
{{ if not (env "ALERT_NODE_MEMORY_USAGE") }}
- orgId: 1
uid: ads8cswmly96oa
uid: coopcloud_node_memory_usage
{{ end }}
groups:
@ -17,8 +17,8 @@ groups:
folder: node
interval: 5m
rules:
{{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
- uid: bds8bhxu97pxca
{{ if (env "ALERT_NODE_DISK_SPACE_LEFT") }}
- uid: coopcloud_node_disk_space_left
title: Node Disk Space
condition: C
data:
@ -45,7 +45,7 @@ groups:
conditions:
- evaluator:
params:
- 10
- {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}
type: lt
operator:
type: and
@ -70,13 +70,13 @@ groups:
annotations:
description: ""
runbook_url: ""
summary: Less than 10% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
summary: Less than {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
labels:
"": ""
isPaused: false
{{ end }}
{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
- uid: ads8cswmly96oa
{{ if (env "ALERT_NODE_MEMORY_USAGE") }}
- uid: coopcloud_node_memory_usage
title: Node Memory Usage
condition: C
data:
@ -103,7 +103,7 @@ groups:
conditions:
- evaluator:
params:
- 85
- {{ env "ALERT_NODE_MEMORY_USAGE" }}
type: gt
operator:
type: and
@ -126,6 +126,6 @@ groups:
execErrState: Error
for: 5m
annotations:
summary: Memory usage is above 85% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
summary: Memory usage is above {{ env "ALERT_NODE_MEMORY_USAGE" }}% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
isPaused: false
{{ end }}

7
compose.basic-auth.yml Normal file
View File

@ -0,0 +1,7 @@
---
version: "3.8"
secrets:
basic_auth:
external: true
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}

View File

@ -32,8 +32,8 @@ services:
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
- GF_SECURITY_ALLOW_EMBEDDING
- GF_INSTALL_PLUGINS
- ALERT_NODE_DISK_SPACE_ENABLED
- ALERT_NODE_MEMORY_USAGE_ENABLED
- ALERT_NODE_DISK_SPACE_LEFT
- ALERT_NODE_MEMORY_USAGE
deploy:
labels:
- "traefik.enable=true"

View File

@ -23,8 +23,3 @@ configs:
name: ${STACK_NAME}_promtail_yml_${PROMTAIL_YML_VERSION}
file: promtail.yml.tmpl
template_driver: golang
secrets:
basic_auth:
external: true
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}