feat: make alerts configurable
This commit is contained in:
16
.env.sample
16
.env.sample
@ -6,7 +6,8 @@ DOMAIN=monitoring-ng.example.com
|
||||
ENABLE_BACKUPS=true
|
||||
|
||||
## Enable this secret for Promtail / Prometheus
|
||||
# SECRET_BASIC_AUTH_VERSION=v1
|
||||
#COMPOSE_FILE="$COMPOSE_FILE:compose.basic-auth.yml"
|
||||
#SECRET_BASIC_AUTH_VERSION=v1
|
||||
#
|
||||
# Promtail (Gathering Logs)
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml"
|
||||
@ -79,9 +80,10 @@ ENABLE_BACKUPS=true
|
||||
#GF_MATRIX_ROOM_ID="<room-id>"
|
||||
#GF_MATRIX_HOMESERVER_URL="<homeserver-url>"
|
||||
|
||||
# ALerts
|
||||
#ALERT_BACKUP_FAILED_ENABLED=true
|
||||
#ALERT_BACKUP_MISSING_ENABLED=true
|
||||
#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true
|
||||
#ALERT_NODE_DISK_SPACE_ENABLED=true
|
||||
#ALERT_NODE_MEMORY_USAGE_ENABLED=true
|
||||
## Alerts
|
||||
|
||||
# The node disk space alert triggers when the free disk space drops below the given percentage
|
||||
#ALERT_NODE_DISK_SPACE_LEFT=10
|
||||
|
||||
# The node memory usage alert triggers when memory usage rises above the given percentage
|
||||
#ALERT_NODE_MEMORY_USAGE=85
|
||||
|
||||
12
README.md
12
README.md
@ -156,13 +156,9 @@ GF_MATRIX_HOME_SERVER_URL=
|
||||
```
|
||||
4. Configure Alertmanager webhook and set the url to `http://matrix-alertmanager-receiver:12345/alerts/<room-id>`
|
||||
|
||||
## alerts
|
||||
|
||||
It is possible to enable the following alerts, by setting the corresponding env variable to `true`:
|
||||
- backupbot failed: `ALERT_BACKUP_FAILED_ENABLED`
|
||||
- backupbot missing: `ALERT_BACKUP_MISSING_ENABLED`
|
||||
- backupbot not successfull: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED`
|
||||
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
|
||||
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`
|
||||
## Alerts
|
||||
|
||||
It is possible to enable the following alerts by uncommenting the corresponding env variable and setting it to the desired threshold (in percent):
|
||||
|
||||
- node disk space: `ALERT_NODE_DISK_SPACE_LEFT`
|
||||
- node memory usage: `ALERT_NODE_MEMORY_USAGE`
|
||||
|
||||
6
abra.sh
6
abra.sh
@ -9,9 +9,9 @@ export GRAFANA_CUSTOM_INI_VERSION=v4
|
||||
export PROMTAIL_YML_VERSION=v3
|
||||
export LOKI_YML_VERSION=v2
|
||||
export PROMETHEUS_YML_VERSION=v2
|
||||
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
|
||||
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
|
||||
export GRAFANA_ALERTS_NODE_VERSION=v1c
|
||||
export MATRIX_ALERTMANAGER_CONFIG_VERSION=v1
|
||||
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=v1
|
||||
export GRAFANA_ALERTS_NODE_VERSION=v2
|
||||
|
||||
# creates a default prometheus scrape config for a given node
|
||||
add_node(){
|
||||
|
||||
@ -2,13 +2,13 @@ apiVersion: 1
|
||||
|
||||
# List of alert rule UIDs that should be deleted
|
||||
deleteRules:
|
||||
{{ if ne (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
|
||||
{{ if not (env "ALERT_NODE_DISK_SPACE_LEFT") }}
|
||||
- orgId: 1
|
||||
uid: bds8bhxu97pxca
|
||||
uid: coopcloud_node_disk_space_left
|
||||
{{ end }}
|
||||
{{ if ne (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
|
||||
{{ if not (env "ALERT_NODE_MEMORY_USAGE") }}
|
||||
- orgId: 1
|
||||
uid: ads8cswmly96oa
|
||||
uid: coopcloud_node_memory_usage
|
||||
{{ end }}
|
||||
|
||||
groups:
|
||||
@ -17,8 +17,8 @@ groups:
|
||||
folder: node
|
||||
interval: 5m
|
||||
rules:
|
||||
{{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
|
||||
- uid: bds8bhxu97pxca
|
||||
{{ if (env "ALERT_NODE_DISK_SPACE_LEFT") }}
|
||||
- uid: coopcloud_node_disk_space_left
|
||||
title: Node Disk Space
|
||||
condition: C
|
||||
data:
|
||||
@ -45,7 +45,7 @@ groups:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 10
|
||||
- {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}
|
||||
type: lt
|
||||
operator:
|
||||
type: and
|
||||
@ -70,13 +70,13 @@ groups:
|
||||
annotations:
|
||||
description: ""
|
||||
runbook_url: ""
|
||||
summary: Less than 10% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
|
||||
summary: Less than {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
|
||||
labels:
|
||||
"": ""
|
||||
isPaused: false
|
||||
{{ end }}
|
||||
{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
|
||||
- uid: ads8cswmly96oa
|
||||
{{ if (env "ALERT_NODE_MEMORY_USAGE") }}
|
||||
- uid: coopcloud_node_memory_usage
|
||||
title: Node Memory Usage
|
||||
condition: C
|
||||
data:
|
||||
@ -103,7 +103,7 @@ groups:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 85
|
||||
- {{ env "ALERT_NODE_MEMORY_USAGE" }}
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
@ -126,6 +126,6 @@ groups:
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: Memory usage is above 85% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
|
||||
summary: Memory usage is above {{ env "ALERT_NODE_MEMORY_USAGE" }}% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
|
||||
isPaused: false
|
||||
{{ end }}
|
||||
|
||||
7
compose.basic-auth.yml
Normal file
7
compose.basic-auth.yml
Normal file
@ -0,0 +1,7 @@
|
||||
---
|
||||
version: "3.8"
|
||||
|
||||
secrets:
|
||||
basic_auth:
|
||||
external: true
|
||||
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}
|
||||
@ -32,8 +32,8 @@ services:
|
||||
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
|
||||
- GF_SECURITY_ALLOW_EMBEDDING
|
||||
- GF_INSTALL_PLUGINS
|
||||
- ALERT_NODE_DISK_SPACE_ENABLED
|
||||
- ALERT_NODE_MEMORY_USAGE_ENABLED
|
||||
- ALERT_NODE_DISK_SPACE_LEFT
|
||||
- ALERT_NODE_MEMORY_USAGE
|
||||
deploy:
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
|
||||
@ -23,8 +23,3 @@ configs:
|
||||
name: ${STACK_NAME}_promtail_yml_${PROMTAIL_YML_VERSION}
|
||||
file: promtail.yml.tmpl
|
||||
template_driver: golang
|
||||
|
||||
secrets:
|
||||
basic_auth:
|
||||
external: true
|
||||
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}
|
||||
|
||||
Reference in New Issue
Block a user