3 Commits

Author SHA1 Message Date
f
c9910eabf4 Merge branch 'main' of https://git.coopcloud.tech/coop-cloud/monitoring-ng into alloy 2026-06-20 23:59:20 -03:00
02b01e5c23 feat: make alerts configurable (#19)
Alert threshholds can now be configured via env variables

I also seperated the basic auth, so that it is possible to deploy only prometheus without promtail

Reviewed-on: #19
Reviewed-by: ammaratef45 <ammaratef45@proton.me>
Reviewed-by: Danny Groenewegen <dannygroenewegen@noreply.git.coopcloud.tech>
Co-authored-by: p4u1 <p4u1_f4u1@riseup.net>
Co-committed-by: p4u1 <p4u1_f4u1@riseup.net>
2026-06-12 06:49:17 +00:00
fce8ea5889 docs: Update deploy docs and adds prometheus basic auth back (#7)
Reviewed-on: #7
Reviewed-by: Danny Groenewegen <dannygroenewegen@noreply.git.coopcloud.tech>
Co-authored-by: p4u1 <p4u1_f4u1@riseup.net>
Co-committed-by: p4u1 <p4u1_f4u1@riseup.net>
2026-06-12 06:48:39 +00:00
11 changed files with 44 additions and 483 deletions

View File

@ -30,9 +30,6 @@ LIVE_DEBUGGING=false
# server is remote
# PROMETHEUS_REMOTE_WRITE_URL=https://prometheus.$DOMAIN/api/v1/write
# Monitor physical disks health
# COMPOSE_FILE="$COMPOSE_FILE:compose.smartctl.yml"
# Monitoring Server
#
## Prometheus
@ -95,9 +92,10 @@ LIVE_DEBUGGING=false
#GF_MATRIX_ROOM_ID="<room-id>"
#GF_MATRIX_HOMESERVER_URL="<homeserver-url>"
# ALerts
#ALERT_BACKUP_FAILED_ENABLED=true
#ALERT_BACKUP_MISSING_ENABLED=true
#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true
#ALERT_NODE_DISK_SPACE_ENABLED=true
#ALERT_NODE_MEMORY_USAGE_ENABLED=true
## ALerts
# Node disk space alert will trigger when free disk space left is below the given number in percent
#ALERT_NODE_DISK_SPACE_LEFT=10
# Node memory usage alert will trigger when memory usage is above the given number in percent
#ALERT_NODE_MEMORY_USAGE=85

View File

@ -18,47 +18,30 @@ It's based heavily on the [monitoring-lite](https://git.coopcloud.tech/coop-clou
<!-- endmetadata -->
## Setup a Metrics Gathering
## Setup Metrics Gathering
Where gathering.org is the node you want to gather metrics from.
1. Configure DNS
- monitoring.gathering.org
- cadvisor.monitoring.gathering.org
- node.monitoring.gathering.org
1. Configure Traefik to use BasicAuth
* `abra app config traefik.gathering.org`
uncomment
```
# BASIC_AUTH
COMPOSE_FILE="$COMPOSE_FILE:compose.basicauth.yml"
BASIC_AUTH=1
SECRET_USERSFILE_VERSION=v1
```
- Generate userslist with httpasswd hashed password
`abra app secret insert traefik.gathering.org usersfile v1 'admin:<hashed-secret>'`
make sure there is no whitespace in between `admin:<hashed-secret>`, it seems to break stuff...
- `abra app deploy -f traefik`
1. `abra app new monitoring-ng`
1. `abra app config monitoring.gathering.org`
for gathering only the main `compose.yml` is needed, nothing more.
1. `abra app deploy monitoring.gathering.org`
1. check that endpoints are up and basic-auth works
2. [Configure Traefik to use BasicAuth](https://git.coopcloud.tech/coop-cloud/traefik#configuring-wildcard-ssl-using-dns)
3. `abra app new monitoring-ng`
4. `abra app config monitoring.gathering.org` (for gathering only the main `compose.yml` is needed, nothing more.)
5. `abra app deploy monitoring.gathering.org`
6. check that endpoints are up and basic-auth works
- cadvisor.monitoring.gathering.org
- node.monitoring.gathering.org
## Setup Metrics Browser
This builds upon [Setup Metrics Gathering](#setup-metrics-grathering) so make sure you did that first.
1. Configure DNS
- monitoring.example.org
- prometheus.monitoring.example.org
- loki.monitoring.example.org
2. Setup monitoring stack
- `abra app new monitoring-ng`
- `abra app config monitoring.example.org`
Uncomment all the stuff
- `abra app secret insert monitoring.example.org basic_auth v1 <secret>`
- `abra app config monitoring.example.org` Uncomment prometheus, loki and grafana
- `abra app secret insert monitoring.example.org basic_auth v1 <password>`
this needs the plaintext traefik basic-auth secret, not the hashed one!
- `abra app secret ls monitoring.example.org`
- `abra app deploy monitoring.example.org`
@ -149,19 +132,9 @@ GF_MATRIX_HOME_SERVER_URL=
```
4. Configure Alertmanager webhook and set the url to `http://matrix-alertmanager-receiver:12345/alerts/<room-id>`
## alerts
## Alerts
It is possible to enable the following alerts, by setting the corresponding env variable to `true`:
- backupbot failed: `ALERT_BACKUP_FAILED_ENABLED`
- backupbot missing: `ALERT_BACKUP_MISSING_ENABLED`
- backupbot not successfull: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED`
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`
It is possible to enable the following alerts, by uncommenting the corresponding env variable:
## smart monitoring
To be able monitor hard drive health data, you need to configure
`smartd` to run on the host system, and also the
`collect-smartctl-json.sh` script provided here (via cronjob or as
a `smartd` hook). This is a limitation on Docker Swarm, which prevents
the `smartctl_exporter` from running on privileged mode.
- node disk space: `ALERT_NODE_DISK_SPACE_LEFT`
- node memory usage: `ALERT_NODE_MEMORY_USAGE`

View File

@ -8,9 +8,9 @@ export GF_BACKUP_DASH_VERSION=v1
export GF_CUSTOM_INI_VERSION=v4
export LOKI_YML_VERSION=v3
export PROMETHEUS_YML_VERSION=v2
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
export GRAFANA_ALERTS_NODE_VERSION=v1c
export MATRIX_ALERTMANAGER_CONFIG_VERSION=v1
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=v1
export GRAFANA_ALERTS_NODE_VERSION=v2
export CONFIG_ALLOY_VERSION=v10
# creates a default prometheus scrape config for a given node

View File

@ -2,13 +2,13 @@ apiVersion: 1
# List of alert rule UIDs that should be deleted
deleteRules:
{{ if ne (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
{{ if not (env "ALERT_NODE_DISK_SPACE_LEFT") }}
- orgId: 1
uid: bds8bhxu97pxca
uid: coopcloud_node_disk_space_left
{{ end }}
{{ if ne (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
{{ if not (env "ALERT_NODE_MEMORY_USAGE") }}
- orgId: 1
uid: ads8cswmly96oa
uid: coopcloud_node_memory_usage
{{ end }}
groups:
@ -17,8 +17,8 @@ groups:
folder: node
interval: 5m
rules:
{{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
- uid: bds8bhxu97pxca
{{ if (env "ALERT_NODE_DISK_SPACE_LEFT") }}
- uid: coopcloud_node_disk_space_left
title: Node Disk Space
condition: C
data:
@ -45,7 +45,7 @@ groups:
conditions:
- evaluator:
params:
- 10
- {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}
type: lt
operator:
type: and
@ -70,13 +70,13 @@ groups:
annotations:
description: ""
runbook_url: ""
summary: Less than 10% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
summary: Less than {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
labels:
"": ""
isPaused: false
{{ end }}
{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
- uid: ads8cswmly96oa
{{ if (env "ALERT_NODE_MEMORY_USAGE") }}
- uid: coopcloud_node_memory_usage
title: Node Memory Usage
condition: C
data:
@ -103,7 +103,7 @@ groups:
conditions:
- evaluator:
params:
- 85
- {{ env "ALERT_NODE_MEMORY_USAGE" }}
type: gt
operator:
type: and
@ -126,6 +126,6 @@ groups:
execErrState: Error
for: 5m
annotations:
summary: Memory usage is above 85% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
summary: Memory usage is above {{ env "ALERT_NODE_MEMORY_USAGE" }}% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
isPaused: false
{{ end }}

View File

@ -1,6 +0,0 @@
[Unit]
Description=Collect SMART data
[Service]
Type=oneshot
ExecStart=/usr/local/bin/collect-smartctl-json.sh

View File

@ -1,69 +0,0 @@
#! /bin/bash
# Adapted from https://github.com/prometheus-community/smartctl_exporter/blob/master/collect-smartctl-json.sh
script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Data directory to dump smartctl output
# This directory will be created if it doesn't exist
data_dir="/var/lib/smartmontools/json"
# The original script used --xall but that doesn't work
# This matches the command in readSMARTctl()
smartctl_args="--json --info --health --attributes --tolerance=verypermissive \
--nocheck=standby --format=brief --log=error"
# Ignore this devices
smartctl_ignore_dev_regex="^(/dev/bus)"
# Determine the json query tool to use
if command -v jq >/dev/null; then
json_tool="jq"
json_args="--raw-output"
elif command -v yq >/dev/null; then
json_tool="yq"
json_args="--unwrapScalar"
else
echo -e "One of 'yq' or 'jq' is required. Please try again after \
installing one of them"
exit 1
fi
if [[ ! "${UID}" -eq 0 ]] && ! command -v sudo >/dev/null; then
# Not root and sudo doesn't exist
echo "sudo does not exist. Please run this as root"
exit 1
fi
SUDO="sudo"
if [[ "${UID}" -eq 0 ]]; then
# Don't use sudo if root
SUDO=""
fi
[[ ! -d "${data_dir}" ]] && mkdir --parents "${data_dir}"
if [[ $# -ne 0 ]]; then
devices="${1}"
else
devices="$(smartctl --scan --json | "${json_tool}" "${json_args}" \
".devices[].name | select(test(\"${smartctl_ignore_dev_regex}\") | not)")"
mapfile -t devices <<< "${devices[@]}"
fi
for device in "${devices[@]}"
do
echo -n "Collecting data for '${device}'..."
# shellcheck disable=SC2086
data="$($SUDO smartctl ${smartctl_args} ${device})"
# Accommodate a smartmontools pre-7.3 bug
data=${data#" Pending defect count:"}
type="$(echo "${data}" | "${json_tool}" "${json_args}" '.device.type')"
family="$(echo "${data}" | "${json_tool}" "${json_args}" \
'select(.model_family != null) | .model_family | sub(" |/" ; "_" ; "g")
| sub("\"|\\(|\\)" ; "" ; "g")')"
model="$(echo "${data}" | "${json_tool}" "${json_args}" \
'.model_name | sub(" |/" ; "_" ; "g") | sub("\"|\\(|\\)" ; "" ; "g")')"
device_name="$(basename "${device}")"
echo -e "\tSaving to ${device_name}.json"
echo "${data}" > "${data_dir}/${device_name}.json"
done

View File

@ -1,9 +0,0 @@
[Unit]
Description=Collect SMART data
[Timer]
OnCalendar=hourly
Persistent=true
[Install]
WantedBy=timers.target

7
compose.basic-auth.yml Normal file
View File

@ -0,0 +1,7 @@
---
version: "3.8"
secrets:
basic_auth:
external: true
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}

View File

@ -32,8 +32,8 @@ services:
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/gf_adminpasswd
- GF_SECURITY_ALLOW_EMBEDDING
- GF_INSTALL_PLUGINS
- ALERT_NODE_DISK_SPACE_ENABLED
- ALERT_NODE_MEMORY_USAGE_ENABLED
- ALERT_NODE_DISK_SPACE_LEFT
- ALERT_NODE_MEMORY_USAGE
deploy:
labels:
- "traefik.enable=true"

View File

@ -1,18 +0,0 @@
---
version: "3.8"
services:
smartctl:
image: "prometheuscommunity/smartctl-exporter:v0.14.0"
volumes:
- "/dev:/dev"
- "/var/lib/smartmontools/json:/debug"
command:
- "--smartctl.fake-data"
- "--smartctl.interval=1h"
networks:
- "proxy"
deploy:
labels:
- "prometheus.io/scrape=true"
- "prometheus.io/port=9633"
- "prometheus.io/path=/metrics"

View File

@ -1,315 +0,0 @@
{
"apiVersion": 1,
"groups": [
{
"orgId": 1,
"name": "backupbot",
"folder": "node",
"interval": "1m",
"rules": [
{{ if eq (env "ALERT_BACKUP_FAILED_ENABLED") "true" }}
{
"uid": "de8e5xxup7t34a",
"title": "Backup Failed",
"condition": "C",
"data": [
{
"refId": "A",
"relativeTimeRange": { "from": 600, "to": 0 },
"datasourceUid": "PBFA97CFB590B2093",
"model": {
"disableTextWrap": false,
"editorMode": "builder",
"expr": "backup",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": true,
"intervalMs": 1000,
"legendFormat": "__auto",
"maxDataPoints": 43200,
"range": false,
"refId": "A",
"useBackend": false
}
},
{
"refId": "C",
"relativeTimeRange": { "from": 600, "to": 0 },
"datasourceUid": "__expr__",
"model": {
"conditions": [
{
"evaluator": { "params": [0], "type": "lt" },
"operator": { "type": "and" },
"query": { "params": ["C"] },
"reducer": { "params": [], "type": "last" },
"type": "query"
}
],
"datasource": { "type": "__expr__", "uid": "__expr__" },
"expression": "A",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "C",
"type": "threshold"
}
}
],
"noDataState": "NoData",
"execErrState": "Error",
"for": "1m",
"isPaused": false
},
{{ end }}
{{ if eq (env "ALERT_BACKUP_MISSING_ENABLED") "true" }}
{
"uid": "ce8e65uddcwe8d",
"title": "Backup Missing",
"condition": "B",
"data": [
{
"refId": "A",
"relativeTimeRange": { "from": 600, "to": 0 },
"datasourceUid": "PBFA97CFB590B2093",
"model": {
"disableTextWrap": false,
"editorMode": "builder",
"expr": "rate(backup[24h])",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": true,
"intervalMs": 1000,
"legendFormat": "__auto",
"maxDataPoints": 43200,
"range": false,
"refId": "A",
"useBackend": false
}
},
{
"refId": "B",
"relativeTimeRange": { "from": 600, "to": 0 },
"datasourceUid": "__expr__",
"model": {
"conditions": [
{
"evaluator": { "params": [0, 0], "type": "within_range" },
"operator": { "type": "and" },
"query": { "params": ["C"] },
"reducer": { "params": [], "type": "last" },
"type": "query"
}
],
"datasource": { "type": "__expr__", "uid": "__expr__" },
"expression": "A",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "B",
"type": "threshold"
}
}
],
"noDataState": "NoData",
"execErrState": "Error",
"for": "5m",
"isPaused": false
},
{{ end }}
{{ if eq (env "ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED") "true" }}
{
"uid": "de8e6bc92a8lcc",
"title": "Backup Not Successfull",
"condition": "B",
"data": [
{
"refId": "A",
"relativeTimeRange": {
"from": 60,
"to": 0
},
"datasourceUid": "PBFA97CFB590B2093",
"model": {
"disableTextWrap": false,
"editorMode": "builder",
"expr": "backup",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": true,
"intervalMs": 1000,
"legendFormat": "__auto",
"maxDataPoints": 43200,
"range": false,
"refId": "A",
"useBackend": false
}
},
{
"refId": "B",
"relativeTimeRange": {
"from": 60,
"to": 0
},
"datasourceUid": "__expr__",
"model": {
"conditions": [
{
"evaluator": {
"params": [
0
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"C"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "__expr__"
},
"expression": "A",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "B",
"type": "threshold"
}
}
],
"noDataState": "NoData",
"execErrState": "Error",
"for": "20m",
"annotations": {
"summary": "Backup did not finish within 20 minutes"
},
"labels": {},
"isPaused": false
}
{{ end }}
]
},
{
"orgId": 1,
"name": "node",
"folder": "node",
"interval": "5m",
"rules": [
{{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
{
"uid": "bds8bhxu97pxca",
"title": "Node Disk Space",
"condition": "C",
"data": [
{
"refId": "A",
"relativeTimeRange": { "from": 600, "to": 0 },
"datasourceUid": "PBFA97CFB590B2093",
"model": {
"editorMode": "code",
"expr": "(node_filesystem_free_bytes{fstype=\"ext4\"} / node_filesystem_size_bytes{fstype=\"ext4\"}) * 100",
"instant": true,
"intervalMs": 1000,
"legendFormat": "__auto",
"maxDataPoints": 43200,
"range": false,
"refId": "A"
}
},
{
"refId": "C",
"relativeTimeRange": { "from": 600, "to": 0 },
"datasourceUid": "__expr__",
"model": {
"conditions": [
{
"evaluator": { "params": [10], "type": "lt" },
"operator": { "type": "and" },
"query": { "params": ["C"] },
"reducer": { "params": [], "type": "last" },
"type": "query"
}
],
"datasource": { "type": "__expr__", "uid": "__expr__" },
"expression": "A",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "C",
"type": "threshold"
}
}
],
"noDataState": "NoData",
"execErrState": "Error",
"for": "5m",
"annotations": {},
"labels": {},
"isPaused": false
},
{{ end }}
{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
{
"uid": "ads8cswmly96oa",
"title": "Node Memory Usage",
"condition": "C",
"data": [
{
"refId": "A",
"relativeTimeRange": { "from": 600, "to": 0 },
"datasourceUid": "PBFA97CFB590B2093",
"model": {
"editorMode": "code",
"expr": "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
"instant": true,
"intervalMs": 1000,
"legendFormat": "__auto",
"maxDataPoints": 43200,
"range": false,
"refId": "A"
}
},
{
"refId": "C",
"relativeTimeRange": { "from": 600, "to": 0 },
"datasourceUid": "__expr__",
"model": {
"conditions": [
{
"evaluator": { "params": [90], "type": "gt" },
"operator": { "type": "and" },
"query": { "params": ["C"] },
"reducer": { "params": [], "type": "last" },
"type": "query"
}
],
"datasource": { "type": "__expr__", "uid": "__expr__" },
"expression": "A",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "C",
"type": "threshold"
}
}
],
"noDataState": "NoData",
"execErrState": "Error",
"for": "5m",
"annotations": {},
"labels": {},
"isPaused": false
}
{{ end }}
]
}
]
}