Compare commits

..

6 Commits

Author SHA1 Message Date
5172806135 Update scrape-config example to use HTTPS for Traefik metrics 2026-03-20 20:17:07 +01:00
310c28e735 refactor: provision alerts instead of putting them in the /var/lib folder (#16)
Note that I did not copy the backupbot alert since this one gets a rework soon

Reviewed-on: coop-cloud/monitoring-ng#16
Co-authored-by: p4u1 <p4u1_f4u1@riseup.net>
Co-committed-by: p4u1 <p4u1_f4u1@riseup.net>
2026-03-20 14:10:10 +00:00
16bd65f417 fix recipe part in the domain (#8)
I created a new app using this recipe and the domain wasn't automatically replaced. I'm guessing that's because the part before the root domain didn't match the recipe name.

Just opening a PR real quick so I can get back to it and test the fix later when I have cycles

Co-authored-by: p4u1 <p4u1@noreply.git.coopcloud.tech>
Reviewed-on: coop-cloud/monitoring-ng#8
Reviewed-by: p4u1 <p4u1@noreply.git.coopcloud.tech>
Co-authored-by: ammaratef45 <ammaratef45@proton.me>
Co-committed-by: ammaratef45 <ammaratef45@proton.me>
2026-03-20 09:23:36 +00:00
97ebcf306a add all mountpoints to free disk space in Docker Swarm dashboard (#4)
Until now, only / and /media were monitored in the Docker Swarm dashboard. We removed the filters and changed the dashboard panel to a time series, so multiple mounts can be shown at once.
We also updated the alert, so it now triggers on all ext4 mount points.

Reviewed-on: coop-cloud/monitoring-ng#4
Co-authored-by: Apfelwurm <Alexander@volzit.de>
Co-committed-by: Apfelwurm <Alexander@volzit.de>
2026-03-20 09:15:52 +00:00
f93370b9ca Moves oidc to a separate compose config (#6)
Otherwise the secret has to be provided even when OIDC is not used

Reviewed-on: coop-cloud/monitoring-ng#6
Co-authored-by: p4u1 <p4u1_f4u1@riseup.net>
Co-committed-by: p4u1 <p4u1_f4u1@riseup.net>
2026-03-20 09:10:48 +00:00
83461e2e76 remove default TIMEOUT (abra #596) 2025-12-30 13:53:47 +01:00
9 changed files with 190 additions and 30 deletions

View File

@ -1,8 +1,8 @@
TYPE=monitoring-ng
LETS_ENCRYPT_ENV=production
COMPOSE_FILE=compose.yml
DOMAIN=monitoring.example.com
TIMEOUT=120
DOMAIN=monitoring-ng.example.com
#TIMEOUT=120
ENABLE_BACKUPS=true
## Enable this secret for Promtail / Prometheus
@ -50,6 +50,7 @@ ENABLE_BACKUPS=true
#GRAFANA_DOMAIN=grafana.example.com
#
## Single-Sign-On with OIDC
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana-oidc.yml"
# OIDC_ENABLED=1
# SECRET_GRAFANA_OIDC_CLIENT_SECRET_VERSION=v1
# OIDC_CLIENT_ID=grafana
@ -62,6 +63,7 @@ ENABLE_BACKUPS=true
# GF_INSTALL_PLUGINS=grafana-piechart-panel
#
## grafana SMTP configuration (optional)
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana-smtp.yml"
# GF_SMTP_HOST=changeme
# GF_SMTP_USER=changeme
# GF_SMTP_ENABLED=true

View File

@ -5,18 +5,18 @@ export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v2
export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v2
export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v2
export GRAFANA_BACKUP_DASHBOARD_JSON_VERSION=v1
export GRAFANA_ALERTS_JSON_VERSION=v3
export GRAFANA_CUSTOM_INI_VERSION=v4
export PROMTAIL_YML_VERSION=v3
export LOKI_YML_VERSION=v2
export PROMETHEUS_YML_VERSION=v2
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
export GRAFANA_ALERTS_NODE_VERSION=v1c
# creates a default prometheus scrape config for a given node
add_node(){
name=$1
add_domain "$name" "$name:8082"
add_domain "$name" "metrics.traefik.$name"
add_domain "$name" "node.monitoring.$name"
add_domain "$name" "cadvisor.monitoring.$name"
cat "/prometheus/scrape_configs/$name.yml"

131
alerts/node.yml.tmpl Normal file
View File

@ -0,0 +1,131 @@
# Grafana alert-rule provisioning for the "node" rule group.
#
# This file is a Go template. Each rule is toggled by an environment variable:
#   ALERT_NODE_DISK_SPACE_ENABLED    -> rule "Node Disk Space"
#   ALERT_NODE_MEMORY_USAGE_ENABLED  -> rule "Node Memory Usage"
# A rule whose variable is not exactly "true" is omitted from the group and
# listed under deleteRules instead.
apiVersion: 1

# List of alert rule UIDs that should be deleted
deleteRules:
{{ if ne (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
  - orgId: 1
    uid: bds8bhxu97pxca
{{ end }}
{{ if ne (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
  - orgId: 1
    uid: ads8cswmly96oa
{{ end }}

groups:
  - orgId: 1
    name: node
    folder: node
    interval: 5m
    rules:
{{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
      # Fires when the free-space percentage of an ext4 filesystem is below 10.
      - uid: bds8bhxu97pxca
        title: Node Disk Space
        condition: C
        data:
          # A: percentage of free bytes per ext4 filesystem.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: (node_filesystem_free_bytes{fstype="ext4"} / node_filesystem_size_bytes{fstype="ext4"}) * 100
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          # C: threshold expression over A (A < 10).
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 10
                    type: lt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          description: ""
          runbook_url: ""
          # The backtick-quoted parts are emitted literally by this template so
          # that the alert's own template placeholders survive rendering.
          summary: 'Less than 10% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)'
        # NOTE(review): the empty label pair looks like a UI-export artifact;
        # confirm whether it can be dropped.
        labels:
          "": ""
        isPaused: false
{{ end }}
{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
      # Fires when the used-memory percentage of a node is above 85.
      - uid: ads8cswmly96oa
        title: Node Memory Usage
        condition: C
        data:
          # A: percentage of memory in use. MemAvailable / MemTotal is the
          # *available* share, so it is subtracted from 1 to obtain usage;
          # otherwise the "> 85" threshold would fire on mostly idle nodes.
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: PBFA97CFB590B2093
            model:
              editorMode: code
              expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          # C: threshold expression over A (A > 85).
          - refId: C
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 85
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: 'Memory usage is above 85% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)'
        isPaused: false
{{ end }}

17
compose.grafana-oidc.yml Normal file
View File

@ -0,0 +1,17 @@
# Optional compose overlay: Grafana single sign-on via OIDC.
# Enable it by appending this file to COMPOSE_FILE. It is kept separate from
# the base compose file so the OIDC client secret is only required when OIDC
# is actually used.
version: '3.8'

services:
  grafana:
    # Values are passed through from the deployment environment.
    environment:
      - OIDC_API_URL
      - OIDC_AUTH_URL
      - OIDC_CLIENT_ID
      - OIDC_ENABLED
      - OIDC_TOKEN_URL
    secrets:
      - grafana_oidc_client_secret

secrets:
  # Created outside this stack; the name carries a version suffix.
  grafana_oidc_client_secret:
    external: true
    name: ${STACK_NAME}_grafana_oidc_client_secret_${SECRET_GRAFANA_OIDC_CLIENT_SECRET_VERSION}

18
compose.grafana-smtp.yml Normal file
View File

@ -0,0 +1,18 @@
# Optional compose overlay: Grafana SMTP configuration.
# Enable it by appending this file to COMPOSE_FILE.
version: '3.8'

services:
  grafana:
    environment:
      - GF_SMTP_HOST
      - GF_SMTP_USER
      # The password is read from the mounted secret file, not from the env.
      - GF_SMTP_PASSWORD__FILE=/run/secrets/grafana_smtp_password
      - GF_SMTP_ENABLED
      - GF_SMTP_FROM_ADDRESS
      - GF_SMTP_SKIP_VERIFY
    secrets:
      - grafana_smtp_password

secrets:
  # Created outside this stack; the name carries a version suffix.
  grafana_smtp_password:
    external: true
    name: ${STACK_NAME}_grafana_smtp_password_${SECRET_GRAFANA_SMTP_PASSWORD_VERSION}

View File

@ -7,8 +7,6 @@ services:
- grafana-data:/var/lib/grafana:rw
secrets:
- grafana_admin_password
- grafana_oidc_client_secret
- grafana_smtp_password
configs:
- source: grafana_custom_ini
target: /etc/grafana/grafana.ini
@ -24,27 +22,18 @@ services:
target: /var/lib/grafana/dashboards/traefik.json
- source: grafana_backup_dashboard_json
target: /var/lib/grafana/dashboards/backup.json
- source: grafana_alerts_json
target: /var/lib/grafana/alerts/alerts.json
- source: gf_alerts_node
target: /etc/grafana/provisioning/alerting/node.yml
networks:
- proxy
- internal
environment:
- GF_SERVER_ROOT_URL
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
- GF_SMTP_HOST
- GF_SMTP_USER
- GF_SMTP_PASSWORD__FILE=/run/secrets/grafana_smtp_password
- GF_SMTP_ENABLED
- GF_SMTP_FROM_ADDRESS
- GF_SMTP_SKIP_VERIFY
- GF_SECURITY_ALLOW_EMBEDDING
- GF_INSTALL_PLUGINS
- OIDC_API_URL
- OIDC_AUTH_URL
- OIDC_CLIENT_ID
- OIDC_ENABLED
- OIDC_TOKEN_URL
- ALERT_NODE_DISK_SPACE_ENABLED
- ALERT_NODE_MEMORY_USAGE_ENABLED
deploy:
labels:
- "traefik.enable=true"
@ -84,10 +73,10 @@ configs:
grafana_backup_dashboard_json:
name: ${STACK_NAME}_g_backup_dashboard_json_${GRAFANA_BACKUP_DASHBOARD_JSON_VERSION}
file: grafana-backup-dashboard.json
grafana_alerts_json:
gf_alerts_node:
template_driver: golang
name: ${STACK_NAME}_g_alerts_json_${GRAFANA_ALERTS_JSON_VERSION}
file: grafana-alerts.json.tmpl
name: ${STACK_NAME}_gf_alerts_node_${GRAFANA_ALERTS_NODE_VERSION}
file: alerts/node.yml.tmpl
volumes:
grafana-data:
@ -97,9 +86,3 @@ secrets:
grafana_admin_password:
external: true
name: ${STACK_NAME}_grafana_admin_password_${SECRET_GRAFANA_ADMIN_PASSWORD_VERSION}
grafana_oidc_client_secret:
external: true
name: ${STACK_NAME}_grafana_oidc_client_secret_${SECRET_GRAFANA_OIDC_CLIENT_SECRET_VERSION}
grafana_smtp_password:
external: true
name: ${STACK_NAME}_grafana_smtp_password_${SECRET_GRAFANA_SMTP_PASSWORD_VERSION}

View File

@ -40,7 +40,7 @@ services:
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
- "coop-cloud.${STACK_NAME}.version=1.6.0+v1.8.1"
- "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT:-120}"
- "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT}"
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.49.2

9
release/next Normal file
View File

@ -0,0 +1,9 @@
1. OIDC was moved into a separate compose file. If you have OIDC configured, you need to add the following line to your .env file:
COMPOSE_FILE="$COMPOSE_FILE:compose.grafana-oidc.yml"
2. SMTP was moved into a separate compose file. If you have SMTP configured, you need to add the following line to your .env file:
COMPOSE_FILE="$COMPOSE_FILE:compose.grafana-smtp.yml"
3. The scrape-config.example.yml file and the add_node() command were updated to use a secure (HTTPS) endpoint for the Traefik metrics instead of HTTP. This requires an updated Traefik recipe that publishes the metrics over HTTPS.

View File

@ -1,4 +1,4 @@
- targets:
- 'example.org:8082'
- 'metrics.traefik.example.org'
- 'node.monitoring.example.org'
- 'cadvisor.monitoring.example.org'