Compare commits
3 Commits
configurab
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 9cb997b25a | |||
| 48d137d194 | |||
| 1acb5ebd6a |
16
.env.sample
16
.env.sample
@ -6,8 +6,7 @@ DOMAIN=monitoring-ng.example.com
|
||||
ENABLE_BACKUPS=true
|
||||
|
||||
## Enable this secret for Promtail / Prometheus
|
||||
#COMPOSE_FILE="$COMPOSE_FILE:compose.basic-auth.yml"
|
||||
#SECRET_BASIC_AUTH_VERSION=v1
|
||||
# SECRET_BASIC_AUTH_VERSION=v1
|
||||
#
|
||||
# Promtail (Gathering Logs)
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml"
|
||||
@ -80,10 +79,9 @@ ENABLE_BACKUPS=true
|
||||
#GF_MATRIX_ROOM_ID="<room-id>"
|
||||
#GF_MATRIX_HOMESERVER_URL="<homeserver-url>"
|
||||
|
||||
## Alerts
|
||||
|
||||
# The node disk space alert triggers when the free disk space drops below the given percentage
|
||||
#ALERT_NODE_DISK_SPACE_LEFT=10
|
||||
|
||||
# The node memory usage alert triggers when memory usage rises above the given percentage
|
||||
#ALERT_NODE_MEMORY_USAGE=85
|
||||
# Alerts
|
||||
#ALERT_BACKUP_FAILED_ENABLED=true
|
||||
#ALERT_BACKUP_MISSING_ENABLED=true
|
||||
#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true
|
||||
#ALERT_NODE_DISK_SPACE_ENABLED=true
|
||||
#ALERT_NODE_MEMORY_USAGE_ENABLED=true
|
||||
|
||||
12
README.md
12
README.md
@ -156,9 +156,13 @@ GF_MATRIX_HOME_SERVER_URL=
|
||||
```
|
||||
4. Configure Alertmanager webhook and set the url to `http://matrix-alertmanager-receiver:12345/alerts/<room-id>`
|
||||
|
||||
## Alerts
|
||||
## alerts
|
||||
|
||||
The following alerts can be enabled by setting the corresponding env variable to `true`:
|
||||
- backupbot failed: `ALERT_BACKUP_FAILED_ENABLED`
|
||||
- backupbot missing: `ALERT_BACKUP_MISSING_ENABLED`
|
||||
- backupbot not successful: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED`
|
||||
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
|
||||
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`
|
||||
|
||||
The following alerts can be enabled by uncommenting the corresponding env variable:
|
||||
|
||||
- node disk space: `ALERT_NODE_DISK_SPACE_LEFT`
|
||||
- node memory usage: `ALERT_NODE_MEMORY_USAGE`
|
||||
|
||||
8
abra.sh
8
abra.sh
@ -7,11 +7,11 @@ export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v2
|
||||
export GRAFANA_BACKUP_DASHBOARD_JSON_VERSION=v1
|
||||
export GRAFANA_CUSTOM_INI_VERSION=v4
|
||||
export PROMTAIL_YML_VERSION=v3
|
||||
export LOKI_YML_VERSION=v2
|
||||
export LOKI_YML_VERSION=v3
|
||||
export PROMETHEUS_YML_VERSION=v2
|
||||
export MATRIX_ALERTMANAGER_CONFIG_VERSION=v1
|
||||
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=v1
|
||||
export GRAFANA_ALERTS_NODE_VERSION=v2
|
||||
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
|
||||
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
|
||||
export GRAFANA_ALERTS_NODE_VERSION=v1c
|
||||
|
||||
# creates a default prometheus scrape config for a given node
|
||||
add_node(){
|
||||
|
||||
@ -2,13 +2,13 @@ apiVersion: 1
|
||||
|
||||
# List of alert rule UIDs that should be deleted
|
||||
deleteRules:
|
||||
{{ if not (env "ALERT_NODE_DISK_SPACE_LEFT") }}
|
||||
{{ if ne (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
|
||||
- orgId: 1
|
||||
uid: coopcloud_node_disk_space_left
|
||||
uid: bds8bhxu97pxca
|
||||
{{ end }}
|
||||
{{ if not (env "ALERT_NODE_MEMORY_USAGE") }}
|
||||
{{ if ne (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
|
||||
- orgId: 1
|
||||
uid: coopcloud_node_memory_usage
|
||||
uid: ads8cswmly96oa
|
||||
{{ end }}
|
||||
|
||||
groups:
|
||||
@ -17,8 +17,8 @@ groups:
|
||||
folder: node
|
||||
interval: 5m
|
||||
rules:
|
||||
{{ if (env "ALERT_NODE_DISK_SPACE_LEFT") }}
|
||||
- uid: coopcloud_node_disk_space_left
|
||||
{{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
|
||||
- uid: bds8bhxu97pxca
|
||||
title: Node Disk Space
|
||||
condition: C
|
||||
data:
|
||||
@ -45,7 +45,7 @@ groups:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}
|
||||
- 10
|
||||
type: lt
|
||||
operator:
|
||||
type: and
|
||||
@ -70,13 +70,13 @@ groups:
|
||||
annotations:
|
||||
description: ""
|
||||
runbook_url: ""
|
||||
summary: Less than {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
|
||||
summary: Less than 10% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
|
||||
labels:
|
||||
"": ""
|
||||
isPaused: false
|
||||
{{ end }}
|
||||
{{ if (env "ALERT_NODE_MEMORY_USAGE") }}
|
||||
- uid: coopcloud_node_memory_usage
|
||||
{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
|
||||
- uid: ads8cswmly96oa
|
||||
title: Node Memory Usage
|
||||
condition: C
|
||||
data:
|
||||
@ -103,7 +103,7 @@ groups:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- {{ env "ALERT_NODE_MEMORY_USAGE" }}
|
||||
- 85
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
@ -126,6 +126,6 @@ groups:
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: Memory usage is above {{ env "ALERT_NODE_MEMORY_USAGE" }}% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
|
||||
summary: Memory usage is above 85% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
|
||||
isPaused: false
|
||||
{{ end }}
|
||||
|
||||
@ -1,7 +0,0 @@
|
||||
---
|
||||
version: "3.8"
|
||||
|
||||
secrets:
|
||||
basic_auth:
|
||||
external: true
|
||||
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}
|
||||
@ -2,7 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
grafana:
|
||||
image: grafana/grafana:10.4.14
|
||||
image: grafana/grafana:12.4.0
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana:rw
|
||||
secrets:
|
||||
@ -32,8 +32,8 @@ services:
|
||||
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
|
||||
- GF_SECURITY_ALLOW_EMBEDDING
|
||||
- GF_INSTALL_PLUGINS
|
||||
- ALERT_NODE_DISK_SPACE_LEFT
|
||||
- ALERT_NODE_MEMORY_USAGE
|
||||
- ALERT_NODE_DISK_SPACE_ENABLED
|
||||
- ALERT_NODE_MEMORY_USAGE_ENABLED
|
||||
deploy:
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
|
||||
@ -2,7 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
loki:
|
||||
image: grafana/loki:2.9.11
|
||||
image: grafana/loki:3.6.7
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
networks:
|
||||
- proxy
|
||||
|
||||
@ -2,7 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
matrix-alertmanager-receiver:
|
||||
image: metio/matrix-alertmanager-receiver:2025.2.9
|
||||
image: metio/matrix-alertmanager-receiver:2026.2.25
|
||||
secrets:
|
||||
- matrix_access_token
|
||||
configs:
|
||||
|
||||
@ -2,7 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.55.1
|
||||
image: prom/prometheus:v3.10.0
|
||||
secrets:
|
||||
- basic_auth
|
||||
volumes:
|
||||
|
||||
@ -2,7 +2,7 @@ version: "3.8"
|
||||
|
||||
services:
|
||||
promtail:
|
||||
image: grafana/promtail:2.9.11
|
||||
image: grafana/promtail:3.6.7
|
||||
volumes:
|
||||
- /var/log:/var/log:ro
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
@ -23,3 +23,8 @@ configs:
|
||||
name: ${STACK_NAME}_promtail_yml_${PROMTAIL_YML_VERSION}
|
||||
file: promtail.yml.tmpl
|
||||
template_driver: golang
|
||||
|
||||
secrets:
|
||||
basic_auth:
|
||||
external: true
|
||||
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}
|
||||
|
||||
@ -2,7 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
pushgateway:
|
||||
image: prom/pushgateway:v1.10.0
|
||||
image: prom/pushgateway:v1.11.2
|
||||
command:
|
||||
- '--web.listen-address=:9191'
|
||||
- '--push.disable-consistency-check'
|
||||
|
||||
@ -3,7 +3,7 @@ version: "3.8"
|
||||
|
||||
services:
|
||||
app:
|
||||
image: prom/node-exporter:v1.8.1
|
||||
image: prom/node-exporter:v1.10.2
|
||||
user: root
|
||||
environment:
|
||||
- NODE_ID={{.Node.ID}}
|
||||
@ -43,7 +43,7 @@ services:
|
||||
- "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT}"
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:v0.49.2
|
||||
image: gcr.io/cadvisor/cadvisor:v0.55.1
|
||||
command:
|
||||
- "-logtostderr"
|
||||
- "--enable_metrics=cpu,cpuLoad,disk,diskIO,process,memory,network"
|
||||
|
||||
@ -34,7 +34,6 @@ ingester:
|
||||
max_chunk_age: 1h # All chunks will be flushed when they hit this age, default is 1h
|
||||
chunk_target_size: 1048576 # Loki will attempt to build chunks up to 1 MiB (1048576 bytes), flushing earlier if chunk_idle_period or max_chunk_age is reached first
|
||||
chunk_retain_period: 30s # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
|
||||
max_transfer_retries: 0 # Chunk transfers disabled
|
||||
wal:
|
||||
dir: "/tmp/wal"
|
||||
|
||||
@ -53,7 +52,7 @@ schema_config:
|
||||
- from: 2020-10-24
|
||||
store: boltdb-shipper
|
||||
object_store: filesystem
|
||||
schema: v11
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
@ -63,7 +62,6 @@ storage_config:
|
||||
active_index_directory: /loki/boltdb-shipper-active
|
||||
cache_location: /loki/boltdb-shipper-cache
|
||||
cache_ttl: 24h # Can be increased for faster performance over longer query periods, uses more disk space
|
||||
shared_store: filesystem
|
||||
filesystem:
|
||||
directory: /loki/chunks
|
||||
{{ end }}
|
||||
@ -72,7 +70,6 @@ schema_config:
|
||||
configs:
|
||||
- from: 2020-11-25
|
||||
store: boltdb-shipper
|
||||
object_store: aws
|
||||
schema: v11
|
||||
index:
|
||||
prefix: index_
|
||||
@ -103,19 +100,24 @@ storage_config:
|
||||
|
||||
compactor:
|
||||
working_directory: /loki/boltdb-shipper-compactor
|
||||
shared_store: filesystem
|
||||
compaction_interval: 10m
|
||||
retention_enabled: true
|
||||
retention_delete_delay: 2h
|
||||
retention_delete_worker_count: 150
|
||||
{{ if eq (env "LOKI_STORAGE_FILESYSTEM") "1" }}
|
||||
delete_request_store: filesystem
|
||||
{{ end }}
|
||||
{{ if eq (env "LOKI_STORAGE_S3") "1" }}
|
||||
delete_request_store: aws
|
||||
{{ end }}
|
||||
|
||||
limits_config:
|
||||
enforce_metric_name: false
|
||||
reject_old_samples: true
|
||||
reject_old_samples_max_age: 168h
|
||||
retention_period: {{ env "LOKI_RETENTION_PERIOD" }}
|
||||
split_queries_by_interval: 24h
|
||||
max_query_parallelism: 100
|
||||
allow_structured_metadata: false
|
||||
|
||||
query_scheduler:
|
||||
max_outstanding_requests_per_tenant: 4096
|
||||
@ -123,9 +125,6 @@ query_scheduler:
|
||||
frontend:
|
||||
max_outstanding_per_tenant: 4096
|
||||
|
||||
chunk_store_config:
|
||||
max_look_back_period: 0s
|
||||
|
||||
table_manager:
|
||||
retention_deletes_enabled: false
|
||||
retention_period: 0s
|
||||
Reference in New Issue
Block a user