4 Commits

Author SHA1 Message Date
9cb997b25a delete_request_store based on env variable 2026-04-09 04:36:03 +00:00
48d137d194 update loki config file 2026-04-09 04:36:03 +00:00
1acb5ebd6a chore: update image tags 2026-04-09 04:36:03 +00:00
99f8790ec4 fix: Update scape-config example to use HTTPS for Traefik metrics (#17)
This fixes the insecure Traefik metrics endpoint. See coop-cloud/traefik#94 for details.

Reviewed-on: #17
Co-authored-by: Danny Groenewegen <mail@dannygroenewegen.nl>
Co-committed-by: Danny Groenewegen <mail@dannygroenewegen.nl>
2026-03-24 09:37:05 +00:00
15 changed files with 36 additions and 235 deletions

View File

@ -44,10 +44,10 @@ ENABLE_BACKUPS=true
## Grafana
#
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml"
## GRAFANA_DOMAIN needs to be set. change it for a different domain
#GRAFANA_DOMAIN=$DOMAIN
# GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
# GF_SERVER_ROOT_URL=https://monitoring.example.com
# SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
## Separate domain for Grafana
#GRAFANA_DOMAIN=grafana.example.com
#
## Single-Sign-On with OIDC
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana-oidc.yml"
@ -85,5 +85,3 @@ ENABLE_BACKUPS=true
#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true
#ALERT_NODE_DISK_SPACE_ENABLED=true
#ALERT_NODE_MEMORY_USAGE_ENABLED=true
#ALERT_RESTIC_CHECK_FAILED_ENABLED=true
#ALERT_RESTIC_OUTDATED_BACKUP_ENABLED=true

View File

@ -7,17 +7,16 @@ export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v2
export GRAFANA_BACKUP_DASHBOARD_JSON_VERSION=v1
export GRAFANA_CUSTOM_INI_VERSION=v4
export PROMTAIL_YML_VERSION=v3
export LOKI_YML_VERSION=v2
export LOKI_YML_VERSION=v3
export PROMETHEUS_YML_VERSION=v2
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
export GRAFANA_ALERTS_NODE_VERSION=v1c
export GRAFANA_ALERTS_RESTIC_VERSION=v2
# creates a default prometheus scrape config for a given node
add_node(){
name=$1
add_domain "$name" "$name:8082"
add_domain "$name" "metrics.traefik.$name"
add_domain "$name" "node.monitoring.$name"
add_domain "$name" "cadvisor.monitoring.$name"
cat "/prometheus/scrape_configs/$name.yml"

View File

@ -1,191 +0,0 @@
apiVersion: 1
deleteRules:
{{ if ne (env "ALERT_RESTIC_CHECK_FAILED_ENABLED") "true" }}
- orgId: 1
uid: ffglj6egxy8e8c
{{ end }}
{{ if ne (env "ALERT_RESTIC_OUTDATED_BACKUP_ENABLED") "true" }}
- orgId: 1
uid: ffgljntkp9ce8b
{{ end }}
groups:
- orgId: 1
name: restic
folder: restic
interval: 5m
rules:
{{ if eq (env "ALERT_RESTIC_CHECK_FAILED_ENABLED") "true" }}
- uid: ffglj6egxy8e8c
title: Restic Check Failed
condition: C
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
disableTextWrap: false
editorMode: builder
expr: restic_check_success
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params: []
type: gt
operator:
type: and
query:
params:
- B
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
reducer: last
refId: B
type: reduce
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 1
- 0
type: lt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: B
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: Alerting
execErrState: Error
for: 5m
annotations: {}
labels: {}
isPaused: false
{{ end }}
{{ if eq (env "ALERT_RESTIC_OUTDATED_BACKUP_ENABLED") "true" }}
- uid: ffgljntkp9ce8b
title: Restic Outdated Backup
condition: C
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
disableTextWrap: false
editorMode: builder
expr: time() - max by(instance) (restic_backup_timestamp)
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params: []
type: gt
operator:
type: and
query:
params:
- B
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
reducer: last
refId: B
type: reduce
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 93600
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: B
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
annotations: {}
labels: {}
isPaused: false
{{ end }}

View File

@ -2,7 +2,7 @@ version: '3.8'
services:
grafana:
image: grafana/grafana:10.4.14
image: grafana/grafana:12.4.0
volumes:
- grafana-data:/var/lib/grafana:rw
secrets:
@ -24,8 +24,6 @@ services:
target: /var/lib/grafana/dashboards/backup.json
- source: gf_alerts_node
target: /etc/grafana/provisioning/alerting/node.yml
- source: gf_alerts_restic
target: /etc/grafana/provisioning/alerting/restic.yml
networks:
- proxy
- internal
@ -36,15 +34,12 @@ services:
- GF_INSTALL_PLUGINS
- ALERT_NODE_DISK_SPACE_ENABLED
- ALERT_NODE_MEMORY_USAGE_ENABLED
- ALERT_RESTIC_CHECK_FAILED_ENABLED
- ALERT_RESTIC_OUTDATED_BACKUP_ENABLED
- DOMAIN
deploy:
labels:
- "traefik.enable=true"
- "traefik.docker.network=proxy"
- "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN:-$DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
- "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
@ -82,10 +77,6 @@ configs:
template_driver: golang
name: ${STACK_NAME}_gf_alerts_node_${GRAFANA_ALERTS_NODE_VERSION}
file: alerts/node.yml.tmpl
gf_alerts_restic:
template_driver: golang
name: ${STACK_NAME}_gf_alerts_restiv_${GRAFANA_ALERTS_RESTIC_VERSION}
file: alerts/restic.yml.tmpl
volumes:
grafana-data:

View File

@ -2,7 +2,7 @@ version: '3.8'
services:
loki:
image: grafana/loki:2.9.11
image: grafana/loki:3.6.7
command: -config.file=/etc/loki/local-config.yaml
networks:
- proxy

View File

@ -2,7 +2,7 @@ version: '3.8'
services:
matrix-alertmanager-receiver:
image: metio/matrix-alertmanager-receiver:2025.2.9
image: metio/matrix-alertmanager-receiver:2026.2.25
secrets:
- matrix_access_token
configs:

View File

@ -2,7 +2,7 @@ version: '3.8'
services:
prometheus:
image: prom/prometheus:v2.55.1
image: prom/prometheus:v3.10.0
secrets:
- basic_auth
volumes:
@ -39,8 +39,3 @@ configs:
volumes:
prometheus-data:
secrets:
basic_auth:
external: true
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}

View File

@ -2,7 +2,7 @@ version: "3.8"
services:
promtail:
image: grafana/promtail:2.9.11
image: grafana/promtail:3.6.7
volumes:
- /var/log:/var/log:ro
- /var/run/docker.sock:/var/run/docker.sock
@ -27,4 +27,4 @@ configs:
secrets:
basic_auth:
external: true
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}

View File

@ -2,7 +2,7 @@ version: '3.8'
services:
pushgateway:
image: prom/pushgateway:v1.10.0
image: prom/pushgateway:v1.11.2
command:
- '--web.listen-address=:9191'
- '--push.disable-consistency-check'

View File

@ -3,7 +3,7 @@ version: "3.8"
services:
app:
image: prom/node-exporter:v1.8.1
image: prom/node-exporter:v1.10.2
user: root
environment:
- NODE_ID={{.Node.ID}}
@ -43,7 +43,7 @@ services:
- "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT}"
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.49.2
image: gcr.io/cadvisor/cadvisor:v0.55.1
command:
- "-logtostderr"
- "--enable_metrics=cpu,cpuLoad,disk,diskIO,process,memory,network"

View File

@ -11,3 +11,13 @@ providers:
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: true
- name: 'default-alert-provider'
orgId: 1
folder: 'default-alerts'
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/alerts
foldersFromFilesStructure: true

View File

@ -3,7 +3,6 @@ apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
uid: prometheus
access: proxy
orgId: 1
url: http://prometheus:9090
@ -11,7 +10,6 @@ datasources:
editable: false
- name: Loki
type: loki
uid: loki
access: proxy
orgId: 1
url: http://loki:3100

View File

@ -34,7 +34,6 @@ ingester:
max_chunk_age: 1h # All chunks will be flushed when they hit this age, default is 1h
chunk_target_size: 1048576 # Loki will attempt to build chunks up to 1MB (1048576 bytes), flushing earlier if chunk_idle_period or max_chunk_age is reached first
chunk_retain_period: 30s # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
max_transfer_retries: 0 # Chunk transfers disabled
wal:
dir: "/tmp/wal"
@ -53,7 +52,7 @@ schema_config:
- from: 2020-10-24
store: boltdb-shipper
object_store: filesystem
schema: v11
schema: v13
index:
prefix: index_
period: 24h
@ -63,7 +62,6 @@ storage_config:
active_index_directory: /loki/boltdb-shipper-active
cache_location: /loki/boltdb-shipper-cache
cache_ttl: 24h # Can be increased for faster performance over longer query periods, uses more disk space
shared_store: filesystem
filesystem:
directory: /loki/chunks
{{ end }}
@ -72,7 +70,6 @@ schema_config:
configs:
- from: 2020-11-25
store: boltdb-shipper
object_store: aws
schema: v11
index:
prefix: index_
@ -103,19 +100,24 @@ storage_config:
compactor:
working_directory: /loki/boltdb-shipper-compactor
shared_store: filesystem
compaction_interval: 10m
retention_enabled: true
retention_delete_delay: 2h
retention_delete_worker_count: 150
{{ if eq (env "LOKI_STORAGE_FILESYSTEM") "1" }}
delete_request_store: filesystem
{{ end }}
{{ if eq (env "LOKI_STORAGE_S3") "1" }}
delete_request_store: aws
{{ end }}
limits_config:
enforce_metric_name: false
reject_old_samples: true
reject_old_samples_max_age: 168h
retention_period: {{ env "LOKI_RETENTION_PERIOD" }}
split_queries_by_interval: 24h
max_query_parallelism: 100
allow_structured_metadata: false
query_scheduler:
max_outstanding_requests_per_tenant: 4096
@ -123,9 +125,6 @@ query_scheduler:
frontend:
max_outstanding_per_tenant: 4096
chunk_store_config:
max_look_back_period: 0s
table_manager:
retention_deletes_enabled: false
retention_period: 0s

View File

@ -5,3 +5,5 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.grafana-oidc.yml"
2. SMTP was moved into a separate compose file. If you have SMTP configured, you need to add the following line to your .env file:
COMPOSE_FILE="$COMPOSE_FILE:compose.grafana-smtp.yml"
3. The scape-config.example.yml file and the add_node() command were updated to use a secure HTTPS endpoint for the Traefik metrics instead of HTTP. This requires an updated Traefik recipe that publishes the metrics over HTTPS.

View File

@ -1,4 +1,4 @@
- targets:
- 'example.org:8082'
- 'metrics.traefik.example.org'
- 'node.monitoring.example.org'
- 'cadvisor.monitoring.example.org'