Compare commits

...

2 Commits

8 changed files with 215 additions and 15 deletions

View File

@ -44,10 +44,10 @@ ENABLE_BACKUPS=true
## Grafana
#
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml"
# GF_SERVER_ROOT_URL=https://monitoring.example.com
## GRAFANA_DOMAIN must be set. Change it to serve Grafana on a different domain.
#GRAFANA_DOMAIN=$DOMAIN
# GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
# SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
## Separate domain for Grafana
#GRAFANA_DOMAIN=grafana.example.com
#
## Single-Sign-On with OIDC
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana-oidc.yml"
@ -85,3 +85,5 @@ ENABLE_BACKUPS=true
#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true
#ALERT_NODE_DISK_SPACE_ENABLED=true
#ALERT_NODE_MEMORY_USAGE_ENABLED=true
#ALERT_RESTIC_CHECK_FAILED_ENABLED=true
#ALERT_RESTIC_OUTDATED_BACKUP_ENABLED=true

View File

@ -12,6 +12,7 @@ export PROMETHEUS_YML_VERSION=v2
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
export GRAFANA_ALERTS_NODE_VERSION=v1c
export GRAFANA_ALERTS_RESTIC_VERSION=v2
# creates a default prometheus scrape config for a given node
add_node(){

191
alerts/restic.yml.tmpl Normal file
View File

@ -0,0 +1,191 @@
# Grafana alerting provisioning file for the restic alert rules.
# The file is a Go template: the env lookups below decide per rule whether it
# is provisioned (under groups/rules) or listed for deletion (under deleteRules).
apiVersion: 1
# A rule uid is listed here only while its ALERT_RESTIC_*_ENABLED variable is
# not exactly "true"; the same uid is then omitted from the rules list below.
# NOTE(review): when both variables are "true" this key renders with no
# entries (null) - confirm Grafana accepts that.
deleteRules:
{{ if ne (env "ALERT_RESTIC_CHECK_FAILED_ENABLED") "true" }}
- orgId: 1
uid: ffglj6egxy8e8c
{{ end }}
{{ if ne (env "ALERT_RESTIC_OUTDATED_BACKUP_ENABLED") "true" }}
- orgId: 1
uid: ffgljntkp9ce8b
{{ end }}
# Single rule group "restic" in folder "restic" with a 5m interval.
groups:
- orgId: 1
name: restic
folder: restic
interval: 5m
rules:
{{ if eq (env "ALERT_RESTIC_CHECK_FAILED_ENABLED") "true" }}
# Rule "Restic Check Failed": only rendered when
# ALERT_RESTIC_CHECK_FAILED_ENABLED is exactly "true".
# The alert condition is the result of expression C (A -> B -> C).
- uid: ffglj6egxy8e8c
title: Restic Check Failed
condition: C
data:
# A: instant query of restic_check_success against the datasource with
# uid "prometheus".
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
disableTextWrap: false
editorMode: builder
expr: restic_check_success
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
# B: reduce expression that takes the last value of A.
# NOTE(review): the conditions list below looks like editor-export
# boilerplate; the reduce itself is described by expression/reducer/type -
# confirm before trimming.
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params: []
type: gt
operator:
type: and
query:
params:
- B
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
reducer: last
refId: B
type: reduce
# C: threshold expression on B with evaluator type "lt" and first param 1,
# i.e. presumably true when B < 1 (the second param 0 is presumably unused
# for "lt" - confirm).
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 1
- 0
type: lt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: B
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
# Missing data is mapped to the Alerting state for this rule; execution
# errors map to Error. The condition must hold for 5m.
noDataState: Alerting
execErrState: Error
for: 5m
annotations: {}
labels: {}
isPaused: false
{{ end }}
{{ if eq (env "ALERT_RESTIC_OUTDATED_BACKUP_ENABLED") "true" }}
# Rule "Restic Outdated Backup": only rendered when
# ALERT_RESTIC_OUTDATED_BACKUP_ENABLED is exactly "true".
# The alert condition is the result of expression C (A -> B -> C).
- uid: ffgljntkp9ce8b
title: Restic Outdated Backup
condition: C
data:
# A: instant query against the datasource with uid "prometheus":
# current time minus the highest restic_backup_timestamp per instance.
# Presumably the metric is a unix timestamp in seconds, making the result
# the age of the newest backup in seconds - confirm against the exporter.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheus
model:
disableTextWrap: false
editorMode: builder
expr: time() - max by(instance) (restic_backup_timestamp)
fullMetaSearch: false
includeNullMetadata: true
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
useBackend: false
# B: reduce expression that takes the last value of A.
# NOTE(review): the conditions list below looks like editor-export
# boilerplate; the reduce itself is described by expression/reducer/type -
# confirm before trimming.
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params: []
type: gt
operator:
type: and
query:
params:
- B
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
reducer: last
refId: B
type: reduce
# C: threshold expression on B with evaluator type "gt" and param 93600
# (26 hours if the value is in seconds).
- refId: C
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 93600
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: B
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
# Missing data maps to the NoData state here (the "Restic Check Failed" rule
# uses Alerting instead); execution errors map to Error.
# NOTE(review): confirm that a missing restic_backup_timestamp metric should
# not raise this alert.
noDataState: NoData
execErrState: Error
for: 5m
annotations: {}
labels: {}
isPaused: false
{{ end }}

View File

@ -24,6 +24,8 @@ services:
target: /var/lib/grafana/dashboards/backup.json
- source: gf_alerts_node
target: /etc/grafana/provisioning/alerting/node.yml
- source: gf_alerts_restic
target: /etc/grafana/provisioning/alerting/restic.yml
networks:
- proxy
- internal
@ -34,12 +36,15 @@ services:
- GF_INSTALL_PLUGINS
- ALERT_NODE_DISK_SPACE_ENABLED
- ALERT_NODE_MEMORY_USAGE_ENABLED
- ALERT_RESTIC_CHECK_FAILED_ENABLED
- ALERT_RESTIC_OUTDATED_BACKUP_ENABLED
- DOMAIN
deploy:
labels:
- "traefik.enable=true"
- "traefik.docker.network=proxy"
- "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN:-$DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
- "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
@ -77,6 +82,10 @@ configs:
template_driver: golang
name: ${STACK_NAME}_gf_alerts_node_${GRAFANA_ALERTS_NODE_VERSION}
file: alerts/node.yml.tmpl
# Restic alert rules for Grafana, rendered with the golang template driver.
# The config name carries GRAFANA_ALERTS_RESTIC_VERSION as a suffix and follows
# the same pattern as gf_alerts_node: <stack>_gf_alerts_<topic>_<version>.
gf_alerts_restic:
  template_driver: golang
  name: ${STACK_NAME}_gf_alerts_restic_${GRAFANA_ALERTS_RESTIC_VERSION}
  file: alerts/restic.yml.tmpl
volumes:
grafana-data:

View File

@ -39,3 +39,8 @@ configs:
volumes:
prometheus-data:
secrets:
basic_auth:
external: true
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}

View File

@ -27,4 +27,4 @@ configs:
secrets:
basic_auth:
external: true
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}

View File

@ -11,13 +11,3 @@ providers:
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: true
- name: 'default-alert-provider'
orgId: 1
folder: 'default-alerts'
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/alerts
foldersFromFilesStructure: true

View File

@ -3,6 +3,7 @@ apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
uid: prometheus
access: proxy
orgId: 1
url: http://prometheus:9090
@ -10,6 +11,7 @@ datasources:
editable: false
- name: Loki
type: loki
uid: loki
access: proxy
orgId: 1
url: http://loki:3100