Compare commits
No commits in common. "main" and "main" have entirely different histories.
34
.env.sample
34
.env.sample
@ -1,9 +1,8 @@
|
||||
TYPE=monitoring-ng
|
||||
LETS_ENCRYPT_ENV=production
|
||||
COMPOSE_FILE=compose.yml
|
||||
DOMAIN=monitoring.example.com
|
||||
DOMAIN=monitoring-ng.example.com
|
||||
TIMEOUT=120
|
||||
ENABLE_BACKUPS=true
|
||||
|
||||
## Enable this secret for Promtail / Prometheus
|
||||
# SECRET_BASIC_AUTH_VERSION=v1
|
||||
@ -12,18 +11,11 @@ ENABLE_BACKUPS=true
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml"
|
||||
# LOKI_PUSH_URL=https://loki.monitoring.example.org/loki/api/v1/push
|
||||
|
||||
## Expose node and cadvisor ports instead of traefik
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.expose-ports.yml"
|
||||
|
||||
# Monitoring Server
|
||||
#
|
||||
## Prometheus
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.prometheus.yml"
|
||||
# PROMETHEUS_RETENTION_TIME=1y
|
||||
#
|
||||
## Prometheus Pushgateway
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.pushgateway.yml"
|
||||
#
|
||||
## Loki
|
||||
# Loki Server
|
||||
#
|
||||
@ -44,18 +36,16 @@ ENABLE_BACKUPS=true
|
||||
## Grafana
|
||||
#
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml"
|
||||
# GF_SERVER_ROOT_URL=https://monitoring.example.com
|
||||
# GF_SERVER_ROOT_URL=https://${DOMAIN}
|
||||
# SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
|
||||
## Separate domain for Grafana
|
||||
#GRAFANA_DOMAIN=grafana.example.com
|
||||
#
|
||||
## Single-Sign-On with OIDC
|
||||
# OIDC_ENABLED=1
|
||||
# SECRET_GRAFANA_OIDC_CLIENT_SECRET_VERSION=v1
|
||||
# OIDC_CLIENT_ID=grafana
|
||||
# OIDC_AUTH_URL="https://authentik.example.com/application/o/authorize/"
|
||||
# OIDC_API_URL="https://authentik.example.com/application/o/userinfo/"
|
||||
# OIDC_TOKEN_URL="https://authentik.example.com/application/o/token/"
|
||||
# OIDC_AUTH_URL="https://sso.example.com/auth/realms/autonomic/protocol/openid-connect/auth"
|
||||
# OIDC_API_URL="https://sso.example.com/auth/realms/autonomic/protocol/openid-connect/userinfo"
|
||||
# OIDC_TOKEN_URL="https://sso.example.com/auth/realms/autonomic/protocol/openid-connect/token"
|
||||
#
|
||||
## Additional grafana settings (unlikely to require editing)
|
||||
# GF_SECURITY_ALLOW_EMBEDDING=1
|
||||
@ -69,17 +59,3 @@ ENABLE_BACKUPS=true
|
||||
# GF_SMTP_SKIP_VERIFY=false
|
||||
# SECRET_GRAFANA_SMTP_PASSWORD_VERSION=v1
|
||||
#
|
||||
|
||||
## Grafana Matrix Contact Point (optional)
|
||||
#COMPOSE_FILE="$COMPOSE_FILE:compose.matrix-alertmanager-receiver.yml"
|
||||
#SECRET_MATRIX_ACCESS_TOKEN_VERSION=v1
|
||||
#GF_MATRIX_USER_ID="<user-id>"
|
||||
#GF_MATRIX_ROOM_ID="<room-id>"
|
||||
#GF_MATRIX_HOMESERVER_URL="<homeserver-url>"
|
||||
|
||||
# Alerts
|
||||
#ALERT_BACKUP_FAILED_ENABLED=true
|
||||
#ALERT_BACKUP_MISSING_ENABLED=true
|
||||
#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true
|
||||
#ALERT_NODE_DISK_SPACE_ENABLED=true
|
||||
#ALERT_NODE_MEMORY_USAGE_ENABLED=true
|
||||
|
55
README.md
55
README.md
@ -36,7 +36,7 @@ Where gathering.org is the node you want to gather metrics from.
|
||||
SECRET_USERSFILE_VERSION=v1
|
||||
```
|
||||
- Generate userslist with an htpasswd-hashed password
|
||||
`abra app secret insert traefik.gathering.org usersfile v1 'admin:<hashed-secret>'`
|
||||
`abra app secret insert traefik.gathering.org userslist v1 'admin:<hashed-secret>'`
|
||||
make sure there is no whitespace in between `admin:<hashed-secret>`, it seems to break stuff...
|
||||
- `abra app deploy -f traefik`
|
||||
1. `abra app new monitoring-ng`
|
||||
@ -47,13 +47,6 @@ Where gathering.org is the node you want to gather metrics from.
|
||||
- cadvisor.monitoring.gathering.org
|
||||
- node.monitoring.gathering.org
|
||||
|
||||
### Expose node and cadvisor via ports instead of traefik
|
||||
|
||||
In case you have no traefik running on the machine, you can expose the ports directly by uncommenting the following line:
|
||||
```
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.expose-ports.yml"
|
||||
```
|
||||
|
||||
## Setup Metrics Browser
|
||||
|
||||
|
||||
@ -61,7 +54,7 @@ In case you have no traefik running on the machine, you can expose the ports dir
|
||||
- monitoring.example.org
|
||||
- prometheus.monitoring.example.org
|
||||
- loki.monitoring.example.org
|
||||
2. Setup monitoring stack
|
||||
1. Setup monitoring stack
|
||||
- `abra app new monitoring-ng`
|
||||
- `abra app config monitoring.example.org`
|
||||
Uncomment all the stuff
|
||||
@ -69,7 +62,7 @@ In case you have no traefik running on the machine, you can expose the ports dir
|
||||
this needs the plaintext traefik basic-auth secret, not the hashed one!
|
||||
- `abra app secret ls monitoring.example.org`
|
||||
- `abra app deploy monitoring.example.org`
|
||||
3. Add scrape config to prometheus
|
||||
1. add scrape config to prometheus
|
||||
- `abra app cmd monitoring.example.org prometheus gathering.org`
|
||||
- or manually
|
||||
```
|
||||
@ -92,6 +85,7 @@ In case you have no traefik running on the machine, you can expose the ports dir
|
||||
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
|
||||
| Node Exporter | traefik basic-auth | node.monitoring.example.org |
|
||||
|
||||
|
||||
### Logging from a docker host to loki server without anything else
|
||||
|
||||
```
|
||||
@ -107,18 +101,8 @@ $ echo '{
|
||||
$ systemctl restart docker.service
|
||||
```
|
||||
|
||||
## Setup Push Gateway
|
||||
|
||||
1. Enable it in the env file by uncommenting the following lines:
|
||||
```
|
||||
## Prometheus Pushgateway
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.pushgateway.yml"
|
||||
```
|
||||
2. `abra app deploy monitoring.example.org`
|
||||
|
||||
This will expose the pushgateway at `https://pushgateway.${DOMAIN}`.
|
||||
It is secured behind the same basic auth as the other services.
|
||||
After that you need to add `pushgateway.${DOMAIN}` to the scrape config.
|
||||
|
||||
## Post-setup guide
|
||||
|
||||
@ -135,34 +119,3 @@ After that you need to add the `pushgateway.${DOMAIN}` to the scare config.
|
||||
---
|
||||
|
||||
THX to the previous work of @decentral1se @knooflok @3wc @cellarspoon @mirsal
|
||||
|
||||
## Adding Matrix as Alert Contact point
|
||||
|
||||
1. Enable the [matrix-alertmanager-receiver](https://github.com/metio/matrix-alertmanager-receiver/):
|
||||
```
|
||||
COMPOSE_FILE="$COMPOSE_FILE:compose.matrix-alertmanager-receiver.yml"
|
||||
```
|
||||
|
||||
2. Insert the matrix access token secret:
|
||||
```
|
||||
abra app secret insert monitoring.example.org matrix_access_token v1 '<access-token>'
|
||||
```
|
||||
|
||||
3. Set required configurations:
|
||||
```
|
||||
GF_MATRIX_USER_ID=
|
||||
GF_MATRIX_ROOM_ID=
|
||||
GF_MATRIX_HOMESERVER_URL=
|
||||
```
|
||||
4. Configure Alertmanager webhook and set the url to `http://matrix-alertmanager-receiver:12345/alerts/<room-id>`
|
||||
|
||||
## Alerts
|
||||
|
||||
It is possible to enable the following alerts, by setting the corresponding env variable to `true`:
|
||||
- backupbot failed: `ALERT_BACKUP_FAILED_ENABLED`
|
||||
- backupbot missing: `ALERT_BACKUP_MISSING_ENABLED`
|
||||
- backupbot not successful: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED`
|
||||
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
|
||||
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`
|
||||
|
||||
|
||||
|
6
abra.sh
6
abra.sh
@ -1,17 +1,13 @@
|
||||
export ENTRYPOINT_VERSION=v1
|
||||
export GRAFANA_DATASOURCES_YML_VERSION=v1
|
||||
export GRAFANA_DASHBOARDS_YML_VERSION=v2
|
||||
export GRAFANA_DASHBOARDS_YML_VERSION=v1
|
||||
export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v2
|
||||
export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v2
|
||||
export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v2
|
||||
export GRAFANA_BACKUP_DASHBOARD_JSON_VERSION=v1
|
||||
export GRAFANA_ALERTS_JSON_VERSION=v3
|
||||
export GRAFANA_CUSTOM_INI_VERSION=v4
|
||||
export PROMTAIL_YML_VERSION=v3
|
||||
export LOKI_YML_VERSION=v2
|
||||
export PROMETHEUS_YML_VERSION=v2
|
||||
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
|
||||
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
|
||||
|
||||
# creates a default prometheus scrape config for a given node
|
||||
add_node(){
|
||||
|
@ -1,74 +0,0 @@
|
||||
# configuration of the HTTP server
|
||||
http:
|
||||
## address: 127.0.0.1 # bind address for this service. Can be left unspecified to bind on all interfaces
|
||||
port: 12345 # port used by this service
|
||||
alerts-path-prefix: /alerts # URL path for the webhook receiver called by an Alertmanager. Defaults to /alerts
|
||||
metrics-path: /metrics # URL path to collect metrics. Defaults to /metrics
|
||||
metrics-enabled: true # Whether to enable metrics or not. Defaults to false
|
||||
# basic-username: alertmanager # Username for basic authentication. Defaults to alertmanager
|
||||
# basic-password: secret # If set, the alerts endpoint expects basic-auth credentials with the configured username and password
|
||||
|
||||
# configuration for the Matrix connection
|
||||
matrix:
|
||||
homeserver-url: "{{ env "GF_MATRIX_HOMESERVER_URL" }}"
|
||||
user-id: "{{ env "GF_MATRIX_USER_ID" }}"
|
||||
access-token: "{{ secret "matrix_access_token" }}"
|
||||
room-mapping:
|
||||
matrixroom: "{{ env "GF_MATRIX_ROOM_ID" }}"
|
||||
|
||||
templating:
|
||||
# mapping of ExternalURL values
|
||||
external-url-mapping:
|
||||
# key is the original value taken from the Alertmanager payload
|
||||
# value is the mapped value which will be available as '.ExternalURL' in templates
|
||||
"http://alertmanager:9093": https://alertmanager.example.com
|
||||
# mapping of GeneratorURL values
|
||||
generator-url-mapping:
|
||||
# key is the original value taken from the Alertmanager payload
|
||||
# value is the mapped value which will be available as '.GeneratorURL' in templates
|
||||
"http://prometheus:8080": https://prometheus.example.com
|
||||
|
||||
# computation of arbitrary values based on matching alert annotations, labels, or status
|
||||
# values will be evaluated top to bottom, last entry wins
|
||||
computed-values:
|
||||
- values: # always set 'color' to 'yellow'
|
||||
color: yellow
|
||||
- values: # set 'color' to 'orange' when alert label 'severity' is 'warning'
|
||||
color: orange
|
||||
when-matching-labels:
|
||||
severity: warning
|
||||
- values: # set 'color' to 'red' when alert label 'severity' is 'critical'
|
||||
color: red
|
||||
when-matching-labels:
|
||||
severity: critical
|
||||
- values: # set 'color' to 'green' when alert status is 'resolved'
|
||||
color: green
|
||||
when-matching-status: resolved
|
||||
|
||||
# template for alerts in status 'firing'
|
||||
firing-template: '{{`
|
||||
<p>
|
||||
<strong><font color="{{ .ComputedValues.color }}">{{ .Alert.Status | ToUpper }}</font></strong>
|
||||
{{ if .Alert.Labels.name }}
|
||||
{{ .Alert.Labels.name }}
|
||||
{{ else if .Alert.Labels.alertname }}
|
||||
{{ .Alert.Labels.alertname }}
|
||||
{{ end }}
|
||||
>>
|
||||
{{ if .Alert.Labels.severity }}
|
||||
{{ .Alert.Labels.severity | ToUpper }}:
|
||||
{{ end }}
|
||||
{{ if .Alert.Annotations.description }}
|
||||
{{ .Alert.Annotations.description }}
|
||||
{{ else if .Alert.Annotations.summary }}
|
||||
{{ .Alert.Annotations.summary }}
|
||||
{{ end }}
|
||||
>>
|
||||
{{ if .Alert.Annotations.runbook }}
|
||||
<a href="{{ .Alert.Annotations.runbook }}">Runbook</a> |
|
||||
{{ end }}
|
||||
{{ if .Alert.Annotations.dashboard }}
|
||||
<a href="{{ .Alert.Annotations.dashboard }}">Dashboard</a> |
|
||||
{{ end }}
|
||||
<a href="{{ .SilenceURL }}">Silence</a>
|
||||
</p>`}}'
|
@ -1,13 +0,0 @@
|
||||
---
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
app:
|
||||
ports:
|
||||
- "9100:9100"
|
||||
deploy:
|
||||
|
||||
cadvisor:
|
||||
ports:
|
||||
- "9101:8080"
|
||||
deploy:
|
@ -2,7 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
grafana:
|
||||
image: grafana/grafana:10.4.14
|
||||
image: grafana/grafana:10.2.3
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana:rw
|
||||
secrets:
|
||||
@ -22,10 +22,6 @@ services:
|
||||
target: /var/lib/grafana/dashboards/docker-swarm-stacks.json
|
||||
- source: grafana_traefik_dashboard_json
|
||||
target: /var/lib/grafana/dashboards/traefik.json
|
||||
- source: grafana_backup_dashboard_json
|
||||
target: /var/lib/grafana/dashboards/backup.json
|
||||
- source: grafana_alerts_json
|
||||
target: /var/lib/grafana/alerts/alerts.json
|
||||
networks:
|
||||
- proxy
|
||||
- internal
|
||||
@ -48,9 +44,8 @@ services:
|
||||
deploy:
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN:-$DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
@ -67,27 +62,20 @@ configs:
|
||||
name: ${STACK_NAME}_grafana_custom_ini_${GRAFANA_CUSTOM_INI_VERSION}
|
||||
file: grafana_custom.ini
|
||||
grafana_datasources_yml:
|
||||
name: ${STACK_NAME}_g_datasources_yml_${GRAFANA_DATASOURCES_YML_VERSION}
|
||||
name: ${STACK_NAME}_grafana_datasources_yml_${GRAFANA_DATASOURCES_YML_VERSION}
|
||||
file: grafana-datasources.yml
|
||||
grafana_dashboards_yml:
|
||||
name: ${STACK_NAME}_g_dashboards_yml_${GRAFANA_DASHBOARDS_YML_VERSION}
|
||||
name: ${STACK_NAME}_grafana_dashboards_yml_${GRAFANA_DASHBOARDS_YML_VERSION}
|
||||
file: grafana-dashboards.yml
|
||||
grafana_swarm_dashboard_json:
|
||||
name: ${STACK_NAME}_g_swarm_dashboard_json_${GRAFANA_SWARM_DASHBOARD_JSON_VERSION}
|
||||
name: ${STACK_NAME}_grafana_swarm_dashboard_json_${GRAFANA_SWARM_DASHBOARD_JSON_VERSION}
|
||||
file: grafana-swarm-dashboard.json
|
||||
grafana_stacks_dashboard_json:
|
||||
name: ${STACK_NAME}_g_stacks_dashboard_json_${GRAFANA_STACKS_DASHBOARD_JSON_VERSION}
|
||||
name: ${STACK_NAME}_grafana_stacks_dashboard_json_${GRAFANA_STACKS_DASHBOARD_JSON_VERSION}
|
||||
file: grafana-stacks-dashboard.json
|
||||
grafana_traefik_dashboard_json:
|
||||
name: ${STACK_NAME}_g_traefik_dashboard_json_${GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION}
|
||||
name: ${STACK_NAME}_grafana_traefik_dashboard_json_${GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION}
|
||||
file: grafana-traefik-dashboard.json
|
||||
grafana_backup_dashboard_json:
|
||||
name: ${STACK_NAME}_g_backup_dashboard_json_${GRAFANA_BACKUP_DASHBOARD_JSON_VERSION}
|
||||
file: grafana-backup-dashboard.json
|
||||
grafana_alerts_json:
|
||||
template_driver: golang
|
||||
name: ${STACK_NAME}_g_alerts_json_${GRAFANA_ALERTS_JSON_VERSION}
|
||||
file: grafana-alerts.json.tmpl
|
||||
|
||||
volumes:
|
||||
grafana-data:
|
||||
|
@ -2,7 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
loki:
|
||||
image: grafana/loki:2.9.11
|
||||
image: grafana/loki:2.9.3
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
networks:
|
||||
- proxy
|
||||
@ -27,7 +27,6 @@ services:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-loki.loadbalancer.server.port=3100"
|
||||
- "traefik.http.routers.${STACK_NAME}-loki.rule=Host(`loki.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-loki.entrypoints=web-secure"
|
||||
@ -49,4 +48,4 @@ volumes:
|
||||
# secrets:
|
||||
# loki_aws_secret_access_key:
|
||||
# external: true
|
||||
# name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}
|
||||
# name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}
|
@ -1,28 +0,0 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
matrix-alertmanager-receiver:
|
||||
image: metio/matrix-alertmanager-receiver:2025.2.9
|
||||
secrets:
|
||||
- matrix_access_token
|
||||
configs:
|
||||
- source: matrix-alertmanager-receiver-config
|
||||
target: /etc/matrix-alertmanager-receiver/config.yml
|
||||
networks:
|
||||
- internal
|
||||
environment:
|
||||
- GF_MATRIX_USER_ID
|
||||
- GF_MATRIX_ROOM_ID
|
||||
- GF_MATRIX_HOMESERVER_URL
|
||||
command: "--config-path=/etc/matrix-alertmanager-receiver/config.yml"
|
||||
|
||||
configs:
|
||||
matrix-alertmanager-receiver-config:
|
||||
template_driver: golang
|
||||
name: ${STACK_NAME}_mar_config_${MATRIX_ALERTMANAGER_CONFIG_VERSION}
|
||||
file: alertmanager-matrix-config.yml.tmpl
|
||||
|
||||
secrets:
|
||||
matrix_access_token:
|
||||
external: true
|
||||
name: ${STACK_NAME}_matrix_access_token_${SECRET_MATRIX_ACCESS_TOKEN_VERSION}
|
@ -2,7 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.55.1
|
||||
image: prom/prometheus:v2.48.1
|
||||
secrets:
|
||||
- basic_auth
|
||||
volumes:
|
||||
@ -24,12 +24,12 @@ services:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`prometheus.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.middlewares=basicauth@file"
|
||||
|
||||
configs:
|
||||
prometheus_yml:
|
||||
|
@ -2,7 +2,7 @@ version: "3.8"
|
||||
|
||||
services:
|
||||
promtail:
|
||||
image: grafana/promtail:2.9.11
|
||||
image: grafana/promtail:2.9.3
|
||||
volumes:
|
||||
- /var/log:/var/log:ro
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
|
@ -1,26 +0,0 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
pushgateway:
|
||||
image: prom/pushgateway:v1.10.0
|
||||
command:
|
||||
- '--web.listen-address=:9191'
|
||||
- '--push.disable-consistency-check'
|
||||
- '--persistence.interval=5m'
|
||||
ports:
|
||||
- 9191:9191
|
||||
networks:
|
||||
- internal
|
||||
- proxy
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-pushgateway.loadbalancer.server.port=9191"
|
||||
- "traefik.http.routers.${STACK_NAME}-pushgateway.rule=Host(`pushgateway.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-pushgateway.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-pushgateway.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-pushgateway.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-pushgateway.middlewares=basicauth@file"
|
@ -3,7 +3,7 @@ version: "3.8"
|
||||
|
||||
services:
|
||||
app:
|
||||
image: prom/node-exporter:v1.8.1
|
||||
image: prom/node-exporter:v1.7.0
|
||||
user: root
|
||||
environment:
|
||||
- NODE_ID={{.Node.ID}}
|
||||
@ -30,20 +30,18 @@ services:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "backupbot.backup=${ENABLE_BACKUPS:-true}"
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
|
||||
- "coop-cloud.${STACK_NAME}.version=1.6.0+v1.8.1"
|
||||
- "coop-cloud.${STACK_NAME}.version=1.0.0+v1.7.0"
|
||||
- "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT:-120}"
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:v0.49.2
|
||||
image: gcr.io/cadvisor/cadvisor:v0.47.2
|
||||
command:
|
||||
- "-logtostderr"
|
||||
- "--enable_metrics=cpu,cpuLoad,disk,diskIO,process,memory,network"
|
||||
@ -64,7 +62,6 @@ services:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
|
||||
|
@ -1,315 +0,0 @@
|
||||
{
|
||||
"apiVersion": 1,
|
||||
"groups": [
|
||||
{
|
||||
"orgId": 1,
|
||||
"name": "backupbot",
|
||||
"folder": "node",
|
||||
"interval": "1m",
|
||||
"rules": [
|
||||
{{ if eq (env "ALERT_BACKUP_FAILED_ENABLED") "true" }}
|
||||
{
|
||||
"uid": "de8e5xxup7t34a",
|
||||
"title": "Backup Failed",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||
"datasourceUid": "PBFA97CFB590B2093",
|
||||
"model": {
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "backup",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": { "params": [0], "type": "lt" },
|
||||
"operator": { "type": "and" },
|
||||
"query": { "params": ["C"] },
|
||||
"reducer": { "params": [], "type": "last" },
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": { "type": "__expr__", "uid": "__expr__" },
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"noDataState": "NoData",
|
||||
"execErrState": "Error",
|
||||
"for": "1m",
|
||||
"isPaused": false
|
||||
},
|
||||
{{ end }}
|
||||
{{ if eq (env "ALERT_BACKUP_MISSING_ENABLED") "true" }}
|
||||
{
|
||||
"uid": "ce8e65uddcwe8d",
|
||||
"title": "Backup Missing",
|
||||
"condition": "B",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||
"datasourceUid": "PBFA97CFB590B2093",
|
||||
"model": {
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "rate(backup[24h])",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": { "params": [0, 0], "type": "within_range" },
|
||||
"operator": { "type": "and" },
|
||||
"query": { "params": ["C"] },
|
||||
"reducer": { "params": [], "type": "last" },
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": { "type": "__expr__", "uid": "__expr__" },
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "B",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"noDataState": "NoData",
|
||||
"execErrState": "Error",
|
||||
"for": "5m",
|
||||
"isPaused": false
|
||||
},
|
||||
{{ end }}
|
||||
{{ if eq (env "ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED") "true" }}
|
||||
{
|
||||
"uid": "de8e6bc92a8lcc",
|
||||
"title": "Backup Not Successfull",
|
||||
"condition": "B",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"relativeTimeRange": {
|
||||
"from": 60,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "PBFA97CFB590B2093",
|
||||
"model": {
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "backup",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"relativeTimeRange": {
|
||||
"from": 60,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"C"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "B",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"noDataState": "NoData",
|
||||
"execErrState": "Error",
|
||||
"for": "20m",
|
||||
"annotations": {
|
||||
"summary": "Backup did not finish within 20 minutes"
|
||||
},
|
||||
"labels": {},
|
||||
"isPaused": false
|
||||
}
|
||||
{{ end }}
|
||||
]
|
||||
},
|
||||
{
|
||||
"orgId": 1,
|
||||
"name": "node",
|
||||
"folder": "node",
|
||||
"interval": "5m",
|
||||
"rules": [
|
||||
{{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
|
||||
{
|
||||
"uid": "bds8bhxu97pxca",
|
||||
"title": "Node Disk Space",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||
"datasourceUid": "PBFA97CFB590B2093",
|
||||
"model": {
|
||||
"editorMode": "code",
|
||||
"expr": "(node_filesystem_free_bytes{fstype=\"ext4\",mountpoint=~\"(/$)|(/media.*)\"} / node_filesystem_size_bytes{fstype=\"ext4\",mountpoint=~\"(/$)|(/media.*)\"}) * 100",
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": { "params": [10], "type": "lt" },
|
||||
"operator": { "type": "and" },
|
||||
"query": { "params": ["C"] },
|
||||
"reducer": { "params": [], "type": "last" },
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": { "type": "__expr__", "uid": "__expr__" },
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"noDataState": "NoData",
|
||||
"execErrState": "Error",
|
||||
"for": "5m",
|
||||
"annotations": {},
|
||||
"labels": {},
|
||||
"isPaused": false
|
||||
},
|
||||
{{ end }}
|
||||
{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
|
||||
{
|
||||
"uid": "ads8cswmly96oa",
|
||||
"title": "Node Memory Usage",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||
"datasourceUid": "PBFA97CFB590B2093",
|
||||
"model": {
|
||||
"editorMode": "code",
|
||||
"expr": "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100",
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": { "params": [90], "type": "gt" },
|
||||
"operator": { "type": "and" },
|
||||
"query": { "params": ["C"] },
|
||||
"reducer": { "params": [], "type": "last" },
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": { "type": "__expr__", "uid": "__expr__" },
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"noDataState": "NoData",
|
||||
"execErrState": "Error",
|
||||
"for": "5m",
|
||||
"annotations": {},
|
||||
"labels": {},
|
||||
"isPaused": false
|
||||
}
|
||||
{{ end }}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
@ -1,228 +0,0 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": 6,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"axisSoftMax": 2,
|
||||
"axisSoftMin": -2,
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 0,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "dark-green",
|
||||
"index": 0
|
||||
},
|
||||
"1": {
|
||||
"color": "dark-yellow",
|
||||
"index": 1,
|
||||
"text": "Running"
|
||||
},
|
||||
"-1": {
|
||||
"index": 2,
|
||||
"text": "Fail"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"max": 1,
|
||||
"min": -1,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "string"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"exemplar": false,
|
||||
"expr": "backup",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "Backup Status",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "P8E80F9AEF21F6940"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 11,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 7
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"dedupStrategy": "none",
|
||||
"enableLogDetails": true,
|
||||
"prettifyLogMessage": false,
|
||||
"showCommonLabels": false,
|
||||
"showLabels": false,
|
||||
"showTime": true,
|
||||
"sortOrder": "Descending",
|
||||
"wrapLogMessage": false
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "P8E80F9AEF21F6940"
|
||||
},
|
||||
"editorMode": "builder",
|
||||
"expr": "{service_name=\"$ServiceName\"} |= ``",
|
||||
"queryType": "range",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Backupbot Logs",
|
||||
"type": "logs"
|
||||
}
|
||||
],
|
||||
"refresh": "auto",
|
||||
"schemaVersion": 39,
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "backup_marx_klasse-methode_it_app",
|
||||
"value": "backup_marx_klasse-methode_it_app"
|
||||
},
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "P8E80F9AEF21F6940"
|
||||
},
|
||||
"definition": "",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Backupbot Service",
|
||||
"multi": false,
|
||||
"name": "ServiceName",
|
||||
"options": [],
|
||||
"query": {
|
||||
"label": "service_name",
|
||||
"refId": "LokiVariableQueryEditor-VariableQuery",
|
||||
"stream": "",
|
||||
"type": 1
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "backupbot-two",
|
||||
"uid": "be8e2xeofw4xsa",
|
||||
"version": 3,
|
||||
"weekStart": ""
|
||||
}
|
@ -11,13 +11,3 @@ providers:
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
foldersFromFilesStructure: true
|
||||
- name: 'default-alert-provider'
|
||||
orgId: 1
|
||||
folder: 'default-alerts'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/alerts
|
||||
foldersFromFilesStructure: true
|
||||
|
@ -1 +0,0 @@
|
||||
Adds an optional GRAFANA_DOMAIN
|
@ -1 +0,0 @@
|
||||
Adds an optional matrix contact point for grafana
|
@ -1 +0,0 @@
|
||||
Adds option to expose ports for node and cadvisor service
|
Loading…
x
Reference in New Issue
Block a user