Compare commits

..

8 Commits

14 changed files with 171 additions and 6 deletions

View File

@ -12,6 +12,9 @@ ENABLE_BACKUPS=true
# COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml" # COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml"
# LOKI_PUSH_URL=https://loki.monitoring.example.org/loki/api/v1/push # LOKI_PUSH_URL=https://loki.monitoring.example.org/loki/api/v1/push
## Expose node and cadvisor ports instead of traefik
# COMPOSE_FILE="$COMPOSE_FILE:compose.expose-ports.yml"
# Monitoring Server # Monitoring Server
# #
## Prometheus ## Prometheus
@ -43,6 +46,8 @@ ENABLE_BACKUPS=true
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml" # COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml"
# GF_SERVER_ROOT_URL=https://monitoring.example.com # GF_SERVER_ROOT_URL=https://monitoring.example.com
# SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1 # SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
## Seperate domain for Grafana
#GRAFANA_DOMAIN=grafana.example.com
# #
## Single-Sign-On with OIDC ## Single-Sign-On with OIDC
# OIDC_ENABLED=1 # OIDC_ENABLED=1
@ -65,6 +70,13 @@ ENABLE_BACKUPS=true
# SECRET_GRAFANA_SMTP_PASSWORD_VERSION=v1 # SECRET_GRAFANA_SMTP_PASSWORD_VERSION=v1
# #
## Grafana Matrix Contact Point (optional)
#COMPOSE_FILE="$COMPOSE_FILE:compose.matrix-alertmanager-receiver.yml"
#SECRET_MATRIX_ACCESS_TOKEN_VERSION=v1
#GF_MATRIX_USER_ID="<user-id>"
#GF_MATRIX_ROOM_ID="<room-id>"
#GF_MATRIX_HOMESERVER_URL="<homeserver-url>"
# ALerts # ALerts
#ALERT_BACKUP_FAILED_ENABLED=true #ALERT_BACKUP_FAILED_ENABLED=true
#ALERT_BACKUP_MISSING_ENABLED=true #ALERT_BACKUP_MISSING_ENABLED=true

View File

@ -36,7 +36,7 @@ Where gathering.org is the node you want to gather metrics from.
SECRET_USERSFILE_VERSION=v1 SECRET_USERSFILE_VERSION=v1
``` ```
- Generate userslist with httpasswd hashed password - Generate userslist with httpasswd hashed password
`abra app secret insert traefik.gathering.org userslist v1 'admin:<hashed-secret>'` `abra app secret insert traefik.gathering.org usersfile v1 'admin:<hashed-secret>'`
make sure there is no whitespace in between `admin:<hashed-secret>`, it seems to break stuff... make sure there is no whitespace in between `admin:<hashed-secret>`, it seems to break stuff...
- `abra app deploy -f traefik` - `abra app deploy -f traefik`
1. `abra app new monitoring-ng` 1. `abra app new monitoring-ng`
@ -47,6 +47,13 @@ Where gathering.org is the node you want to gather metrics from.
- cadvisor.monitoring.gathering.org - cadvisor.monitoring.gathering.org
- node.monitoring.gathering.org - node.monitoring.gathering.org
### Expose node and cadvisor via ports instead of traefik
In case you have no traefik running on the machine, you can expose the ports directly by uncommenting the following line:
```
# COMPOSE_FILE="$COMPOSE_FILE:compose.expose-ports.yml"
```
## Setup Metrics Browser ## Setup Metrics Browser
@ -85,7 +92,6 @@ Where gathering.org is the node you want to gather metrics from.
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org | | Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
| Node Exporter | traefik basic-auth | node.monitoring.example.org | | Node Exporter | traefik basic-auth | node.monitoring.example.org |
### Logging from a docker host to loki server without anything else ### Logging from a docker host to loki server without anything else
``` ```
@ -130,6 +136,26 @@ After that you need to add the `pushgateway.${DOMAIN}` to the scare config.
THX to the previous work of @decentral1se @knooflok @3wc @cellarspoon @mirsal THX to the previous work of @decentral1se @knooflok @3wc @cellarspoon @mirsal
## Adding Matrix as Alert Contact point
1. Enable the [matrix-alertmanager-receiver](https://github.com/metio/matrix-alertmanager-receiver/):
```
COMPOSE_FILE="$COMPOSE_FILE:compose.matrix-alertmanager-receiver.yml"
```
2. Insert the matrix access token secret:
```
abra app secret insert monitoring.marx.klasse-methode.it matrix_access_token v1
```
3. Set required configurations:
```
GF_MATRIX_USER_ID=
GF_MATRIX_ROOM_ID=
GF_MATRIX_HOME_SERVER_URL=
```
4. Configure Alertmanager webhook and set the url to `http://matrix-alertmanager-receiver:12345/alerts/<room-id>`
## alerts ## alerts
It is possible to enable the following alerts, by setting the corresponding env variable to `true`: It is possible to enable the following alerts, by setting the corresponding env variable to `true`:
@ -138,3 +164,5 @@ It is possible to enable the following alerts, by setting the corresponding env
- backupbot not successfull: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED` - backupbot not successfull: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED`
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED` - node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED` - node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`

View File

@ -10,6 +10,8 @@ export GRAFANA_CUSTOM_INI_VERSION=v4
export PROMTAIL_YML_VERSION=v3 export PROMTAIL_YML_VERSION=v3
export LOKI_YML_VERSION=v2 export LOKI_YML_VERSION=v2
export PROMETHEUS_YML_VERSION=v2 export PROMETHEUS_YML_VERSION=v2
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
# creates a default prometheus scrape config for a given node # creates a default prometheus scrape config for a given node
add_node(){ add_node(){

View File

@ -0,0 +1,74 @@
# configuration of the HTTP server
http:
## address: 127.0.0.1 # bind address for this service. Can be left unspecified to bind on all interfaces
port: 12345 # port used by this service
alerts-path-prefix: /alerts # URL path for the webhook receiver called by an Alertmanager. Defaults to /alerts
metrics-path: /metrics # URL path to collect metrics. Defaults to /metrics
metrics-enabled: true # Whether to enable metrics or not. Defaults to false
# basic-username: alertmanager # Username for basic authentication. Defaults to alertmanager
# basic-password: secret # If set, the alerts endpoint expects basic-auth credentials with the configured username and password
# configuration for the Matrix connection
matrix:
homeserver-url: "{{ env "GF_MATRIX_HOMESERVER_URL" }}"
user-id: "{{ env "GF_MATRIX_USER_ID" }}"
access-token: "{{ secret "matrix_access_token" }}"
room-mapping:
matrixroom: "{{ env "GF_MATRIX_ROOM_ID" }}"
templating:
# mapping of ExternalURL values
external-url-mapping:
# key is the original value taken from the Alertmanager payload
# value is the mapped value which will be available as '.ExternalURL' in templates
"http://alertmanager:9093": https://alertmanager.example.com
# mapping of GeneratorURL values
generator-url-mapping:
# key is the original value taken from the Alertmanager payload
# value is the mapped value which will be available as '.GeneratorURL' in templates
"http://prometheus:8080": https://prometheus.example.com
# computation of arbitrary values based on matching alert annotations, labels, or status
# values will be evaluated top to bottom, last entry wins
computed-values:
- values: # always set 'color' to 'yellow'
color: yellow
- values: # set 'color' to 'orange' when alert label 'severity' is 'warning'
color: orange
when-matching-labels:
severity: warning
- values: # set 'color' to 'red' when alert label 'severity' is 'critical'
color: red
when-matching-labels:
severity: critical
- values: # set 'color' to 'green' when alert status is 'resolved'
color: green
when-matching-status: resolved
# template for alerts in status 'firing'
firing-template: '{{`
<p>
<strong><font color="{{ .ComputedValues.color }}">{{ .Alert.Status | ToUpper }}</font></strong>
{{ if .Alert.Labels.name }}
{{ .Alert.Labels.name }}
{{ else if .Alert.Labels.alertname }}
{{ .Alert.Labels.alertname }}
{{ end }}
>>
{{ if .Alert.Labels.severity }}
{{ .Alert.Labels.severity | ToUpper }}:
{{ end }}
{{ if .Alert.Annotations.description }}
{{ .Alert.Annotations.description }}
{{ else if .Alert.Annotations.summary }}
{{ .Alert.Annotations.summary }}
{{ end }}
>>
{{ if .Alert.Annotations.runbook }}
<a href="{{ .Alert.Annotations.runbook }}">Runbook</a> |
{{ end }}
{{ if .Alert.Annotations.dashboard }}
<a href="{{ .Alert.Annotations.dashboard }}">Dashboard</a> |
{{ end }}
<a href="{{ .SilenceURL }}">Silence</a>
</p>`}}'

13
compose.expose-ports.yml Normal file
View File

@ -0,0 +1,13 @@
---
version: "3.8"
services:
app:
ports:
- "9100:9100"
deploy:
cadvisor:
ports:
- "9101:8080"
deploy:

View File

@ -48,8 +48,9 @@ services:
deploy: deploy:
labels: labels:
- "traefik.enable=true" - "traefik.enable=true"
- "traefik.docker.network=proxy"
- "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000" - "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${DOMAIN}`)" - "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN:-$DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure" - "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-grafana.tls=true" - "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
- "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}" - "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"

View File

@ -27,6 +27,7 @@ services:
condition: on-failure condition: on-failure
labels: labels:
- "traefik.enable=true" - "traefik.enable=true"
- "traefik.docker.network=proxy"
- "traefik.http.services.${STACK_NAME}-loki.loadbalancer.server.port=3100" - "traefik.http.services.${STACK_NAME}-loki.loadbalancer.server.port=3100"
- "traefik.http.routers.${STACK_NAME}-loki.rule=Host(`loki.${DOMAIN}`)" - "traefik.http.routers.${STACK_NAME}-loki.rule=Host(`loki.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-loki.entrypoints=web-secure" - "traefik.http.routers.${STACK_NAME}-loki.entrypoints=web-secure"
@ -48,4 +49,4 @@ volumes:
# secrets: # secrets:
# loki_aws_secret_access_key: # loki_aws_secret_access_key:
# external: true # external: true
# name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION} # name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}

View File

@ -0,0 +1,28 @@
version: '3.8'
services:
matrix-alertmanager-receiver:
image: metio/matrix-alertmanager-receiver:2025.2.9
secrets:
- matrix_access_token
configs:
- source: matrix-alertmanager-receiver-config
target: /etc/matrix-alertmanager-receiver/config.yml
networks:
- internal
environment:
- GF_MATRIX_USER_ID
- GF_MATRIX_ROOM_ID
- GF_MATRIX_HOMESERVER_URL
command: "--config-path=/etc/matrix-alertmanager-receiver/config.yml"
configs:
matrix-alertmanager-receiver-config:
template_driver: golang
name: ${STACK_NAME}_mar_config_${MATRIX_ALERTMANAGER_CONFIG_VERSION}
file: alertmanager-matrix-config.yml.tmpl
secrets:
matrix_access_token:
external: true
name: ${STACK_NAME}_matrix_access_token_${SECRET_MATRIX_ACCESS_TOKEN_VERSION}

View File

@ -24,12 +24,12 @@ services:
condition: on-failure condition: on-failure
labels: labels:
- "traefik.enable=true" - "traefik.enable=true"
- "traefik.docker.network=proxy"
- "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090" - "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090"
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`prometheus.${DOMAIN}`)" - "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`prometheus.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure" - "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-prometheus.tls=true" - "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
- "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}" - "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-prometheus.middlewares=basicauth@file"
configs: configs:
prometheus_yml: prometheus_yml:

View File

@ -17,6 +17,7 @@ services:
condition: on-failure condition: on-failure
labels: labels:
- "traefik.enable=true" - "traefik.enable=true"
- "traefik.docker.network=proxy"
- "traefik.http.services.${STACK_NAME}-pushgateway.loadbalancer.server.port=9191" - "traefik.http.services.${STACK_NAME}-pushgateway.loadbalancer.server.port=9191"
- "traefik.http.routers.${STACK_NAME}-pushgateway.rule=Host(`pushgateway.${DOMAIN}`)" - "traefik.http.routers.${STACK_NAME}-pushgateway.rule=Host(`pushgateway.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-pushgateway.entrypoints=web-secure" - "traefik.http.routers.${STACK_NAME}-pushgateway.entrypoints=web-secure"

View File

@ -32,13 +32,14 @@ services:
labels: labels:
- "backupbot.backup=${ENABLE_BACKUPS:-true}" - "backupbot.backup=${ENABLE_BACKUPS:-true}"
- "traefik.enable=true" - "traefik.enable=true"
- "traefik.docker.network=proxy"
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100" - "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)" - "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure" - "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-node.tls=true" - "traefik.http.routers.${STACK_NAME}-node.tls=true"
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}" - "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file" - "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
- "coop-cloud.${STACK_NAME}.version=1.3.0+v1.8.1" - "coop-cloud.${STACK_NAME}.version=1.6.0+v1.8.1"
- "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT:-120}" - "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT:-120}"
cadvisor: cadvisor:
@ -63,6 +64,7 @@ services:
condition: on-failure condition: on-failure
labels: labels:
- "traefik.enable=true" - "traefik.enable=true"
- "traefik.docker.network=proxy"
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080" - "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)" - "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure" - "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"

1
release/1.4.0+v1.8.1 Normal file
View File

@ -0,0 +1 @@
Adds an optional GRAFANA_DOMAIN

1
release/1.5.0+v1.8.1 Normal file
View File

@ -0,0 +1 @@
Adds an optional matrix contact point for grafana

1
release/1.6.0+v1.8.1 Normal file
View File

@ -0,0 +1 @@
Adds option to expose ports for node and cadvisor service