Compare commits
No commits in common. "main" and "backup-dashboard" have entirely different histories.
main
...
backup-das
12
.env.sample
12
.env.sample
@ -12,9 +12,6 @@ ENABLE_BACKUPS=true
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml"
|
||||
# LOKI_PUSH_URL=https://loki.monitoring.example.org/loki/api/v1/push
|
||||
|
||||
## Expose node and cadvisor ports instead of traefik
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.expose-ports.yml"
|
||||
|
||||
# Monitoring Server
|
||||
#
|
||||
## Prometheus
|
||||
@ -46,8 +43,6 @@ ENABLE_BACKUPS=true
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml"
|
||||
# GF_SERVER_ROOT_URL=https://monitoring.example.com
|
||||
# SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
|
||||
## Separate domain for Grafana
|
||||
#GRAFANA_DOMAIN=grafana.example.com
|
||||
#
|
||||
## Single-Sign-On with OIDC
|
||||
# OIDC_ENABLED=1
|
||||
@ -70,13 +65,6 @@ ENABLE_BACKUPS=true
|
||||
# SECRET_GRAFANA_SMTP_PASSWORD_VERSION=v1
|
||||
#
|
||||
|
||||
## Grafana Matrix Contact Point (optional)
|
||||
#COMPOSE_FILE="$COMPOSE_FILE:compose.matrix-alertmanager-receiver.yml"
|
||||
#SECRET_MATRIX_ACCESS_TOKEN_VERSION=v1
|
||||
#GF_MATRIX_USER_ID="<user-id>"
|
||||
#GF_MATRIX_ROOM_ID="<room-id>"
|
||||
#GF_MATRIX_HOMESERVER_URL="<homeserver-url>"
|
||||
|
||||
# Alerts
|
||||
#ALERT_BACKUP_FAILED_ENABLED=true
|
||||
#ALERT_BACKUP_MISSING_ENABLED=true
|
||||
|
32
README.md
32
README.md
@ -36,7 +36,7 @@ Where gathering.org is the node you want to gather metrics from.
|
||||
SECRET_USERSFILE_VERSION=v1
|
||||
```
|
||||
- Generate userslist with htpasswd hashed password
|
||||
`abra app secret insert traefik.gathering.org usersfile v1 'admin:<hashed-secret>'`
|
||||
`abra app secret insert traefik.gathering.org userslist v1 'admin:<hashed-secret>'`
|
||||
make sure there is no whitespace in between `admin:<hashed-secret>`, it seems to break stuff...
|
||||
- `abra app deploy -f traefik`
|
||||
1. `abra app new monitoring-ng`
|
||||
@ -47,13 +47,6 @@ Where gathering.org is the node you want to gather metrics from.
|
||||
- cadvisor.monitoring.gathering.org
|
||||
- node.monitoring.gathering.org
|
||||
|
||||
### Expose node and cadvisor via ports instead of traefik
|
||||
|
||||
In case you have no traefik running on the machine, you can expose the ports directly by uncommenting the following line:
|
||||
```
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.expose-ports.yml"
|
||||
```
|
||||
|
||||
## Setup Metrics Browser
|
||||
|
||||
|
||||
@ -92,6 +85,7 @@ In case you have no traefik running on the machine, you can expose the ports dir
|
||||
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
|
||||
| Node Exporter | traefik basic-auth | node.monitoring.example.org |
|
||||
|
||||
|
||||
### Logging from a docker host to loki server without anything else
|
||||
|
||||
```
|
||||
@ -136,26 +130,6 @@ After that you need to add the `pushgateway.${DOMAIN}` to the scrape config.
|
||||
|
||||
THX to the previous work of @decentral1se @knooflok @3wc @cellarspoon @mirsal
|
||||
|
||||
## Adding Matrix as Alert Contact point
|
||||
|
||||
1. Enable the [matrix-alertmanager-receiver](https://github.com/metio/matrix-alertmanager-receiver/):
|
||||
```
|
||||
COMPOSE_FILE="$COMPOSE_FILE:compose.matrix-alertmanager-receiver.yml"
|
||||
```
|
||||
|
||||
2. Insert the matrix access token secret:
|
||||
```
|
||||
abra app secret insert monitoring.marx.klasse-methode.it matrix_access_token v1
|
||||
```
|
||||
|
||||
3. Set required configurations:
|
||||
```
|
||||
GF_MATRIX_USER_ID=
|
||||
GF_MATRIX_ROOM_ID=
|
||||
GF_MATRIX_HOMESERVER_URL=
|
||||
```
|
||||
4. Configure Alertmanager webhook and set the url to `http://matrix-alertmanager-receiver:12345/alerts/<room-id>`
|
||||
|
||||
## alerts
|
||||
|
||||
It is possible to enable the following alerts, by setting the corresponding env variable to `true`:
|
||||
@ -164,5 +138,3 @@ It is possible to enable the following alerts, by setting the corresponding env
|
||||
- backupbot not successful: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED`
|
||||
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
|
||||
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`
|
||||
|
||||
|
||||
|
2
abra.sh
2
abra.sh
@ -10,8 +10,6 @@ export GRAFANA_CUSTOM_INI_VERSION=v4
|
||||
export PROMTAIL_YML_VERSION=v3
|
||||
export LOKI_YML_VERSION=v2
|
||||
export PROMETHEUS_YML_VERSION=v2
|
||||
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
|
||||
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
|
||||
|
||||
# creates a default prometheus scrape config for a given node
|
||||
add_node(){
|
||||
|
@ -1,74 +0,0 @@
|
||||
# configuration of the HTTP server
|
||||
http:
|
||||
## address: 127.0.0.1 # bind address for this service. Can be left unspecified to bind on all interfaces
|
||||
port: 12345 # port used by this service
|
||||
alerts-path-prefix: /alerts # URL path for the webhook receiver called by an Alertmanager. Defaults to /alerts
|
||||
metrics-path: /metrics # URL path to collect metrics. Defaults to /metrics
|
||||
metrics-enabled: true # Whether to enable metrics or not. Defaults to false
|
||||
# basic-username: alertmanager # Username for basic authentication. Defaults to alertmanager
|
||||
# basic-password: secret # If set, the alerts endpoint expects basic-auth credentials with the configured username and password
|
||||
|
||||
# configuration for the Matrix connection
|
||||
matrix:
|
||||
homeserver-url: "{{ env "GF_MATRIX_HOMESERVER_URL" }}"
|
||||
user-id: "{{ env "GF_MATRIX_USER_ID" }}"
|
||||
access-token: "{{ secret "matrix_access_token" }}"
|
||||
room-mapping:
|
||||
matrixroom: "{{ env "GF_MATRIX_ROOM_ID" }}"
|
||||
|
||||
templating:
|
||||
# mapping of ExternalURL values
|
||||
external-url-mapping:
|
||||
# key is the original value taken from the Alertmanager payload
|
||||
# value is the mapped value which will be available as '.ExternalURL' in templates
|
||||
"http://alertmanager:9093": https://alertmanager.example.com
|
||||
# mapping of GeneratorURL values
|
||||
generator-url-mapping:
|
||||
# key is the original value taken from the Alertmanager payload
|
||||
# value is the mapped value which will be available as '.GeneratorURL' in templates
|
||||
"http://prometheus:8080": https://prometheus.example.com
|
||||
|
||||
# computation of arbitrary values based on matching alert annotations, labels, or status
|
||||
# values will be evaluated top to bottom, last entry wins
|
||||
computed-values:
|
||||
- values: # always set 'color' to 'yellow'
|
||||
color: yellow
|
||||
- values: # set 'color' to 'orange' when alert label 'severity' is 'warning'
|
||||
color: orange
|
||||
when-matching-labels:
|
||||
severity: warning
|
||||
- values: # set 'color' to 'red' when alert label 'severity' is 'critical'
|
||||
color: red
|
||||
when-matching-labels:
|
||||
severity: critical
|
||||
- values: # set 'color' to 'green' when alert status is 'resolved'
|
||||
color: green
|
||||
when-matching-status: resolved
|
||||
|
||||
# template for alerts in status 'firing'
|
||||
firing-template: '{{`
|
||||
<p>
|
||||
<strong><font color="{{ .ComputedValues.color }}">{{ .Alert.Status | ToUpper }}</font></strong>
|
||||
{{ if .Alert.Labels.name }}
|
||||
{{ .Alert.Labels.name }}
|
||||
{{ else if .Alert.Labels.alertname }}
|
||||
{{ .Alert.Labels.alertname }}
|
||||
{{ end }}
|
||||
>>
|
||||
{{ if .Alert.Labels.severity }}
|
||||
{{ .Alert.Labels.severity | ToUpper }}:
|
||||
{{ end }}
|
||||
{{ if .Alert.Annotations.description }}
|
||||
{{ .Alert.Annotations.description }}
|
||||
{{ else if .Alert.Annotations.summary }}
|
||||
{{ .Alert.Annotations.summary }}
|
||||
{{ end }}
|
||||
>>
|
||||
{{ if .Alert.Annotations.runbook }}
|
||||
<a href="{{ .Alert.Annotations.runbook }}">Runbook</a> |
|
||||
{{ end }}
|
||||
{{ if .Alert.Annotations.dashboard }}
|
||||
<a href="{{ .Alert.Annotations.dashboard }}">Dashboard</a> |
|
||||
{{ end }}
|
||||
<a href="{{ .SilenceURL }}">Silence</a>
|
||||
</p>`}}'
|
@ -1,13 +0,0 @@
|
||||
---
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
app:
|
||||
ports:
|
||||
- "9100:9100"
|
||||
deploy:
|
||||
|
||||
cadvisor:
|
||||
ports:
|
||||
- "9101:8080"
|
||||
deploy:
|
@ -48,9 +48,8 @@ services:
|
||||
deploy:
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN:-$DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
|
@ -27,7 +27,6 @@ services:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-loki.loadbalancer.server.port=3100"
|
||||
- "traefik.http.routers.${STACK_NAME}-loki.rule=Host(`loki.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-loki.entrypoints=web-secure"
|
||||
@ -49,4 +48,4 @@ volumes:
|
||||
# secrets:
|
||||
# loki_aws_secret_access_key:
|
||||
# external: true
|
||||
# name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}
|
||||
# name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}
|
@ -1,28 +0,0 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
matrix-alertmanager-receiver:
|
||||
image: metio/matrix-alertmanager-receiver:2025.2.9
|
||||
secrets:
|
||||
- matrix_access_token
|
||||
configs:
|
||||
- source: matrix-alertmanager-receiver-config
|
||||
target: /etc/matrix-alertmanager-receiver/config.yml
|
||||
networks:
|
||||
- internal
|
||||
environment:
|
||||
- GF_MATRIX_USER_ID
|
||||
- GF_MATRIX_ROOM_ID
|
||||
- GF_MATRIX_HOMESERVER_URL
|
||||
command: "--config-path=/etc/matrix-alertmanager-receiver/config.yml"
|
||||
|
||||
configs:
|
||||
matrix-alertmanager-receiver-config:
|
||||
template_driver: golang
|
||||
name: ${STACK_NAME}_mar_config_${MATRIX_ALERTMANAGER_CONFIG_VERSION}
|
||||
file: alertmanager-matrix-config.yml.tmpl
|
||||
|
||||
secrets:
|
||||
matrix_access_token:
|
||||
external: true
|
||||
name: ${STACK_NAME}_matrix_access_token_${SECRET_MATRIX_ACCESS_TOKEN_VERSION}
|
@ -24,12 +24,12 @@ services:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`prometheus.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.middlewares=basicauth@file"
|
||||
|
||||
configs:
|
||||
prometheus_yml:
|
||||
|
@ -17,7 +17,6 @@ services:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-pushgateway.loadbalancer.server.port=9191"
|
||||
- "traefik.http.routers.${STACK_NAME}-pushgateway.rule=Host(`pushgateway.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-pushgateway.entrypoints=web-secure"
|
||||
|
@ -32,14 +32,13 @@ services:
|
||||
labels:
|
||||
- "backupbot.backup=${ENABLE_BACKUPS:-true}"
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
|
||||
- "coop-cloud.${STACK_NAME}.version=1.6.0+v1.8.1"
|
||||
- "coop-cloud.${STACK_NAME}.version=1.3.0+v1.8.1"
|
||||
- "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT:-120}"
|
||||
|
||||
cadvisor:
|
||||
@ -64,7 +63,6 @@ services:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.docker.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
|
||||
|
@ -1 +0,0 @@
|
||||
Adds an optional GRAFANA_DOMAIN
|
@ -1 +0,0 @@
|
||||
Adds an optional matrix contact point for grafana
|
@ -1 +0,0 @@
|
||||
Adds option to expose ports for node and cadvisor service
|
Loading…
x
Reference in New Issue
Block a user