Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
bf8af312eb
|
|||
|
a2f1636ed4
|
|||
|
f2711fa16e
|
|||
|
2870b9486c
|
|||
|
3a1fabe4f9
|
|||
|
a358837922
|
|||
|
dd0a0c1bb0
|
|||
|
31cabc36ae
|
|||
|
d25986d5cb
|
|||
|
f8f8004445
|
|||
|
aa05d022da
|
|||
|
fb52a76247
|
|||
|
2e2a52eae0
|
|||
|
48419d5afa
|
|||
|
a0a6e2c509
|
|||
|
024f2a8aec
|
|||
|
38095e23fa
|
|||
|
641161329e
|
|||
|
cdacfd035e
|
|||
|
b2d3901f61
|
|||
|
8becf1c1d6
|
|||
|
777b1355dd
|
|||
|
e83433cebd
|
|||
|
a713f98ffb
|
|||
|
8dc84c591c
|
|||
|
d9aa05a4b5
|
|||
|
349df12204
|
|||
|
6c33089078
|
|||
|
4bedebfab1
|
41
.env.sample
41
.env.sample
@ -5,16 +5,20 @@ DOMAIN=monitoring-ng.example.com
|
|||||||
#TIMEOUT=120
|
#TIMEOUT=120
|
||||||
ENABLE_BACKUPS=true
|
ENABLE_BACKUPS=true
|
||||||
|
|
||||||
## Enable this secret for Promtail / Prometheus
|
SECRET_BASIC_AUTH_VERSION=v1
|
||||||
#COMPOSE_FILE="$COMPOSE_FILE:compose.basic-auth.yml"
|
# Enable this to send logs to a Loki server, adapt DOMAIN if server is
|
||||||
#SECRET_BASIC_AUTH_VERSION=v1
|
# remote
|
||||||
#
|
# LOKI_PUSH_URL=https://loki.$DOMAIN/loki/api/v1/push
|
||||||
# Promtail (Gathering Logs)
|
# Enable this on SystemD hosts to read logs
|
||||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml"
|
# JOURNALD=1
|
||||||
# LOKI_PUSH_URL=https://loki.monitoring.example.org/loki/api/v1/push
|
# Enable this on syslogd hosts and configure the syslogd to send logs to
|
||||||
|
# Alloy on port 514/tcp
|
||||||
|
# SYSLOG=1
|
||||||
|
# COMPOSE_FILE="$COMPOSE_FILE:compose.syslog.yml"
|
||||||
|
|
||||||
## Expose node and cadvisor ports instead of traefik
|
# Enable this to send metrics to a Prometheus server, adapt DOMAIN if
|
||||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.expose-ports.yml"
|
# server is remote
|
||||||
|
# PROMETHEUS_REMOTE_WRITE_URL=https://prometheus.$DOMAIN/api/v1/write
|
||||||
|
|
||||||
# Monitoring Server
|
# Monitoring Server
|
||||||
#
|
#
|
||||||
@ -47,8 +51,6 @@ ENABLE_BACKUPS=true
|
|||||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml"
|
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml"
|
||||||
# GF_SERVER_ROOT_URL=https://monitoring.example.com
|
# GF_SERVER_ROOT_URL=https://monitoring.example.com
|
||||||
# SECRET_GF_ADMINPASSWD_VERSION=v1
|
# SECRET_GF_ADMINPASSWD_VERSION=v1
|
||||||
## Separate domain for Grafana
|
|
||||||
#GRAFANA_DOMAIN=grafana.example.com
|
|
||||||
#
|
#
|
||||||
## Single-Sign-On with OIDC
|
## Single-Sign-On with OIDC
|
||||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana-oidc.yml"
|
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana-oidc.yml"
|
||||||
@ -80,10 +82,15 @@ ENABLE_BACKUPS=true
|
|||||||
#GF_MATRIX_ROOM_ID="<room-id>"
|
#GF_MATRIX_ROOM_ID="<room-id>"
|
||||||
#GF_MATRIX_HOMESERVER_URL="<homeserver-url>"
|
#GF_MATRIX_HOMESERVER_URL="<homeserver-url>"
|
||||||
|
|
||||||
## Alerts
|
# Alerts
|
||||||
|
#ALERT_BACKUP_FAILED_ENABLED=true
|
||||||
|
#ALERT_BACKUP_MISSING_ENABLED=true
|
||||||
|
#ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED=true
|
||||||
|
#ALERT_NODE_DISK_SPACE_ENABLED=true
|
||||||
|
#ALERT_NODE_MEMORY_USAGE_ENABLED=true
|
||||||
|
|
||||||
# Node disk space alert will trigger when free disk space left is below the given number in percent
|
# Forgejo metrics
|
||||||
#ALERT_NODE_DISK_SPACE_LEFT=10
|
# SECRET_FORGEJO_METRICS_TOKEN_VERSION=v1
|
||||||
|
# FORGEJO_METRICS_HOSTNAME=
|
||||||
# Node memory usage alert will trigger when memory usage is above the given number in percent
|
# FORGEJO_INSECURE_SKIP_VERIFY=false
|
||||||
#ALERT_NODE_MEMORY_USAGE=85
|
# COMPOSE_FILE="$COMPOSE_FILE:compose.forgejo.yml"
|
||||||
|
|||||||
58
README.md
58
README.md
@ -1,8 +1,8 @@
|
|||||||
# monitoring-ng
|
# monitoring-ng
|
||||||
|
|
||||||
Yet another monitoring stack ...
|
Yet another monitoring stack ...
|
||||||
This time it's an all-in-one grafana/prometheus/loki/node_exporter/cadvisor/promtail stack.
|
This time it's an all-in-one grafana/prometheus/loki/alloy stack.
|
||||||
It's based heavily on the [monitoring-lite](https://git.coopcloud.tech/coop-cloud/monitoring-lite) stack, but has everything in one recipe included now. So you can deploy monitoring instances to only gather metrics / logs (node_exporter/cadvisor/promtail) and also deploy instances with the full monitoring stack (grafana/prometheus/loki) with the same recipe and just different .env configuration.
|
It's based heavily on the [monitoring-lite](https://git.coopcloud.tech/coop-cloud/monitoring-lite) stack, but has everything in one recipe included now. So you can deploy monitoring instances to only gather metrics / logs (alloy) and also deploy instances with the full monitoring stack (grafana/prometheus/loki) with the same recipe and just different .env configuration.
|
||||||
|
|
||||||
|
|
||||||
<!-- metadata -->
|
<!-- metadata -->
|
||||||
@ -18,37 +18,47 @@ It's based heavily on the [monitoring-lite](https://git.coopcloud.tech/coop-clou
|
|||||||
|
|
||||||
<!-- endmetadata -->
|
<!-- endmetadata -->
|
||||||
|
|
||||||
## Setup Metrics Gathering
|
## Setup a Metrics Gathering
|
||||||
|
|
||||||
Where gathering.org is the node you want to gather metrics from.
|
Where gathering.org is the node you want to gather metrics from.
|
||||||
|
|
||||||
1. Configure DNS
|
1. Configure DNS
|
||||||
|
- monitoring.gathering.org
|
||||||
- cadvisor.monitoring.gathering.org
|
- cadvisor.monitoring.gathering.org
|
||||||
- node.monitoring.gathering.org
|
- node.monitoring.gathering.org
|
||||||
2. [Configure Traefik to use BasicAuth](https://git.coopcloud.tech/coop-cloud/traefik#configuring-wildcard-ssl-using-dns)
|
1. Configure Traefik to use BasicAuth
|
||||||
3. `abra app new monitoring-ng`
|
* `abra app config traefik.gathering.org`
|
||||||
4. `abra app config monitoring.gathering.org` (for gathering only the main `compose.yml` is needed, nothing more.)
|
uncomment
|
||||||
5. `abra app deploy monitoring.gathering.org`
|
```
|
||||||
6. check that endpoints are up and basic-auth works
|
# BASIC_AUTH
|
||||||
|
COMPOSE_FILE="$COMPOSE_FILE:compose.basicauth.yml"
|
||||||
|
BASIC_AUTH=1
|
||||||
|
SECRET_USERSFILE_VERSION=v1
|
||||||
|
```
|
||||||
|
- Generate a users list with an htpasswd-hashed password
|
||||||
|
`abra app secret insert traefik.gathering.org usersfile v1 'admin:<hashed-secret>'`
|
||||||
|
make sure there is no whitespace in between `admin:<hashed-secret>`, it seems to break stuff...
|
||||||
|
- `abra app deploy -f traefik`
|
||||||
|
1. `abra app new monitoring-ng`
|
||||||
|
1. `abra app config monitoring.gathering.org`
|
||||||
|
for gathering only the main `compose.yml` is needed, nothing more.
|
||||||
|
1. `abra app deploy monitoring.gathering.org`
|
||||||
|
1. check that endpoints are up and basic-auth works
|
||||||
- cadvisor.monitoring.gathering.org
|
- cadvisor.monitoring.gathering.org
|
||||||
- node.monitoring.gathering.org
|
- node.monitoring.gathering.org
|
||||||
|
|
||||||
### Expose node and cadvisor via ports instead of traefik
|
|
||||||
|
|
||||||
In case you have no traefik running on the machine, you can expose the ports directly by uncommenting the following line:
|
|
||||||
```
|
|
||||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.expose-ports.yml"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Setup Metrics Browser
|
## Setup Metrics Browser
|
||||||
|
|
||||||
This builds upon [Setup Metrics Gathering](#setup-metrics-gathering), so make sure you did that first.
|
|
||||||
|
|
||||||
1. Configure DNS
|
1. Configure DNS
|
||||||
- monitoring.example.org
|
- monitoring.example.org
|
||||||
|
- prometheus.monitoring.example.org
|
||||||
|
- loki.monitoring.example.org
|
||||||
2. Setup monitoring stack
|
2. Setup monitoring stack
|
||||||
- `abra app config monitoring.example.org` Uncomment prometheus, loki and grafana
|
- `abra app new monitoring-ng`
|
||||||
- `abra app secret insert monitoring.example.org basic_auth v1 <password>`
|
- `abra app config monitoring.example.org`
|
||||||
|
Uncomment the prometheus, loki and grafana sections
|
||||||
|
- `abra app secret insert monitoring.example.org basic_auth v1 <secret>`
|
||||||
this needs the plaintext traefik basic-auth secret, not the hashed one!
|
this needs the plaintext traefik basic-auth secret, not the hashed one!
|
||||||
- `abra app secret ls monitoring.example.org`
|
- `abra app secret ls monitoring.example.org`
|
||||||
- `abra app deploy monitoring.example.org`
|
- `abra app deploy monitoring.example.org`
|
||||||
@ -139,9 +149,13 @@ GF_MATRIX_HOME_SERVER_URL=
|
|||||||
```
|
```
|
||||||
4. Configure Alertmanager webhook and set the url to `http://matrix-alertmanager-receiver:12345/alerts/<room-id>`
|
4. Configure Alertmanager webhook and set the url to `http://matrix-alertmanager-receiver:12345/alerts/<room-id>`
|
||||||
|
|
||||||
## Alerts
|
## Alerts
|
||||||
|
|
||||||
|
It is possible to enable the following alerts, by setting the corresponding env variable to `true`:
|
||||||
|
- backupbot failed: `ALERT_BACKUP_FAILED_ENABLED`
|
||||||
|
- backupbot missing: `ALERT_BACKUP_MISSING_ENABLED`
|
||||||
|
- backupbot not successful: `ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED`
|
||||||
|
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
|
||||||
|
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`
|
||||||
|
|
||||||
It is possible to enable the following alerts, by uncommenting the corresponding env variable:
|
|
||||||
|
|
||||||
- node disk space: `ALERT_NODE_DISK_SPACE_LEFT`
|
|
||||||
- node memory usage: `ALERT_NODE_MEMORY_USAGE`
|
|
||||||
|
|||||||
8
abra.sh
8
abra.sh
@ -6,12 +6,12 @@ export GF_STACKS_DASH_VERSION=v2
|
|||||||
export GF_TRAEFIK_DASH_VERSION=v2
|
export GF_TRAEFIK_DASH_VERSION=v2
|
||||||
export GF_BACKUP_DASH_VERSION=v1
|
export GF_BACKUP_DASH_VERSION=v1
|
||||||
export GF_CUSTOM_INI_VERSION=v4
|
export GF_CUSTOM_INI_VERSION=v4
|
||||||
export PROMTAIL_YML_VERSION=v3
|
|
||||||
export LOKI_YML_VERSION=v3
|
export LOKI_YML_VERSION=v3
|
||||||
export PROMETHEUS_YML_VERSION=v2
|
export PROMETHEUS_YML_VERSION=v2
|
||||||
export MATRIX_ALERTMANAGER_CONFIG_VERSION=v1
|
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
|
||||||
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=v1
|
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
|
||||||
export GRAFANA_ALERTS_NODE_VERSION=v2
|
export GRAFANA_ALERTS_NODE_VERSION=v1c
|
||||||
|
export CONFIG_ALLOY_VERSION=v9
|
||||||
|
|
||||||
# creates a default prometheus scrape config for a given node
|
# creates a default prometheus scrape config for a given node
|
||||||
add_node(){
|
add_node(){
|
||||||
|
|||||||
@ -2,13 +2,13 @@ apiVersion: 1
|
|||||||
|
|
||||||
# List of alert rule UIDs that should be deleted
|
# List of alert rule UIDs that should be deleted
|
||||||
deleteRules:
|
deleteRules:
|
||||||
{{ if not (env "ALERT_NODE_DISK_SPACE_LEFT") }}
|
{{ if ne (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
uid: coopcloud_node_disk_space_left
|
uid: bds8bhxu97pxca
|
||||||
{{ end }}
|
{{ end }}
|
||||||
{{ if not (env "ALERT_NODE_MEMORY_USAGE") }}
|
{{ if ne (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
|
||||||
- orgId: 1
|
- orgId: 1
|
||||||
uid: coopcloud_node_memory_usage
|
uid: ads8cswmly96oa
|
||||||
{{ end }}
|
{{ end }}
|
||||||
|
|
||||||
groups:
|
groups:
|
||||||
@ -17,8 +17,8 @@ groups:
|
|||||||
folder: node
|
folder: node
|
||||||
interval: 5m
|
interval: 5m
|
||||||
rules:
|
rules:
|
||||||
{{ if (env "ALERT_NODE_DISK_SPACE_LEFT") }}
|
{{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
|
||||||
- uid: coopcloud_node_disk_space_left
|
- uid: bds8bhxu97pxca
|
||||||
title: Node Disk Space
|
title: Node Disk Space
|
||||||
condition: C
|
condition: C
|
||||||
data:
|
data:
|
||||||
@ -45,7 +45,7 @@ groups:
|
|||||||
conditions:
|
conditions:
|
||||||
- evaluator:
|
- evaluator:
|
||||||
params:
|
params:
|
||||||
- {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}
|
- 10
|
||||||
type: lt
|
type: lt
|
||||||
operator:
|
operator:
|
||||||
type: and
|
type: and
|
||||||
@ -70,13 +70,13 @@ groups:
|
|||||||
annotations:
|
annotations:
|
||||||
description: ""
|
description: ""
|
||||||
runbook_url: ""
|
runbook_url: ""
|
||||||
summary: Less than {{ env "ALERT_NODE_DISK_SPACE_LEFT" }}% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
|
summary: Less than 10% disk space left on {{`{{ $labels.instance }}`}} ({{`{{ (index $values "A").Value }}`}}% left)
|
||||||
labels:
|
labels:
|
||||||
"": ""
|
"": ""
|
||||||
isPaused: false
|
isPaused: false
|
||||||
{{ end }}
|
{{ end }}
|
||||||
{{ if (env "ALERT_NODE_MEMORY_USAGE") }}
|
{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
|
||||||
- uid: coopcloud_node_memory_usage
|
- uid: ads8cswmly96oa
|
||||||
title: Node Memory Usage
|
title: Node Memory Usage
|
||||||
condition: C
|
condition: C
|
||||||
data:
|
data:
|
||||||
@ -103,7 +103,7 @@ groups:
|
|||||||
conditions:
|
conditions:
|
||||||
- evaluator:
|
- evaluator:
|
||||||
params:
|
params:
|
||||||
- {{ env "ALERT_NODE_MEMORY_USAGE" }}
|
- 85
|
||||||
type: gt
|
type: gt
|
||||||
operator:
|
operator:
|
||||||
type: and
|
type: and
|
||||||
@ -126,6 +126,6 @@ groups:
|
|||||||
execErrState: Error
|
execErrState: Error
|
||||||
for: 5m
|
for: 5m
|
||||||
annotations:
|
annotations:
|
||||||
summary: Memory usage is above {{ env "ALERT_NODE_MEMORY_USAGE" }}% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
|
summary: Memory usage is above 85% on {{`{{ $labels.instance }}`}} ({{`{{ printf "%.2f" (index $values "A").Value }}`}}% usage)
|
||||||
isPaused: false
|
isPaused: false
|
||||||
{{ end }}
|
{{ end }}
|
||||||
|
|||||||
@ -1,7 +0,0 @@
|
|||||||
---
|
|
||||||
version: "3.8"
|
|
||||||
|
|
||||||
secrets:
|
|
||||||
basic_auth:
|
|
||||||
external: true
|
|
||||||
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}
|
|
||||||
@ -1,13 +0,0 @@
|
|||||||
---
|
|
||||||
version: "3.8"
|
|
||||||
|
|
||||||
services:
|
|
||||||
app:
|
|
||||||
ports:
|
|
||||||
- "9100:9100"
|
|
||||||
deploy:
|
|
||||||
|
|
||||||
cadvisor:
|
|
||||||
ports:
|
|
||||||
- "9101:8080"
|
|
||||||
deploy:
|
|
||||||
10
compose.forgejo.yml
Normal file
10
compose.forgejo.yml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
---
|
||||||
|
version: "3.8"
|
||||||
|
services:
|
||||||
|
app:
|
||||||
|
secrets:
|
||||||
|
- forgejo_token
|
||||||
|
secrets:
|
||||||
|
forgejo_token:
|
||||||
|
external: true
|
||||||
|
name: ${STACK_NAME}_forgejo_token_${SECRET_FORGEJO_METRICS_TOKEN_VERSION}
|
||||||
@ -2,7 +2,7 @@ version: '3.8'
|
|||||||
|
|
||||||
services:
|
services:
|
||||||
grafana:
|
grafana:
|
||||||
image: grafana/grafana:12.4.0
|
image: grafana/grafana:12.4.3
|
||||||
volumes:
|
volumes:
|
||||||
- grafana-data:/var/lib/grafana:rw
|
- grafana-data:/var/lib/grafana:rw
|
||||||
secrets:
|
secrets:
|
||||||
@ -32,19 +32,19 @@ services:
|
|||||||
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/gf_adminpasswd
|
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/gf_adminpasswd
|
||||||
- GF_SECURITY_ALLOW_EMBEDDING
|
- GF_SECURITY_ALLOW_EMBEDDING
|
||||||
- GF_INSTALL_PLUGINS
|
- GF_INSTALL_PLUGINS
|
||||||
- ALERT_NODE_DISK_SPACE_LEFT
|
- ALERT_NODE_DISK_SPACE_ENABLED
|
||||||
- ALERT_NODE_MEMORY_USAGE
|
- ALERT_NODE_MEMORY_USAGE_ENABLED
|
||||||
deploy:
|
deploy:
|
||||||
labels:
|
labels:
|
||||||
- "traefik.enable=true"
|
- "traefik.enable=true"
|
||||||
- "traefik.docker.network=proxy"
|
- "traefik.swarm.network=proxy"
|
||||||
- "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
|
- "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
|
||||||
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN:-$DOMAIN}`)"
|
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${DOMAIN}`)"
|
||||||
- "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
|
- "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
|
||||||
- "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
|
- "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
|
||||||
- "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
- "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: "wget -q http://localhost:3000/ -O/dev/null"
|
test: "wget -q http://localhost:3000/healthz -O/dev/null"
|
||||||
interval: 5s
|
interval: 5s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 3
|
||||||
|
|||||||
@ -2,7 +2,7 @@ version: '3.8'
|
|||||||
|
|
||||||
services:
|
services:
|
||||||
loki:
|
loki:
|
||||||
image: grafana/loki:3.6.7
|
image: grafana/loki:3.7.2
|
||||||
command: -config.file=/etc/loki/local-config.yaml
|
command: -config.file=/etc/loki/local-config.yaml
|
||||||
networks:
|
networks:
|
||||||
- proxy
|
- proxy
|
||||||
@ -27,7 +27,7 @@ services:
|
|||||||
condition: on-failure
|
condition: on-failure
|
||||||
labels:
|
labels:
|
||||||
- "traefik.enable=true"
|
- "traefik.enable=true"
|
||||||
- "traefik.docker.network=proxy"
|
- "traefik.swarm.network=proxy"
|
||||||
- "traefik.http.services.${STACK_NAME}-loki.loadbalancer.server.port=3100"
|
- "traefik.http.services.${STACK_NAME}-loki.loadbalancer.server.port=3100"
|
||||||
- "traefik.http.routers.${STACK_NAME}-loki.rule=Host(`loki.${DOMAIN}`)"
|
- "traefik.http.routers.${STACK_NAME}-loki.rule=Host(`loki.${DOMAIN}`)"
|
||||||
- "traefik.http.routers.${STACK_NAME}-loki.entrypoints=web-secure"
|
- "traefik.http.routers.${STACK_NAME}-loki.entrypoints=web-secure"
|
||||||
|
|||||||
@ -2,7 +2,7 @@ version: '3.8'
|
|||||||
|
|
||||||
services:
|
services:
|
||||||
prometheus:
|
prometheus:
|
||||||
image: prom/prometheus:v3.10.0
|
image: prom/prometheus:v3.12.0
|
||||||
secrets:
|
secrets:
|
||||||
- basic_auth
|
- basic_auth
|
||||||
volumes:
|
volumes:
|
||||||
@ -16,6 +16,8 @@ services:
|
|||||||
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
|
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
|
||||||
- "--web.console.templates=/usr/share/prometheus/consoles"
|
- "--web.console.templates=/usr/share/prometheus/consoles"
|
||||||
- "--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_TIME}"
|
- "--storage.tsdb.retention.time=${PROMETHEUS_RETENTION_TIME}"
|
||||||
|
- "--enable-feature=remote-write-receiver"
|
||||||
|
- "--web.enable-remote-write-receiver"
|
||||||
networks:
|
networks:
|
||||||
- proxy
|
- proxy
|
||||||
- internal
|
- internal
|
||||||
@ -24,7 +26,7 @@ services:
|
|||||||
condition: on-failure
|
condition: on-failure
|
||||||
labels:
|
labels:
|
||||||
- "traefik.enable=true"
|
- "traefik.enable=true"
|
||||||
- "traefik.docker.network=proxy"
|
- "traefik.swarm.network=proxy"
|
||||||
- "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090"
|
- "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090"
|
||||||
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`prometheus.${DOMAIN}`)"
|
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`prometheus.${DOMAIN}`)"
|
||||||
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
|
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
|
||||||
|
|||||||
@ -1,25 +0,0 @@
|
|||||||
version: "3.8"
|
|
||||||
|
|
||||||
services:
|
|
||||||
promtail:
|
|
||||||
image: grafana/promtail:3.6.7
|
|
||||||
volumes:
|
|
||||||
- /var/log:/var/log:ro
|
|
||||||
- /var/run/docker.sock:/var/run/docker.sock
|
|
||||||
command: -config.file=/etc/promtail/config.yml
|
|
||||||
configs:
|
|
||||||
- source: promtail_yml
|
|
||||||
target: /etc/promtail/config.yml
|
|
||||||
networks:
|
|
||||||
- internal
|
|
||||||
secrets:
|
|
||||||
- basic_auth
|
|
||||||
environment:
|
|
||||||
- DOMAIN
|
|
||||||
- LOKI_PUSH_URL
|
|
||||||
|
|
||||||
configs:
|
|
||||||
promtail_yml:
|
|
||||||
name: ${STACK_NAME}_promtail_yml_${PROMTAIL_YML_VERSION}
|
|
||||||
file: promtail.yml.tmpl
|
|
||||||
template_driver: golang
|
|
||||||
@ -17,7 +17,7 @@ services:
|
|||||||
condition: on-failure
|
condition: on-failure
|
||||||
labels:
|
labels:
|
||||||
- "traefik.enable=true"
|
- "traefik.enable=true"
|
||||||
- "traefik.docker.network=proxy"
|
- "traefik.swarm.network=proxy"
|
||||||
- "traefik.http.services.${STACK_NAME}-pushgateway.loadbalancer.server.port=9191"
|
- "traefik.http.services.${STACK_NAME}-pushgateway.loadbalancer.server.port=9191"
|
||||||
- "traefik.http.routers.${STACK_NAME}-pushgateway.rule=Host(`pushgateway.${DOMAIN}`)"
|
- "traefik.http.routers.${STACK_NAME}-pushgateway.rule=Host(`pushgateway.${DOMAIN}`)"
|
||||||
- "traefik.http.routers.${STACK_NAME}-pushgateway.entrypoints=web-secure"
|
- "traefik.http.routers.${STACK_NAME}-pushgateway.entrypoints=web-secure"
|
||||||
|
|||||||
6
compose.syslog.yml
Normal file
6
compose.syslog.yml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
---
|
||||||
|
version: "3.8"
|
||||||
|
services:
|
||||||
|
app:
|
||||||
|
ports:
|
||||||
|
- "514:514"
|
||||||
101
compose.yml
101
compose.yml
@ -3,89 +3,46 @@ version: "3.8"
|
|||||||
|
|
||||||
services:
|
services:
|
||||||
app:
|
app:
|
||||||
image: prom/node-exporter:v1.10.2
|
image: grafana/alloy:v1.16.1
|
||||||
user: root
|
hostname: "${DOMAIN}"
|
||||||
environment:
|
|
||||||
- NODE_ID={{.Node.ID}}
|
|
||||||
volumes:
|
|
||||||
- /proc:/host/proc:ro
|
|
||||||
- /sys:/host/sys:ro
|
|
||||||
- /:/rootfs:ro
|
|
||||||
- /etc/hostname:/etc/nodename:ro
|
|
||||||
command:
|
|
||||||
- "--path.sysfs=/host/sys"
|
|
||||||
- "--path.procfs=/host/proc"
|
|
||||||
- "--path.rootfs=/rootfs"
|
|
||||||
- "--collector.textfile.directory=/etc/node-exporter/"
|
|
||||||
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
|
|
||||||
- "--no-collector.ipvs"
|
|
||||||
configs:
|
configs:
|
||||||
- source: entrypoint
|
- source: config_alloy
|
||||||
target: /entrypoint.sh
|
target: /etc/alloy/config.alloy
|
||||||
|
volumes:
|
||||||
|
- /:/rootfs:ro
|
||||||
|
- /var/run:/var/run:rw
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
|
- /sys:/sys:ro
|
||||||
|
- /var/lib/docker:/var/lib/docker:ro
|
||||||
|
- /dev:/dev:ro
|
||||||
|
- alloy-data:/var/lib/alloy/data
|
||||||
|
command:
|
||||||
|
- "run"
|
||||||
|
- "--storage.path=/var/lib/alloy/data"
|
||||||
|
- "/etc/alloy/config.alloy"
|
||||||
networks:
|
networks:
|
||||||
- internal
|
- internal
|
||||||
- proxy
|
secrets:
|
||||||
entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
|
- basic_auth
|
||||||
deploy:
|
deploy:
|
||||||
restart_policy:
|
restart_policy:
|
||||||
condition: on-failure
|
condition: on-failure
|
||||||
labels:
|
labels:
|
||||||
- "backupbot.backup=${ENABLE_BACKUPS:-true}"
|
- "backupbot.backup=${ENABLE_BACKUPS:-true}"
|
||||||
- "traefik.enable=true"
|
- "traefik.enable=false"
|
||||||
- "traefik.docker.network=proxy"
|
|
||||||
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
|
|
||||||
- "coop-cloud.${STACK_NAME}.version=1.6.0+v1.8.1"
|
- "coop-cloud.${STACK_NAME}.version=1.6.0+v1.8.1"
|
||||||
- "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT}"
|
|
||||||
|
|
||||||
cadvisor:
|
|
||||||
image: gcr.io/cadvisor/cadvisor:v0.55.1
|
|
||||||
command:
|
|
||||||
- "-logtostderr"
|
|
||||||
- "--enable_metrics=cpu,cpuLoad,disk,diskIO,process,memory,network"
|
|
||||||
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
|
|
||||||
- "--housekeeping_interval=120s"
|
|
||||||
- "--docker_only=true"
|
|
||||||
volumes:
|
|
||||||
- /var/lib/docker/:/var/lib/docker:ro
|
|
||||||
- /dev/disk/:/dev/disk:ro
|
|
||||||
- /sys:/sys:ro
|
|
||||||
- /var/run:/var/run:ro
|
|
||||||
- /:/rootfs:ro
|
|
||||||
networks:
|
|
||||||
- internal
|
|
||||||
- proxy
|
|
||||||
deploy:
|
|
||||||
restart_policy:
|
|
||||||
condition: on-failure
|
|
||||||
labels:
|
|
||||||
- "traefik.enable=true"
|
|
||||||
- "traefik.docker.network=proxy"
|
|
||||||
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
|
|
||||||
healthcheck:
|
|
||||||
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
|
|
||||||
interval: 15s
|
|
||||||
timeout: 15s
|
|
||||||
retries: 5
|
|
||||||
start_period: 30s
|
|
||||||
|
|
||||||
configs:
|
configs:
|
||||||
entrypoint:
|
config_alloy:
|
||||||
name: ${STACK_NAME}_entrypoint_${ENTRYPOINT_VERSION}
|
template_driver: golang
|
||||||
file: node-exporter-entrypoint.sh
|
name: ${STACK_NAME}_config_alloy_${CONFIG_ALLOY_VERSION}
|
||||||
|
file: config.alloy.tmpl
|
||||||
|
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
proxy:
|
proxy:
|
||||||
external: true
|
external: true
|
||||||
internal:
|
internal:
|
||||||
|
volumes:
|
||||||
|
alloy-data:
|
||||||
|
secrets:
|
||||||
|
basic_auth:
|
||||||
|
external: true
|
||||||
|
name: ${STACK_NAME}_basic_auth_${SECRET_BASIC_AUTH_VERSION}
|
||||||
|
|||||||
108
config.alloy.tmpl
Normal file
108
config.alloy.tmpl
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
logging {
|
||||||
|
level = "info"
|
||||||
|
format = "logfmt"
|
||||||
|
}
|
||||||
|
|
||||||
|
discovery.docker "linux" {
|
||||||
|
host = "unix:///var/run/docker.sock"
|
||||||
|
}
|
||||||
|
|
||||||
|
{{ if ne (env "PROMETHEUS_REMOTE_WRITE_URL") "" }}
|
||||||
|
prometheus.exporter.cadvisor "docker" {
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.exporter.unix "default" {
|
||||||
|
include_exporter_metrics = true
|
||||||
|
rootfs_path = "/rootfs"
|
||||||
|
}
|
||||||
|
|
||||||
|
prometheus.scrape "default" {
|
||||||
|
targets = array.concat(
|
||||||
|
[{
|
||||||
|
job = "alloy",
|
||||||
|
__address__ = "127.0.0.1:12345",
|
||||||
|
}],
|
||||||
|
prometheus.exporter.unix.default.targets,
|
||||||
|
prometheus.exporter.cadvisor.docker.targets,
|
||||||
|
)
|
||||||
|
|
||||||
|
forward_to = [prometheus.remote_write.prometheus.receiver]
|
||||||
|
}
|
||||||
|
|
||||||
|
{{ if ne (env "FORGEJO_METRICS_HOSTNAME") "" }}
|
||||||
|
prometheus.scrape "forgejo" {
|
||||||
|
bearer_token = "{{ secret "forgejo_token" }}"
|
||||||
|
job_name = "forgejo"
|
||||||
|
scheme = "https"
|
||||||
|
|
||||||
|
targets = [{ __address__ = "{{ env "FORGEJO_METRICS_HOSTNAME" }}" }]
|
||||||
|
forward_to = [prometheus.remote_write.prometheus.receiver]
|
||||||
|
|
||||||
|
tls_config {
|
||||||
|
insecure_skip_verify = {{ env "FORGEJO_INSECURE_SKIP_VERIFY" }}
|
||||||
|
server_name = "{{ env "FORGEJO_METRICS_HOSTNAME" }}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
{{ end }}
|
||||||
|
|
||||||
|
prometheus.remote_write "prometheus" {
|
||||||
|
endpoint {
|
||||||
|
url = "{{ env "PROMETHEUS_REMOTE_WRITE_URL" }}"
|
||||||
|
|
||||||
|
basic_auth {
|
||||||
|
username = "admin"
|
||||||
|
password = "{{ secret "basic_auth" }}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
{{ end }}
|
||||||
|
|
||||||
|
{{ if ne (env "LOKI_PUSH_URL") "" }}
|
||||||
|
loki.source.docker "docker" {
|
||||||
|
host = "unix:///var/run/docker.sock"
|
||||||
|
targets = discovery.docker.linux.targets
|
||||||
|
labels = {"app" = "docker"}
|
||||||
|
forward_to = [loki.write.loki.receiver]
|
||||||
|
}
|
||||||
|
|
||||||
|
{{ if eq (env "JOURNALD") "1" }}
|
||||||
|
loki.source.journal "journal" {
|
||||||
|
path = "/var/log/journal"
|
||||||
|
labels = { job = "{{ env "DOMAIN" }}" }
|
||||||
|
forward_to = [loki.write.loki.receiver]
|
||||||
|
}
|
||||||
|
{{ end }}
|
||||||
|
|
||||||
|
{{ if eq (env "SYSLOG") "1" }}
|
||||||
|
loki.relabel "syslog" {
|
||||||
|
rule {
|
||||||
|
action = "labelmap"
|
||||||
|
regex = "__syslog_(.+)"
|
||||||
|
}
|
||||||
|
|
||||||
|
forward_to = []
|
||||||
|
}
|
||||||
|
|
||||||
|
loki.source.syslog "syslog" {
|
||||||
|
listener {
|
||||||
|
address = "[::1]:514"
|
||||||
|
label_structured_data = true
|
||||||
|
labels = { component = "loki.source.syslog" }
|
||||||
|
}
|
||||||
|
|
||||||
|
relabel_rules = loki.relabel.syslog.rules
|
||||||
|
forward_to = [loki.write.loki.receiver]
|
||||||
|
}
|
||||||
|
{{ end }}
|
||||||
|
|
||||||
|
loki.write "loki" {
|
||||||
|
endpoint {
|
||||||
|
url = "{{ env "LOKI_PUSH_URL" }}"
|
||||||
|
|
||||||
|
basic_auth {
|
||||||
|
username = "admin"
|
||||||
|
password = "{{ secret "basic_auth" }}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
{{ end }}
|
||||||
315
grafana-alerts.json.tmpl
Normal file
315
grafana-alerts.json.tmpl
Normal file
@ -0,0 +1,315 @@
|
|||||||
|
{
|
||||||
|
"apiVersion": 1,
|
||||||
|
"groups": [
|
||||||
|
{
|
||||||
|
"orgId": 1,
|
||||||
|
"name": "backupbot",
|
||||||
|
"folder": "node",
|
||||||
|
"interval": "1m",
|
||||||
|
"rules": [
|
||||||
|
{{ if eq (env "ALERT_BACKUP_FAILED_ENABLED") "true" }}
|
||||||
|
{
|
||||||
|
"uid": "de8e5xxup7t34a",
|
||||||
|
"title": "Backup Failed",
|
||||||
|
"condition": "C",
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||||
|
"datasourceUid": "PBFA97CFB590B2093",
|
||||||
|
"model": {
|
||||||
|
"disableTextWrap": false,
|
||||||
|
"editorMode": "builder",
|
||||||
|
"expr": "backup",
|
||||||
|
"fullMetaSearch": false,
|
||||||
|
"includeNullMetadata": true,
|
||||||
|
"instant": true,
|
||||||
|
"intervalMs": 1000,
|
||||||
|
"legendFormat": "__auto",
|
||||||
|
"maxDataPoints": 43200,
|
||||||
|
"range": false,
|
||||||
|
"refId": "A",
|
||||||
|
"useBackend": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"refId": "C",
|
||||||
|
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||||
|
"datasourceUid": "__expr__",
|
||||||
|
"model": {
|
||||||
|
"conditions": [
|
||||||
|
{
|
||||||
|
"evaluator": { "params": [0], "type": "lt" },
|
||||||
|
"operator": { "type": "and" },
|
||||||
|
"query": { "params": ["C"] },
|
||||||
|
"reducer": { "params": [], "type": "last" },
|
||||||
|
"type": "query"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"datasource": { "type": "__expr__", "uid": "__expr__" },
|
||||||
|
"expression": "A",
|
||||||
|
"intervalMs": 1000,
|
||||||
|
"maxDataPoints": 43200,
|
||||||
|
"refId": "C",
|
||||||
|
"type": "threshold"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"noDataState": "NoData",
|
||||||
|
"execErrState": "Error",
|
||||||
|
"for": "1m",
|
||||||
|
"isPaused": false
|
||||||
|
}{{ if or (eq (env "ALERT_BACKUP_MISSING_ENABLED") "true") (eq (env "ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED") "true") }},{{ end }}
|
||||||
|
{{ end }}
|
||||||
|
{{ if eq (env "ALERT_BACKUP_MISSING_ENABLED") "true" }}
|
||||||
|
{
|
||||||
|
"uid": "ce8e65uddcwe8d",
|
||||||
|
"title": "Backup Missing",
|
||||||
|
"condition": "B",
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||||
|
"datasourceUid": "PBFA97CFB590B2093",
|
||||||
|
"model": {
|
||||||
|
"disableTextWrap": false,
|
||||||
|
"editorMode": "builder",
|
||||||
|
"expr": "rate(backup[24h])",
|
||||||
|
"fullMetaSearch": false,
|
||||||
|
"includeNullMetadata": true,
|
||||||
|
"instant": true,
|
||||||
|
"intervalMs": 1000,
|
||||||
|
"legendFormat": "__auto",
|
||||||
|
"maxDataPoints": 43200,
|
||||||
|
"range": false,
|
||||||
|
"refId": "A",
|
||||||
|
"useBackend": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"refId": "B",
|
||||||
|
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||||
|
"datasourceUid": "__expr__",
|
||||||
|
"model": {
|
||||||
|
"conditions": [
|
||||||
|
{
|
||||||
|
"evaluator": { "params": [0, 0], "type": "within_range" },
|
||||||
|
"operator": { "type": "and" },
|
||||||
|
"query": { "params": ["C"] },
|
||||||
|
"reducer": { "params": [], "type": "last" },
|
||||||
|
"type": "query"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"datasource": { "type": "__expr__", "uid": "__expr__" },
|
||||||
|
"expression": "A",
|
||||||
|
"intervalMs": 1000,
|
||||||
|
"maxDataPoints": 43200,
|
||||||
|
"refId": "B",
|
||||||
|
"type": "threshold"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"noDataState": "NoData",
|
||||||
|
"execErrState": "Error",
|
||||||
|
"for": "5m",
|
||||||
|
"isPaused": false
|
||||||
|
}{{ if eq (env "ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED") "true" }},{{ end }}
|
||||||
|
{{ end }}
|
||||||
|
{{ if eq (env "ALERT_BACKUP_NOT_SUCCESSFULL_ENABLED") "true" }}
|
||||||
|
{
|
||||||
|
"uid": "de8e6bc92a8lcc",
|
||||||
|
"title": "Backup Not Successfull",
|
||||||
|
"condition": "B",
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"relativeTimeRange": {
|
||||||
|
"from": 60,
|
||||||
|
"to": 0
|
||||||
|
},
|
||||||
|
"datasourceUid": "PBFA97CFB590B2093",
|
||||||
|
"model": {
|
||||||
|
"disableTextWrap": false,
|
||||||
|
"editorMode": "builder",
|
||||||
|
"expr": "backup",
|
||||||
|
"fullMetaSearch": false,
|
||||||
|
"includeNullMetadata": true,
|
||||||
|
"instant": true,
|
||||||
|
"intervalMs": 1000,
|
||||||
|
"legendFormat": "__auto",
|
||||||
|
"maxDataPoints": 43200,
|
||||||
|
"range": false,
|
||||||
|
"refId": "A",
|
||||||
|
"useBackend": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"refId": "B",
|
||||||
|
"relativeTimeRange": {
|
||||||
|
"from": 60,
|
||||||
|
"to": 0
|
||||||
|
},
|
||||||
|
"datasourceUid": "__expr__",
|
||||||
|
"model": {
|
||||||
|
"conditions": [
|
||||||
|
{
|
||||||
|
"evaluator": {
|
||||||
|
"params": [
|
||||||
|
0
|
||||||
|
],
|
||||||
|
"type": "gt"
|
||||||
|
},
|
||||||
|
"operator": {
|
||||||
|
"type": "and"
|
||||||
|
},
|
||||||
|
"query": {
|
||||||
|
"params": [
|
||||||
|
"C"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"reducer": {
|
||||||
|
"params": [],
|
||||||
|
"type": "last"
|
||||||
|
},
|
||||||
|
"type": "query"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"datasource": {
|
||||||
|
"type": "__expr__",
|
||||||
|
"uid": "__expr__"
|
||||||
|
},
|
||||||
|
"expression": "A",
|
||||||
|
"intervalMs": 1000,
|
||||||
|
"maxDataPoints": 43200,
|
||||||
|
"refId": "B",
|
||||||
|
"type": "threshold"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"noDataState": "NoData",
|
||||||
|
"execErrState": "Error",
|
||||||
|
"for": "20m",
|
||||||
|
"annotations": {
|
||||||
|
"summary": "Backup did not finish within 20 minutes"
|
||||||
|
},
|
||||||
|
"labels": {},
|
||||||
|
"isPaused": false
|
||||||
|
}
|
||||||
|
{{ end }}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"orgId": 1,
|
||||||
|
"name": "node",
|
||||||
|
"folder": "node",
|
||||||
|
"interval": "5m",
|
||||||
|
"rules": [
|
||||||
|
{{ if eq (env "ALERT_NODE_DISK_SPACE_ENABLED") "true" }}
|
||||||
|
{
|
||||||
|
"uid": "bds8bhxu97pxca",
|
||||||
|
"title": "Node Disk Space",
|
||||||
|
"condition": "C",
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||||
|
"datasourceUid": "PBFA97CFB590B2093",
|
||||||
|
"model": {
|
||||||
|
"editorMode": "code",
|
||||||
|
"expr": "(node_filesystem_free_bytes{fstype=\"ext4\"} / node_filesystem_size_bytes{fstype=\"ext4\"}) * 100",
|
||||||
|
"instant": true,
|
||||||
|
"intervalMs": 1000,
|
||||||
|
"legendFormat": "__auto",
|
||||||
|
"maxDataPoints": 43200,
|
||||||
|
"range": false,
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"refId": "C",
|
||||||
|
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||||
|
"datasourceUid": "__expr__",
|
||||||
|
"model": {
|
||||||
|
"conditions": [
|
||||||
|
{
|
||||||
|
"evaluator": { "params": [10], "type": "lt" },
|
||||||
|
"operator": { "type": "and" },
|
||||||
|
"query": { "params": ["C"] },
|
||||||
|
"reducer": { "params": [], "type": "last" },
|
||||||
|
"type": "query"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"datasource": { "type": "__expr__", "uid": "__expr__" },
|
||||||
|
"expression": "A",
|
||||||
|
"intervalMs": 1000,
|
||||||
|
"maxDataPoints": 43200,
|
||||||
|
"refId": "C",
|
||||||
|
"type": "threshold"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"noDataState": "NoData",
|
||||||
|
"execErrState": "Error",
|
||||||
|
"for": "5m",
|
||||||
|
"annotations": {},
|
||||||
|
"labels": {},
|
||||||
|
"isPaused": false
|
||||||
|
}{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }},{{ end }}
|
||||||
|
{{ end }}
|
||||||
|
{{ if eq (env "ALERT_NODE_MEMORY_USAGE_ENABLED") "true" }}
|
||||||
|
{
|
||||||
|
"uid": "ads8cswmly96oa",
|
||||||
|
"title": "Node Memory Usage",
|
||||||
|
"condition": "C",
|
||||||
|
"data": [
|
||||||
|
{
  "refId": "A",
  "relativeTimeRange": { "from": 600, "to": 0 },
  "datasourceUid": "PBFA97CFB590B2093",
  "model": {
    "editorMode": "code",
    "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
    "instant": true,
    "intervalMs": 1000,
    "legendFormat": "__auto",
    "maxDataPoints": 43200,
    "range": false,
    "refId": "A"
  }
},
|
||||||
|
{
|
||||||
|
"refId": "C",
|
||||||
|
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||||
|
"datasourceUid": "__expr__",
|
||||||
|
"model": {
|
||||||
|
"conditions": [
|
||||||
|
{
|
||||||
|
"evaluator": { "params": [90], "type": "gt" },
|
||||||
|
"operator": { "type": "and" },
|
||||||
|
"query": { "params": ["C"] },
|
||||||
|
"reducer": { "params": [], "type": "last" },
|
||||||
|
"type": "query"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"datasource": { "type": "__expr__", "uid": "__expr__" },
|
||||||
|
"expression": "A",
|
||||||
|
"intervalMs": 1000,
|
||||||
|
"maxDataPoints": 43200,
|
||||||
|
"refId": "C",
|
||||||
|
"type": "threshold"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"noDataState": "NoData",
|
||||||
|
"execErrState": "Error",
|
||||||
|
"for": "5m",
|
||||||
|
"annotations": {},
|
||||||
|
"labels": {},
|
||||||
|
"isPaused": false
|
||||||
|
}
|
||||||
|
{{ end }}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
#!/bin/sh -e
|
|
||||||
|
|
||||||
NODE_NAME=$(cat /etc/nodename)
|
|
||||||
|
|
||||||
mkdir -p /etc/node-exporter
|
|
||||||
|
|
||||||
echo "node_meta{node_id=\"$NODE_ID\", container_label_com_docker_swarm_node_id=\"$NODE_ID\", node_name=\"$NODE_NAME\"} 1" > /etc/node-exporter/node-meta.prom
|
|
||||||
|
|
||||||
set -- /bin/node_exporter "$@"
|
|
||||||
|
|
||||||
exec "$@"
|
|
||||||
@ -1,37 +0,0 @@
|
|||||||
server:
|
|
||||||
http_listen_port: 9080
|
|
||||||
grpc_listen_port: 0
|
|
||||||
|
|
||||||
positions:
|
|
||||||
filename: /tmp/positions.yaml
|
|
||||||
|
|
||||||
clients:
|
|
||||||
- url: {{ env "LOKI_PUSH_URL" }}
|
|
||||||
basic_auth:
|
|
||||||
username: admin
|
|
||||||
password: {{ secret "basic_auth" }}
|
|
||||||
external_labels:
|
|
||||||
hostname: {{ env "DOMAIN" }}
|
|
||||||
|
|
||||||
scrape_configs:
|
|
||||||
- job_name: system
|
|
||||||
static_configs:
|
|
||||||
- targets:
|
|
||||||
- localhost
|
|
||||||
labels:
|
|
||||||
job: varlogs
|
|
||||||
__path__: /var/log/*log
|
|
||||||
|
|
||||||
- job_name: "docker"
|
|
||||||
docker_sd_configs:
|
|
||||||
- host: "unix:///var/run/docker.sock"
|
|
||||||
refresh_interval: "10s"
|
|
||||||
relabel_configs:
|
|
||||||
- source_labels: ['__meta_docker_container_name']
|
|
||||||
target_label: "container_name"
|
|
||||||
- source_labels: ['__meta_docker_container_id']
|
|
||||||
target_label: "container_id"
|
|
||||||
- source_labels: ['__meta_docker_container_label_com_docker_stack_namespace']
|
|
||||||
target_label: "stack_namespace"
|
|
||||||
- source_labels: ['__meta_docker_container_label_com_docker_swarm_service_name']
|
|
||||||
target_label: "service_name"
|
|
||||||
Reference in New Issue
Block a user