This commit is contained in:
Philipp Rothmann 2023-05-11 15:23:35 +02:00
parent 49a4d6ab17
commit 6d556f5ad1
7 changed files with 588 additions and 598 deletions

View File

@ -38,8 +38,7 @@ Where gathering.org is the node you want to gather metrics from.
* `abra app deploy traefik` (might need to undeploy before)
1. `abra app new monitoring-ng`
1. `abra app config monitoring.gathering.org`
for gathering only this is required:
`COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"`
For gathering only, the main `compose.yml` is sufficient; no additional compose files are needed.
1. `abra app deploy monitoring.gathering.org`
1. check that endpoints are up and basic-auth works
* cadvisor.monitoring.gathering.org
@ -51,7 +50,10 @@ Where gathering.org is the node you want to gather metrics from.
* monitoring.example.org
* loki.monitoring.example.org
* loki.monitoring.example.org
1. Set up monitoring stack
* `abra app new monitoring-ng`
* `abra app config monitoring.example.org`
*
```
cp scrape-config.example.yml gathering.org.yml
@ -79,26 +81,26 @@ abra app cp monitoring.dev.local-it.cloud gathering.org.yml prometheus:/promethe
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
| Node Exporter | traefik basic-auth | node.monitoring.example.org |
### TODO
* metrics.compose.yml -> compose.yml
todo:
* [x] metrics.compose.yml -> compose.yml
* Grafana
* [ ] Test SSO
* Loki
* [ ] s3 aws secret?
* [ ] understand config, make it sane
* [ ] Promtail
* [ ] make it work
* prometheus retention!
* traefik metrics
* [x] make it work
* [ ] test it with second server
* [ ] prometheus retention / storage size limit
* [x] traefik metrics
* [ ] document example scrape config prometheus
nice to have:
* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
* authentik metrics?
* cool alerts
* note: all gathering nodes will have the same htpasswd basic-auth secret ...
-> this could be a use case to actually use docker swarm ...
could use swarm_service_discovery then in prometheus
* improve prometheus discovery / security things
-> multiple scrape_configs in prometheus
service
-> oauth / header? prometheus could do it, does promtail? does traefik?

View File

@ -1,79 +0,0 @@
# Metrics-exporter stack: a node_exporter service and a cadvisor service,
# each published through Traefik labels behind the basicauth@file middleware.
version: '3.8'
services:
# Host metrics exporter; Traefik forwards node.${DOMAIN} to container port 9100.
node_exporter:
image: prom/node-exporter:v1.0.1
# Runs as root; host /proc, /sys and / are mounted read-only below.
user: root
environment:
# Go-template placeholder; presumably expanded to the node ID by Docker Swarm at deploy time (confirm).
- NODE_ID={{.Node.ID}}
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
# Host name file exposed inside the container as /etc/nodename.
- /etc/hostname:/etc/nodename:ro
command:
# Point the exporter at the host filesystems mounted under /host and /rootfs.
- "--path.sysfs=/host/sys"
- "--path.procfs=/host/proc"
- "--path.rootfs=/rootfs"
# NOTE(review): nothing in this file mounts /etc/node-exporter/; presumably the
# entrypoint script populates it. Confirm.
- "--collector.textfile.directory=/etc/node-exporter/"
# "$$" is the Compose escape for a literal "$", so the regex ends in ($|/).
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
- "--no-collector.ipvs"
configs:
# Injects the script declared under the top-level `configs` key as /entrypoint.sh.
- source: node_exporter_entrypoint_sh
target: /entrypoint.sh
# NOTE(review): these networks are not declared in this file; presumably they
# come from the base compose file this one is layered onto. Confirm.
networks:
- internal
- proxy
# Start through the injected script; "-e" makes sh exit on the first failing command.
entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
deploy:
restart_policy:
condition: on-failure
# Traefik routing: host node.${DOMAIN}, web-secure entrypoint, TLS via the
# ${LETS_ENCRYPT_ENV} cert resolver, basic auth from the file provider.
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
# Container metrics exporter; Traefik forwards cadvisor.${DOMAIN} to container port 8080.
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.0
command:
- "-logtostderr"
- "-docker_only"
# Only this subset of metric groups is enabled; the full list is in the comment below.
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
- "--housekeeping_interval=60s"
# Host paths mounted read-only into the container.
volumes:
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
- /sys:/sys:ro
- /var/run:/var/run:ro
- /:/rootfs:ro
networks:
- internal
- proxy
deploy:
restart_policy:
condition: on-failure
# Traefik routing: host cadvisor.${DOMAIN}, web-secure entrypoint, TLS via the
# ${LETS_ENCRYPT_ENV} cert resolver, basic auth from the file provider.
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
# Probes http://localhost:8080/healthz with wget every 15s (15s timeout),
# allowing 5 retries after a 30s start period.
healthcheck:
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
interval: 15s
timeout: 15s
retries: 5
start_period: 30s
configs:
# Entrypoint script for node_exporter, read from node-exporter-entrypoint.sh.
# The config name embeds NODE_EXPORTER_ENTRYPOINT_VERSION, so changing that
# variable yields a differently named config.
node_exporter_entrypoint_sh:
name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
file: node-exporter-entrypoint.sh

View File

@ -3,8 +3,82 @@ version: "3.8"
# Main stack services: `app` (node exporter) and `cadvisor`, each published
# through Traefik labels behind the basicauth@file middleware.
services:
# Host metrics exporter; Traefik forwards node.${DOMAIN} to container port 9100.
app:
# NOTE(review): `image` and `entrypoint` each appear twice under `app`. This looks
# like a rendered diff in which the debian:stable-slim / tail pair is the replaced
# side. Confirm which pair is current; duplicate keys are not valid YAML.
image: debian:stable-slim
entrypoint: "/bin/tail -f /dev/null"
image: prom/node-exporter:v1.0.1
# Runs as root; host /proc, /sys and / are mounted read-only below.
user: root
environment:
# Go-template placeholder; presumably expanded to the node ID by Docker Swarm at deploy time (confirm).
- NODE_ID={{.Node.ID}}
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
# Host name file exposed inside the container as /etc/nodename.
- /etc/hostname:/etc/nodename:ro
command:
# Point the exporter at the host filesystems mounted under /host and /rootfs.
- "--path.sysfs=/host/sys"
- "--path.procfs=/host/proc"
- "--path.rootfs=/rootfs"
# NOTE(review): no volume in this service mounts /etc/node-exporter/; presumably
# the entrypoint script populates it. Confirm.
- "--collector.textfile.directory=/etc/node-exporter/"
# "$$" is the Compose escape for a literal "$", so the regex ends in ($|/).
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
- "--no-collector.ipvs"
configs:
# Injects the script declared under the top-level `configs` key as /entrypoint.sh.
- source: node_exporter_entrypoint_sh
target: /entrypoint.sh
networks:
- internal
- proxy
# Start through the injected script; "-e" makes sh exit on the first failing command.
entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
deploy:
restart_policy:
condition: on-failure
# Traefik routing: host node.${DOMAIN}, web-secure entrypoint, TLS via the
# ${LETS_ENCRYPT_ENV} cert resolver, basic auth from the file provider.
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
# Container metrics exporter; Traefik forwards cadvisor.${DOMAIN} to container port 8080.
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.0
command:
- "-logtostderr"
- "-docker_only"
# Only this subset of metric groups is enabled; the full list is in the comment below.
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
- "--housekeeping_interval=60s"
# Host paths mounted read-only into the container.
volumes:
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
- /sys:/sys:ro
- /var/run:/var/run:ro
- /:/rootfs:ro
networks:
- internal
- proxy
deploy:
restart_policy:
condition: on-failure
# Traefik routing: host cadvisor.${DOMAIN}, web-secure entrypoint, TLS via the
# ${LETS_ENCRYPT_ENV} cert resolver, basic auth from the file provider.
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
# Probes http://localhost:8080/healthz with wget every 15s (15s timeout),
# allowing 5 retries after a 30s start period.
healthcheck:
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
interval: 15s
timeout: 15s
retries: 5
start_period: 30s
configs:
# Entrypoint script for the `app` service, read from node-exporter-entrypoint.sh.
# The config name embeds NODE_EXPORTER_ENTRYPOINT_VERSION, so changing that
# variable yields a differently named config.
node_exporter_entrypoint_sh:
name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
file: node-exporter-entrypoint.sh
networks:

3
demo.yml Normal file
View File

@ -0,0 +1,3 @@
# Target group for the demo node: its node-exporter and cadvisor hostnames.
# NOTE(review): presumably loaded by Prometheus as a file-based service-discovery
# target list. Confirm against the Prometheus scrape config.
- targets:
- 'node.monitoring.demo.local-it.cloud'
- 'cadvisor.monitoring.demo.local-it.cloud'

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
# Example target group; replace example.org with the node to be scraped.
# Targets must be bare host[:port] entries: Prometheus rejects a target address
# that contains "/", so the URL path is not written here.
# NOTE(review): the path comes from the scrape job's metrics_path, which defaults
# to /metrics; the job definition is not visible here. Confirm.
- targets:
- 'traefik.example.org'
- 'example.org:8082'
- 'node.monitoring.example.org'
- 'cadvisor.monitoring.example.org'