This commit is contained in:
Philipp Rothmann 2023-05-11 15:23:35 +02:00
parent 49a4d6ab17
commit 6d556f5ad1
7 changed files with 588 additions and 598 deletions

View File

@ -38,8 +38,7 @@ Where gathering.org is the node you want to gather metrics from.
* `abra app deploy traefik` (might need to undeploy before)
1. `abra app new monitoring-ng`
1. `abra app config monitoring.gathering.org`
for gathering only this is required:
`COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"`
for gathering only the main `compose.yml` is needed, nothing more.
1. `abra app deploy monitoring.gathering.org`
1. check that endpoints are up and basic-auth works
* cadvisor.monitoring.gathering.org
@ -51,7 +50,10 @@ Where gathering.org is the node you want to gather metrics from.
* monitoring.example.org
* loki.monitoring.example.org
* prometheus.monitoring.example.org
1. Setup monitoring stack
* `abra app new monitoring-ng`
* `abra app config monitoring.example.org`
*
```
cp scrape-config.example.yml gathering.org.yml
@ -79,29 +81,29 @@ abra app cp monitoring.dev.local-it.cloud gathering.org.yml prometheus:/promethe
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
| Node Exporter | traefik basic-auth | node.monitoring.example.org |
### TODO
* metrics.compose.yml -> compose.yml
* Grafana
* [ ] Test SSO
* Loki
* [ ] s3 aws secret?
* [ ] understand config, make it sane
* [ ] Promtail
* [ ] make it work
* prometheus retention!
* traefik metrics
* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
* authentik metrics?
* cool alerts
* note: all gathering nodes will have the same htpasswd basic-auth secret ...
-> this could be a use case to actually use docker swarm ...
could use swarm_service_discovery then in prometheus
-> multiple scrape_configs in prometheus
service
-> oauth / header? prometheus could do it, does promtail? does traefik?
todo:
* [x] metrics.compose.yml -> compose.yml
* Grafana
* [ ] Test SSO
* Loki
* [ ] s3 aws secret?
* [ ] understand config, make it sane
* [ ] Promtail
* [x] make it work
* [ ] test it with second server
* [ ] prometheus retention / storage size limit
* [x] traefik metrics
* [ ] document example scrape config prometheus
nice to have:
* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
* authentik metrics?
* improve prometheus discovery / security things
-> multiple scrape_configs in prometheus
service
-> oauth / header? prometheus could do it, does promtail? does traefik?
This stack requires 3 domains, one for grafana, prometheus, loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenId Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite.

View File

@ -1,79 +0,0 @@
# Compose (swarm) stack publishing two metrics exporters through traefik,
# each protected by the file-provider `basicauth` middleware.
# NOTE(review): indentation below reconstructed from the flattened diff view —
# structure inferred from compose-file conventions; verify against the repo.
version: '3.8'
services:
  node_exporter:
    image: prom/node-exporter:v1.0.1
    user: root
    environment:
      # Swarm template: resolves to the ID of the node the task runs on.
      - NODE_ID={{.Node.ID}}
    volumes:
      # Host pseudo-filesystems mounted read-only so the exporter can read
      # CPU/memory/disk stats of the host rather than the container.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - /etc/hostname:/etc/nodename:ro
    command:
      - "--path.sysfs=/host/sys"
      - "--path.procfs=/host/proc"
      - "--path.rootfs=/rootfs"
      - "--collector.textfile.directory=/etc/node-exporter/"
      # $$ escapes a literal $ for the compose interpolator.
      - "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
      - "--no-collector.ipvs"
    configs:
      - source: node_exporter_entrypoint_sh
        target: /entrypoint.sh
    networks:
      - internal
      - proxy
    # Custom entrypoint script shipped via the swarm config above.
    entrypoint: ["/bin/sh", "-e", "/entrypoint.sh"]
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
        - "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-node.tls=true"
        - "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
        - "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.0
    command:
      - "-logtostderr"
      - "-docker_only"
      - "--enable_metrics=cpu,cpuLoad,disk,memory,network"
      # all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
      - "--housekeeping_interval=60s"
    volumes:
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
      - /sys:/sys:ro
      - /var/run:/var/run:ro
      - /:/rootfs:ro
    networks:
      - internal
      - proxy
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
    healthcheck:
      test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
      interval: 15s
      timeout: 15s
      retries: 5
      start_period: 30s
configs:
  node_exporter_entrypoint_sh:
    # Versioned config name so swarm picks up changes to the script.
    name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
    file: node-exporter-entrypoint.sh

View File

@ -3,8 +3,82 @@ version: "3.8"
services:
app:
image: debian:stable-slim
entrypoint: "/bin/tail -f /dev/null"
image: prom/node-exporter:v1.0.1
user: root
environment:
- NODE_ID={{.Node.ID}}
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
- /etc/hostname:/etc/nodename:ro
command:
- "--path.sysfs=/host/sys"
- "--path.procfs=/host/proc"
- "--path.rootfs=/rootfs"
- "--collector.textfile.directory=/etc/node-exporter/"
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
- "--no-collector.ipvs"
configs:
- source: node_exporter_entrypoint_sh
target: /entrypoint.sh
networks:
- internal
- proxy
entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
deploy:
restart_policy:
condition: on-failure
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.0
command:
- "-logtostderr"
- "-docker_only"
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
- "--housekeeping_interval=60s"
volumes:
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
- /sys:/sys:ro
- /var/run:/var/run:ro
- /:/rootfs:ro
networks:
- internal
- proxy
deploy:
restart_policy:
condition: on-failure
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
healthcheck:
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
interval: 15s
timeout: 15s
retries: 5
start_period: 30s
configs:
node_exporter_entrypoint_sh:
name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
file: node-exporter-entrypoint.sh
networks:

3
demo.yml Normal file
View File

@ -0,0 +1,3 @@
# Prometheus file_sd / static scrape targets for the demo host.
# NOTE(review): indentation restored from the flattened diff view.
- targets:
    - 'node.monitoring.demo.local-it.cloud'
    - 'cadvisor.monitoring.demo.local-it.cloud'

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
- targets:
- 'traefik.example.org'
- 'example.org:8082/metrics'
- 'node.monitoring.example.org'
- 'cadvisor.monitoring.example.org'