This commit is contained in:
Philipp Rothmann 2023-05-11 15:23:35 +02:00
parent 49a4d6ab17
commit 6d556f5ad1
7 changed files with 588 additions and 598 deletions

View File

@ -38,8 +38,7 @@ Where gathering.org is the node you want to gather metrics from.
* `abra app deploy traefik` (might need to undeploy before) * `abra app deploy traefik` (might need to undeploy before)
1. `abra app new monitoring-ng` 1. `abra app new monitoring-ng`
1. `abra app config monitoring.gathering.org` 1. `abra app config monitoring.gathering.org`
for gathering only this is required: for gathering only the main `compose.yml` is needed, nothing more.
`COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"`
1. `abra app deploy monitoring.gathering.org` 1. `abra app deploy monitoring.gathering.org`
1. check that endpoints are up and basic-auth works 1. check that endpoints are up and basic-auth works
* cadvisor.monitoring.gathering.org * cadvisor.monitoring.gathering.org
@ -51,7 +50,10 @@ Where gathering.org is the node you want to gather metrics from.
* monitoring.example.org * monitoring.example.org
* loki.monitoring.example.org * loki.monitoring.example.org
* loki.monitoring.example.org * loki.monitoring.example.org
1. Setup monitoring stack
* `abra app new monitoring-ng`
* `abra app config monitoring.example.org`
*
``` ```
cp scrape-config.example.yml gathering.org.yml cp scrape-config.example.yml gathering.org.yml
@ -79,29 +81,29 @@ abra app cp monitoring.dev.local-it.cloud gathering.org.yml prometheus:/promethe
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org | | Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
| Node Exporter | traefik basic-auth | node.monitoring.example.org | | Node Exporter | traefik basic-auth | node.monitoring.example.org |
### TODO ### TODO
* metrics.compose.yml -> compose.yml todo:
* Grafana * [x] metrics.compose.yml -> compose.yml
* [ ] Test SSO * Grafana
* Loki * [ ] Test SSO
* [ ] s3 aws secret? * Loki
* [ ] understand config, make it sane * [ ] s3 aws secret?
* [ ] Promtail * [ ] understand config, make it sane
* [ ] make it work * [ ] Promtail
* prometheus retention! * [x] make it work
* traefik metrics * [ ] test it with second server
* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/) * [ ] prometheus retention / storage size limit
* authentik metrics? * [x] traefik metrics
* cool alerts * [ ] document example scrape config prometheus
* note: alle gathering nodes will have the same httpasswd basic-auth secret ...
-> this could be a use case to actually use docker swarm ... nice to have:
could use swarm_service_discovery then in prometheus * [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
-> multiple scrape_configs in prometheus * authentik metrics?
service * improve prometheus discovery / security things
-> oauth / header? prometheus could do it, does promtail? does traefik? -> multiple scrape_configs in prometheus
service
-> oauth / header? prometheus could do it, does promtail? does traefik?
This stack requires 3 domains, one each for grafana, prometheus, and loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenId Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite. This stack requires 3 domains, one each for grafana, prometheus, and loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenId Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite.

View File

@ -1,79 +0,0 @@
# Metrics exporters for a gathering node: node_exporter and cAdvisor.
# Both services are exposed through Traefik router labels on a
# per-service subdomain of ${DOMAIN} and use the basicauth@file middleware.
version: '3.8'

services:
  # Host-level metrics. Host /proc, /sys and / are mounted read-only and
  # handed to the exporter via the --path.* flags below.
  node_exporter:
    image: prom/node-exporter:v1.0.1
    user: root
    # The container starts through the script injected as a config
    # (node_exporter_entrypoint_sh); `sh -e` aborts on the first failing command.
    entrypoint: ['/bin/sh', '-e', '/entrypoint.sh']
    command:
      - '--path.sysfs=/host/sys'
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--collector.textfile.directory=/etc/node-exporter/'
      # `$$` is the Compose escape for a literal `$` in the regex.
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
      - '--no-collector.ipvs'
    environment:
      - 'NODE_ID={{.Node.ID}}'
    configs:
      - source: node_exporter_entrypoint_sh
        target: /entrypoint.sh
    volumes:
      - '/proc:/host/proc:ro'
      - '/sys:/host/sys:ro'
      - '/:/rootfs:ro'
      - '/etc/hostname:/etc/nodename:ro'
    networks:
      - internal
      - proxy
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        # Route https://node.${DOMAIN} to the exporter port 9100.
        - 'traefik.enable=true'
        - 'traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100'
        - 'traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)'
        - 'traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure'
        - 'traefik.http.routers.${STACK_NAME}-node.tls=true'
        - 'traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}'
        - 'traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file'

  # Per-container metrics, restricted to Docker containers (-docker_only).
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.0
    command:
      - '-logtostderr'
      - '-docker_only'
      - '--enable_metrics=cpu,cpuLoad,disk,memory,network'
      # all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,
      # diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,
      # process,referenced_memory,resctrl,sched,tcp,udp.
      - '--housekeeping_interval=60s'
    healthcheck:
      test: 'wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1'
      interval: 15s
      timeout: 15s
      retries: 5
      start_period: 30s
    volumes:
      - '/var/lib/docker/:/var/lib/docker:ro'
      - '/dev/disk/:/dev/disk:ro'
      - '/sys:/sys:ro'
      - '/var/run:/var/run:ro'
      - '/:/rootfs:ro'
    networks:
      - internal
      - proxy
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        # Route https://cadvisor.${DOMAIN} to the cAdvisor port 8080.
        - 'traefik.enable=true'
        - 'traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080'
        - 'traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)'
        - 'traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure'
        - 'traefik.http.routers.${STACK_NAME}-cadvisor.tls=true'
        - 'traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}'
        - 'traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file'

configs:
  # The config name carries a version variable
  # (NODE_EXPORTER_ENTRYPOINT_VERSION) alongside the stack name.
  node_exporter_entrypoint_sh:
    name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
    file: node-exporter-entrypoint.sh

View File

@ -3,8 +3,82 @@ version: "3.8"
services: services:
app: app:
image: debian:stable-slim image: prom/node-exporter:v1.0.1
entrypoint: "/bin/tail -f /dev/null" user: root
environment:
- NODE_ID={{.Node.ID}}
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
- /etc/hostname:/etc/nodename:ro
command:
- "--path.sysfs=/host/sys"
- "--path.procfs=/host/proc"
- "--path.rootfs=/rootfs"
- "--collector.textfile.directory=/etc/node-exporter/"
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
- "--no-collector.ipvs"
configs:
- source: node_exporter_entrypoint_sh
target: /entrypoint.sh
networks:
- internal
- proxy
entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
deploy:
restart_policy:
condition: on-failure
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.0
command:
- "-logtostderr"
- "-docker_only"
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
- "--housekeeping_interval=60s"
volumes:
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
- /sys:/sys:ro
- /var/run:/var/run:ro
- /:/rootfs:ro
networks:
- internal
- proxy
deploy:
restart_policy:
condition: on-failure
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
healthcheck:
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
interval: 15s
timeout: 15s
retries: 5
start_period: 30s
configs:
node_exporter_entrypoint_sh:
name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
file: node-exporter-entrypoint.sh
networks: networks:

3
demo.yml Normal file
View File

@ -0,0 +1,3 @@
# Scrape targets for the demo deployment: the node_exporter and cAdvisor
# endpoints of monitoring.demo.local-it.cloud.
# NOTE(review): presumably loaded by Prometheus as a file-based target
# group — confirm against the Prometheus scrape configuration.
- targets:
- 'node.monitoring.demo.local-it.cloud'
- 'cadvisor.monitoring.demo.local-it.cloud'

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
- targets: - targets:
- 'traefik.example.org' - 'example.org:8082/metrics'
- 'node.monitoring.example.org' - 'node.monitoring.example.org'
- 'cadvisor.monitoring.example.org' - 'cadvisor.monitoring.example.org'