forked from coop-cloud/monitoring-ng
foo
This commit is contained in:
parent
49a4d6ab17
commit
6d556f5ad1
38
README.md
38
README.md
@ -38,8 +38,7 @@ Where gathering.org is the node you want to gather metrics from.
|
||||
* `abra app deploy traefik` (might need to undeploy before)
|
||||
1. `abra app new monitoring-ng`
|
||||
1. `abra app config monitoring.gathering.org`
|
||||
for gathering only this is required:
|
||||
`COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"`
|
||||
For gathering only, the main `compose.yml` is needed, nothing more.
|
||||
1. `abra app deploy monitoring.gathering.org`
|
||||
1. check that endpoints are up and basic-auth works
|
||||
* cadvisor.monitoring.gathering.org
|
||||
@ -51,7 +50,10 @@ Where gathering.org is the node you want to gather metrics from.
|
||||
* monitoring.example.org
|
||||
* loki.monitoring.example.org
|
||||
* loki.monitoring.example.org
|
||||
|
||||
1. Set up monitoring stack
|
||||
* `abra app new monitoring-ng`
|
||||
* `abra app config monitoring.example.org`
|
||||
*
|
||||
|
||||
```
|
||||
cp scrape-config.example.yml gathering.org.yml
|
||||
@ -79,26 +81,26 @@ abra app cp monitoring.dev.local-it.cloud gathering.org.yml prometheus:/promethe
|
||||
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
|
||||
| Node Exporter | traefik basic-auth | node.monitoring.example.org |
|
||||
|
||||
|
||||
|
||||
### TODO
|
||||
|
||||
* metrics.compose.yml -> compose.yml
|
||||
* Grafana
|
||||
todo:
|
||||
* [x] metrics.compose.yml -> compose.yml
|
||||
* Grafana
|
||||
* [ ] Test SSO
|
||||
* Loki
|
||||
* Loki
|
||||
* [ ] s3 aws secret?
|
||||
* [ ] understand config, make it sane
|
||||
* [ ] Promtail
|
||||
* [ ] make it work
|
||||
* prometheus retention!
|
||||
* traefik metrics
|
||||
* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
|
||||
* authentik metrics?
|
||||
* cool alerts
|
||||
* note: all gathering nodes will have the same htpasswd basic-auth secret ...
|
||||
-> this could be a use case to actually use docker swarm ...
|
||||
could use swarm_service_discovery then in prometheus
|
||||
* [ ] Promtail
|
||||
* [x] make it work
|
||||
* [ ] test it with second server
|
||||
* [ ] prometheus retention / storage size limit
|
||||
* [x] traefik metrics
|
||||
* [ ] document example scrape config prometheus
|
||||
|
||||
nice to have:
|
||||
* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
|
||||
* authentik metrics?
|
||||
* improve prometheus discovery / security things
|
||||
-> multiple scrape_configs in prometheus
|
||||
service
|
||||
-> oauth / header? prometheus could do it, does promtail? does traefik?
|
||||
|
@ -1,79 +0,0 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
node_exporter:
|
||||
image: prom/node-exporter:v1.0.1
|
||||
user: root
|
||||
environment:
|
||||
- NODE_ID={{.Node.ID}}
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
- /etc/hostname:/etc/nodename:ro
|
||||
command:
|
||||
- "--path.sysfs=/host/sys"
|
||||
- "--path.procfs=/host/proc"
|
||||
- "--path.rootfs=/rootfs"
|
||||
- "--collector.textfile.directory=/etc/node-exporter/"
|
||||
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
|
||||
- "--no-collector.ipvs"
|
||||
configs:
|
||||
- source: node_exporter_entrypoint_sh
|
||||
target: /entrypoint.sh
|
||||
networks:
|
||||
- internal
|
||||
- proxy
|
||||
entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:v0.47.0
|
||||
command:
|
||||
- "-logtostderr"
|
||||
- "-docker_only"
|
||||
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
|
||||
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
|
||||
- "--housekeeping_interval=60s"
|
||||
|
||||
volumes:
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
- /sys:/sys:ro
|
||||
- /var/run:/var/run:ro
|
||||
- /:/rootfs:ro
|
||||
networks:
|
||||
- internal
|
||||
- proxy
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
|
||||
healthcheck:
|
||||
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
|
||||
interval: 15s
|
||||
timeout: 15s
|
||||
retries: 5
|
||||
start_period: 30s
|
||||
|
||||
configs:
|
||||
node_exporter_entrypoint_sh:
|
||||
name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
|
||||
file: node-exporter-entrypoint.sh
|
78
compose.yml
78
compose.yml
@ -3,8 +3,82 @@ version: "3.8"
|
||||
|
||||
services:
|
||||
app:
|
||||
image: debian:stable-slim
|
||||
entrypoint: "/bin/tail -f /dev/null"
|
||||
image: prom/node-exporter:v1.0.1
|
||||
user: root
|
||||
environment:
|
||||
- NODE_ID={{.Node.ID}}
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
- /etc/hostname:/etc/nodename:ro
|
||||
command:
|
||||
- "--path.sysfs=/host/sys"
|
||||
- "--path.procfs=/host/proc"
|
||||
- "--path.rootfs=/rootfs"
|
||||
- "--collector.textfile.directory=/etc/node-exporter/"
|
||||
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
|
||||
- "--no-collector.ipvs"
|
||||
configs:
|
||||
- source: node_exporter_entrypoint_sh
|
||||
target: /entrypoint.sh
|
||||
networks:
|
||||
- internal
|
||||
- proxy
|
||||
entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:v0.47.0
|
||||
command:
|
||||
- "-logtostderr"
|
||||
- "-docker_only"
|
||||
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
|
||||
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
|
||||
- "--housekeeping_interval=60s"
|
||||
|
||||
volumes:
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
- /sys:/sys:ro
|
||||
- /var/run:/var/run:ro
|
||||
- /:/rootfs:ro
|
||||
networks:
|
||||
- internal
|
||||
- proxy
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
|
||||
healthcheck:
|
||||
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
|
||||
interval: 15s
|
||||
timeout: 15s
|
||||
retries: 5
|
||||
start_period: 30s
|
||||
|
||||
configs:
|
||||
node_exporter_entrypoint_sh:
|
||||
name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
|
||||
file: node-exporter-entrypoint.sh
|
||||
|
||||
|
||||
|
||||
networks:
|
||||
|
3
demo.yml
Normal file
3
demo.yml
Normal file
@ -0,0 +1,3 @@
|
||||
- targets:
|
||||
- 'node.monitoring.demo.local-it.cloud'
|
||||
- 'cadvisor.monitoring.demo.local-it.cloud'
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,4 @@
|
||||
- targets:
|
||||
- 'traefik.example.org'
|
||||
- 'example.org:8082/metrics'
|
||||
- 'node.monitoring.example.org'
|
||||
- 'cadvisor.monitoring.example.org'
|
||||
|
Loading…
x
Reference in New Issue
Block a user