forked from coop-cloud/monitoring-ng
foo
This commit is contained in:
parent
49a4d6ab17
commit
6d556f5ad1
28
README.md
28
README.md
@ -38,8 +38,7 @@ Where gathering.org is the node you want to gather metrics from.
|
|||||||
* `abra app deploy traefik` (might need to undeploy before)
|
* `abra app deploy traefik` (might need to undeploy before)
|
||||||
1. `abra app new monitoring-ng`
|
1. `abra app new monitoring-ng`
|
||||||
1. `abra app config monitoring.gathering.org`
|
1. `abra app config monitoring.gathering.org`
|
||||||
for gathering only this is required:
|
For gathering only, the main `compose.yml` is needed, nothing more.
|
||||||
`COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"`
|
|
||||||
1. `abra app deploy monitoring.gathering.org`
|
1. `abra app deploy monitoring.gathering.org`
|
||||||
1. check that endpoints are up and basic-auth works
|
1. check that endpoints are up and basic-auth works
|
||||||
* cadvisor.monitoring.gathering.org
|
* cadvisor.monitoring.gathering.org
|
||||||
@ -51,7 +50,10 @@ Where gathering.org is the node you want to gather metrics from.
|
|||||||
* monitoring.example.org
|
* monitoring.example.org
|
||||||
* loki.monitoring.example.org
|
* loki.monitoring.example.org
|
||||||
* loki.monitoring.example.org
|
* loki.monitoring.example.org
|
||||||
|
1. Setup monitoring stack
|
||||||
|
* `abra app new monitoring-ng`
|
||||||
|
* `abra app config monitoring.example.org`
|
||||||
|
*
|
||||||
|
|
||||||
```
|
```
|
||||||
cp scrape-config.example.yml gathering.org.yml
|
cp scrape-config.example.yml gathering.org.yml
|
||||||
@ -79,26 +81,26 @@ abra app cp monitoring.dev.local-it.cloud gathering.org.yml prometheus:/promethe
|
|||||||
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
|
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
|
||||||
| Node Exporter | traefik basic-auth | node.monitoring.example.org |
|
| Node Exporter | traefik basic-auth | node.monitoring.example.org |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### TODO
|
### TODO
|
||||||
|
|
||||||
* metrics.compose.yml -> compose.yml
|
todo:
|
||||||
|
* [x] metrics.compose.yml -> compose.yml
|
||||||
* Grafana
|
* Grafana
|
||||||
* [ ] Test SSO
|
* [ ] Test SSO
|
||||||
* Loki
|
* Loki
|
||||||
* [ ] s3 aws secret?
|
* [ ] s3 aws secret?
|
||||||
* [ ] understand config, make it sane
|
* [ ] understand config, make it sane
|
||||||
* [ ] Promtail
|
* [ ] Promtail
|
||||||
* [ ] make it work
|
* [x] make it work
|
||||||
* prometheus retention!
|
* [ ] test it with second server
|
||||||
* traefik metrics
|
* [ ] prometheus retention / storage size limit
|
||||||
|
* [x] traefik metrics
|
||||||
|
* [ ] document example scrape config prometheus
|
||||||
|
|
||||||
|
nice to have:
|
||||||
* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
|
* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
|
||||||
* authentik metrics?
|
* authentik metrics?
|
||||||
* cool alerts
|
* improve prometheus discovery / security things
|
||||||
* note: all gathering nodes will have the same htpasswd basic-auth secret ...
|
|
||||||
-> this could be a use case to actually use docker swarm ...
|
|
||||||
could use swarm_service_discovery then in prometheus
|
|
||||||
-> multiple scrape_configs in prometheus
|
-> multiple scrape_configs in prometheus
|
||||||
service
|
service
|
||||||
-> oauth / header? prometheus could do it, does promtail? does traefik?
|
-> oauth / header? prometheus could do it, does promtail? does traefik?
|
||||||
|
@ -1,79 +0,0 @@
|
|||||||
version: '3.8'
|
|
||||||
|
|
||||||
services:
|
|
||||||
node_exporter:
|
|
||||||
image: prom/node-exporter:v1.0.1
|
|
||||||
user: root
|
|
||||||
environment:
|
|
||||||
- NODE_ID={{.Node.ID}}
|
|
||||||
volumes:
|
|
||||||
- /proc:/host/proc:ro
|
|
||||||
- /sys:/host/sys:ro
|
|
||||||
- /:/rootfs:ro
|
|
||||||
- /etc/hostname:/etc/nodename:ro
|
|
||||||
command:
|
|
||||||
- "--path.sysfs=/host/sys"
|
|
||||||
- "--path.procfs=/host/proc"
|
|
||||||
- "--path.rootfs=/rootfs"
|
|
||||||
- "--collector.textfile.directory=/etc/node-exporter/"
|
|
||||||
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
|
|
||||||
- "--no-collector.ipvs"
|
|
||||||
configs:
|
|
||||||
- source: node_exporter_entrypoint_sh
|
|
||||||
target: /entrypoint.sh
|
|
||||||
networks:
|
|
||||||
- internal
|
|
||||||
- proxy
|
|
||||||
entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
|
|
||||||
deploy:
|
|
||||||
restart_policy:
|
|
||||||
condition: on-failure
|
|
||||||
labels:
|
|
||||||
- "traefik.enable=true"
|
|
||||||
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
|
|
||||||
|
|
||||||
cadvisor:
|
|
||||||
image: gcr.io/cadvisor/cadvisor:v0.47.0
|
|
||||||
command:
|
|
||||||
- "-logtostderr"
|
|
||||||
- "-docker_only"
|
|
||||||
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
|
|
||||||
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
|
|
||||||
- "--housekeeping_interval=60s"
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
- /var/lib/docker/:/var/lib/docker:ro
|
|
||||||
- /dev/disk/:/dev/disk:ro
|
|
||||||
- /sys:/sys:ro
|
|
||||||
- /var/run:/var/run:ro
|
|
||||||
- /:/rootfs:ro
|
|
||||||
networks:
|
|
||||||
- internal
|
|
||||||
- proxy
|
|
||||||
deploy:
|
|
||||||
restart_policy:
|
|
||||||
condition: on-failure
|
|
||||||
labels:
|
|
||||||
- "traefik.enable=true"
|
|
||||||
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
|
||||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
|
|
||||||
healthcheck:
|
|
||||||
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
|
|
||||||
interval: 15s
|
|
||||||
timeout: 15s
|
|
||||||
retries: 5
|
|
||||||
start_period: 30s
|
|
||||||
|
|
||||||
configs:
|
|
||||||
node_exporter_entrypoint_sh:
|
|
||||||
name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
|
|
||||||
file: node-exporter-entrypoint.sh
|
|
78
compose.yml
78
compose.yml
@ -3,8 +3,82 @@ version: "3.8"
|
|||||||
|
|
||||||
services:
|
services:
|
||||||
app:
|
app:
|
||||||
image: debian:stable-slim
|
image: prom/node-exporter:v1.0.1
|
||||||
entrypoint: "/bin/tail -f /dev/null"
|
user: root
|
||||||
|
environment:
|
||||||
|
- NODE_ID={{.Node.ID}}
|
||||||
|
volumes:
|
||||||
|
- /proc:/host/proc:ro
|
||||||
|
- /sys:/host/sys:ro
|
||||||
|
- /:/rootfs:ro
|
||||||
|
- /etc/hostname:/etc/nodename:ro
|
||||||
|
command:
|
||||||
|
- "--path.sysfs=/host/sys"
|
||||||
|
- "--path.procfs=/host/proc"
|
||||||
|
- "--path.rootfs=/rootfs"
|
||||||
|
- "--collector.textfile.directory=/etc/node-exporter/"
|
||||||
|
- "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
|
||||||
|
- "--no-collector.ipvs"
|
||||||
|
configs:
|
||||||
|
- source: node_exporter_entrypoint_sh
|
||||||
|
target: /entrypoint.sh
|
||||||
|
networks:
|
||||||
|
- internal
|
||||||
|
- proxy
|
||||||
|
entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
|
||||||
|
deploy:
|
||||||
|
restart_policy:
|
||||||
|
condition: on-failure
|
||||||
|
labels:
|
||||||
|
- "traefik.enable=true"
|
||||||
|
- "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
|
||||||
|
- "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
|
||||||
|
- "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
|
||||||
|
- "traefik.http.routers.${STACK_NAME}-node.tls=true"
|
||||||
|
- "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||||
|
- "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
|
||||||
|
|
||||||
|
cadvisor:
|
||||||
|
image: gcr.io/cadvisor/cadvisor:v0.47.0
|
||||||
|
command:
|
||||||
|
- "-logtostderr"
|
||||||
|
- "-docker_only"
|
||||||
|
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
|
||||||
|
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
|
||||||
|
- "--housekeeping_interval=60s"
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
- /var/lib/docker/:/var/lib/docker:ro
|
||||||
|
- /dev/disk/:/dev/disk:ro
|
||||||
|
- /sys:/sys:ro
|
||||||
|
- /var/run:/var/run:ro
|
||||||
|
- /:/rootfs:ro
|
||||||
|
networks:
|
||||||
|
- internal
|
||||||
|
- proxy
|
||||||
|
deploy:
|
||||||
|
restart_policy:
|
||||||
|
condition: on-failure
|
||||||
|
labels:
|
||||||
|
- "traefik.enable=true"
|
||||||
|
- "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
|
||||||
|
- "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
|
||||||
|
- "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
|
||||||
|
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
|
||||||
|
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||||
|
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
|
||||||
|
healthcheck:
|
||||||
|
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
|
||||||
|
interval: 15s
|
||||||
|
timeout: 15s
|
||||||
|
retries: 5
|
||||||
|
start_period: 30s
|
||||||
|
|
||||||
|
configs:
|
||||||
|
node_exporter_entrypoint_sh:
|
||||||
|
name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
|
||||||
|
file: node-exporter-entrypoint.sh
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
|
3
demo.yml
Normal file
3
demo.yml
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
- targets:
|
||||||
|
- 'node.monitoring.demo.local-it.cloud'
|
||||||
|
- 'cadvisor.monitoring.demo.local-it.cloud'
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,4 @@
|
|||||||
- targets:
|
- targets:
|
||||||
- 'traefik.example.org'
|
- 'example.org:8082/metrics'
|
||||||
- 'node.monitoring.example.org'
|
- 'node.monitoring.example.org'
|
||||||
- 'cadvisor.monitoring.example.org'
|
- 'cadvisor.monitoring.example.org'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user