foo

2023-05-11 15:23:35 +02:00 · 2023-05-11 15:23:35 +02:00 · 6d556f5ad1
parent 49a4d6ab17
commit 6d556f5ad1
7 changed files with 588 additions and 598 deletions
--- a/README.md
+++ b/README.md
@ -38,8 +38,7 @@ Where gathering.org is the node you want to gather metrics from.
  * `abra app deploy traefik` (might need to undeploy before)
 1. `abra app new monitoring-ng`
 1. `abra app config monitoring.gathering.org`
-    for gathering only this is required:
+    for gathering only, the main `compose.yml` is sufficient; no additional compose files are needed.
    `COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"`
 1. `abra app deploy monitoring.gathering.org`
 1. check that endpoints are up and basic-auth works
  * cadvisor.monitoring.gathering.org
@ -51,7 +50,10 @@ Where gathering.org is the node you want to gather metrics from.
   * monitoring.example.org
   * loki.monitoring.example.org
   * loki.monitoring.example.org
-
+1. Setup monitoring stack
   * `abra app new monitoring-ng`
   * `abra app config monitoring.example.org`
   *
 ```
 cp scrape-config.example.yml gathering.org.yml
@ -79,29 +81,29 @@ abra app cp monitoring.dev.local-it.cloud gathering.org.yml prometheus:/promethe
 | Cadvisor      | traefik basic-auth | cadvisor.monitoring.example.org   |
 | Node Exporter | traefik basic-auth | node.monitoring.example.org       |
 ### TODO
-* metrics.compose.yml -> compose.yml
+todo:
-* Grafana
+  * [x] metrics.compose.yml -> compose.yml
-  * [ ] Test SSO
+  * Grafana
-* Loki
+    * [ ] Test SSO
-  * [ ] s3 aws secret?
+  * Loki
-  * [ ] understand config, make it sane
+    * [ ] s3 aws secret?
-* [ ] Promtail
+    * [ ] understand config, make it sane
-  * [ ] make it work
+  * [ ] Promtail
-* prometheus retention!
+    * [x] make it work
-* traefik metrics
+    * [ ] test it with second server
-* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
+  * [ ] prometheus retention / storage size limit
-* authentik metrics?
+  * [x] traefik metrics
-* cool alerts
+  * [ ] document example scrape config prometheus
-* note: alle gathering nodes will have the same httpasswd basic-auth secret ...
+
-  -> this could be a use case to actually use docker swarm ...
+nice to have:
-      could use swarm_service_discovery then in prometheus
+  * [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
-  -> multiple scrape_configs in prometheus
+  * authentik metrics?
-     service
+  * improve prometheus discovery / security things
-  -> oauth / header? prometheus could do it, does promtail? does traefik?
+    -> multiple scrape_configs in prometheus
      service
    -> oauth / header? prometheus could do it, does promtail? does traefik?
 This stack requires 3 domains, one each for grafana, prometheus, and loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenId Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite.
--- a/compose.metrics.yml
+++ b/compose.metrics.yml
@ -1,79 +0,0 @@
 version: '3.8'
 services:
  # Prometheus node exporter for the Docker host, published through Traefik at node.${DOMAIN}.
  node_exporter:
    image: prom/node-exporter:v1.0.1
    user: root
    environment:
      # Docker service template placeholder, filled in with the ID of the node running the task.
      - NODE_ID={{.Node.ID}}
    volumes:
      # Host filesystems are bind-mounted read-only; the --path.* flags below point at these mounts.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      # NOTE(review): presumably read by the entrypoint script to label metrics with the host name - confirm.
      - /etc/hostname:/etc/nodename:ro
    command:
      - "--path.sysfs=/host/sys"
      - "--path.procfs=/host/proc"
      - "--path.rootfs=/rootfs"
      - "--collector.textfile.directory=/etc/node-exporter/"
      # "$$" is the Compose escape for a literal "$" in the regular expression.
      - "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
      - "--no-collector.ipvs"
    configs:
      # Mounts the script declared under the top-level configs key as /entrypoint.sh.
      - source: node_exporter_entrypoint_sh
        target: /entrypoint.sh
    networks:
      - internal
      - proxy
    # Start through the mounted script; "sh -e" aborts on the first failing command.
    entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        # Traefik routing: container port 9100 is served for Host node.${DOMAIN} on the
        # web-secure entrypoint with TLS, behind the basicauth@file middleware.
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
        - "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-node.tls=true"
        - "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
        - "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
  # cAdvisor container metrics, published through Traefik at cadvisor.${DOMAIN}.
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.0
    command:
      - "-logtostderr"
      - "-docker_only"
      # Only the metric groups named here are enabled; the full list is kept below for reference.
      - "--enable_metrics=cpu,cpuLoad,disk,memory,network"
      # all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
      - "--housekeeping_interval=60s"
    volumes:
      # Host paths are bind-mounted read-only into the container.
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
      - /sys:/sys:ro
      - /var/run:/var/run:ro
      - /:/rootfs:ro
    networks:
      - internal
      - proxy
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        # Traefik routing: container port 8080 is served for Host cadvisor.${DOMAIN} on the
        # web-secure entrypoint with TLS, behind the basicauth@file middleware.
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
    healthcheck:
      # Probes /healthz on the container's own port 8080; any wget failure is turned into exit status 1.
      test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
      interval: 15s
      timeout: 15s
      retries: 5
      start_period: 30s
 # Config objects referenced by services; node_exporter_entrypoint_sh is mounted as /entrypoint.sh by the service that lists it.
 configs:
  node_exporter_entrypoint_sh:
    # The name embeds NODE_EXPORTER_ENTRYPOINT_VERSION, so changing that variable yields a differently named config.
    name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
    # NOTE(review): path is presumably resolved relative to this compose file - confirm.
    file: node-exporter-entrypoint.sh
--- a/compose.yml
+++ b/compose.yml
@ -3,8 +3,82 @@ version: "3.8"
 services:
  # Main service of the recipe: after this change it runs the Prometheus node exporter,
  # published through Traefik at node.${DOMAIN}.
  app:
-    image: debian:stable-slim
+    image: prom/node-exporter:v1.0.1
-    entrypoint: "/bin/tail -f /dev/null"
+    user: root
    environment:
      # Docker service template placeholder, filled in with the ID of the node running the task.
      - NODE_ID={{.Node.ID}}
    volumes:
      # Host filesystems are bind-mounted read-only; the --path.* flags below point at these mounts.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      # NOTE(review): presumably read by the entrypoint script to label metrics with the host name - confirm.
      - /etc/hostname:/etc/nodename:ro
    command:
      - "--path.sysfs=/host/sys"
      - "--path.procfs=/host/proc"
      - "--path.rootfs=/rootfs"
      - "--collector.textfile.directory=/etc/node-exporter/"
      # "$$" is the Compose escape for a literal "$" in the regular expression.
      - "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
      - "--no-collector.ipvs"
    configs:
      # Mounts the script declared under the top-level configs key as /entrypoint.sh.
      - source: node_exporter_entrypoint_sh
        target: /entrypoint.sh
    networks:
      - internal
      - proxy
    # Start through the mounted script; "sh -e" aborts on the first failing command.
    entrypoint: [ "/bin/sh", "-e", "/entrypoint.sh" ]
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        # Traefik routing: container port 9100 is served for Host node.${DOMAIN} on the
        # web-secure entrypoint with TLS, behind the basicauth@file middleware.
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-node.loadbalancer.server.port=9100"
        - "traefik.http.routers.${STACK_NAME}-node.rule=Host(`node.${DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-node.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-node.tls=true"
        - "traefik.http.routers.${STACK_NAME}-node.tls.certresolver=${LETS_ENCRYPT_ENV}"
        - "traefik.http.routers.${STACK_NAME}-node.middlewares=basicauth@file"
  # cAdvisor container metrics, published through Traefik at cadvisor.${DOMAIN}.
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.0
    command:
      - "-logtostderr"
      - "-docker_only"
      # Only the metric groups named here are enabled; the full list is kept below for reference.
      - "--enable_metrics=cpu,cpuLoad,disk,memory,network"
      # all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
      - "--housekeeping_interval=60s"
    volumes:
      # Host paths are bind-mounted read-only into the container.
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
      - /sys:/sys:ro
      - /var/run:/var/run:ro
      - /:/rootfs:ro
    networks:
      - internal
      - proxy
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        # Traefik routing: container port 8080 is served for Host cadvisor.${DOMAIN} on the
        # web-secure entrypoint with TLS, behind the basicauth@file middleware.
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-cadvisor.loadbalancer.server.port=8080"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.rule=Host(`cadvisor.${DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
        - "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
    healthcheck:
      # Probes /healthz on the container's own port 8080; any wget failure is turned into exit status 1.
      test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
      interval: 15s
      timeout: 15s
      retries: 5
      start_period: 30s
 # Config objects referenced by services; node_exporter_entrypoint_sh is mounted as /entrypoint.sh by the service that lists it.
 configs:
  node_exporter_entrypoint_sh:
    # The name embeds NODE_EXPORTER_ENTRYPOINT_VERSION, so changing that variable yields a differently named config.
    name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}
    # NOTE(review): path is presumably resolved relative to this compose file - confirm.
    file: node-exporter-entrypoint.sh
 networks:
--- a/demo.yml
+++ b/demo.yml
@ -0,0 +1,3 @@
 # Target group for the demo host: its node exporter and cAdvisor endpoints.
 # NOTE(review): presumably consumed by Prometheus as a scrape target file, like scrape-config.example.yml - confirm.
 - targets:
  - 'node.monitoring.demo.local-it.cloud'
  - 'cadvisor.monitoring.demo.local-it.cloud'
--- a/grafana-swarm-dashboard.json
+++ b/grafana-swarm-dashboard.json
--- a/grafana-traefik-dashboard.json
+++ b/grafana-traefik-dashboard.json
--- a/scrape-config.example.yml
+++ b/scrape-config.example.yml
@ -1,4 +1,4 @@
 - targets:
-  - 'traefik.example.org'
+  - 'example.org:8082'  # host:port only - Prometheus rejects targets containing "/"; the path comes from the job's metrics_path (default /metrics)
  - 'node.monitoring.example.org'
  - 'cadvisor.monitoring.example.org'