From a8e94af0cf3d63290c81d73c01f1f9bab934657c Mon Sep 17 00:00:00 2001 From: Philipp Rothmann Date: Tue, 23 May 2023 17:08:21 +0200 Subject: [PATCH] chore: publish 120 release --- README.md | 135 +++++++++++++++++++++-------------------- abra.sh | 2 +- compose.grafana.yml | 2 +- compose.prometheus.yml | 4 +- compose.promtail.yml | 2 +- compose.yml | 4 +- 6 files changed, 75 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index 6fa37ec..9601a9d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# monitoring-lite +# monitoring-ng -A centralised grafana/prometheus/loki stack. This an alternative approach to [`coop-cloud/monitoring`](https://git.coopcloud.tech/coop-cloud/monitoring) which does include any of the services which actually gather metrics and/or logs. Instead, this is a useful recipe for folks who need to centralise their monitoring stack into a single grafana/prometheus/loki & several instances of node_exporter/cadvisor/promtail. +An all-in-one grafana/prometheus/loki stack. This is a useful recipe for folks who need to centralise their monitoring stack into a single grafana/prometheus/loki & several instances of node_exporter/cadvisor/promtail. @@ -20,60 +20,61 @@ A centralised grafana/prometheus/loki stack. This an alternative approach to [`c Where gathering.org is the node you want to gather metrics from. 1. Configure DNS - * monitoring.gathering.org - * cadvisor.monitoring.gathering.org - * node.monitoring.gathering.org + - monitoring.gathering.org + - cadvisor.monitoring.gathering.org + - node.monitoring.gathering.org 1. 
Configure Traefik to use BasicAuth * `abra app config traefik.gathering.org` - uncomment - ``` - # BASIC_AUTH - COMPOSE_FILE="$COMPOSE_FILE:compose.basicauth.yml" - BASIC_AUTH=1 - SECRET_USERSFILE_VERSION=v1 - ``` - * Generate userslist with httpasswd hashed password - `abra app secret insert traefik.gathering.org userslist v1 'admin:hashed-secret'` - make sure there is no whitespace between admin:hashed-secret, it seems to break stuff... - * `abra app deploy traefik` (might need to undeploy before) + uncomment + ``` + # BASIC_AUTH + COMPOSE_FILE="$COMPOSE_FILE:compose.basicauth.yml" + BASIC_AUTH=1 + SECRET_USERSFILE_VERSION=v1 + ``` + - Generate userslist with htpasswd hashed password + `abra app secret insert traefik.gathering.org userslist v1 'admin:<hashed-secret>'` + make sure there is no whitespace in between `admin:<hashed-secret>`, it seems to break stuff... + - `abra app deploy -f traefik` 1. `abra app new monitoring-ng` 1. `abra app config monitoring.gathering.org` - for gathering only the main `compose.yml` is needed, nothing more. + for gathering only the main `compose.yml` is needed, nothing more. 1. `abra app deploy monitoring.gathering.org` 1. check that endpoints are up and basic-auth works - * cadvisor.monitoring.gathering.org - * node.monitoring.gathering.org + - cadvisor.monitoring.gathering.org + - node.monitoring.gathering.org ## Setup Metrics Browser -1. Configure DNS - * monitoring.example.org - * loki.monitoring.example.org - * loki.monitoring.example.org -1. Setup monitoring stack - * `abra app new monitoring-ng` - * `abra app config monitoring.example.org` - * -``` -cp scrape-config.example.yml gathering.org.yml -# adjust domain -# mkdir scrape_configs -abra app cp monitoring.dev.local-it.cloud gathering.org.yml prometheus:/prometheus/scrape_configs/ -``` +1. Configure DNS + - monitoring.example.org + - prometheus.monitoring.example.org + - loki.monitoring.example.org +1. 
Setup monitoring stack + - `abra app new monitoring-ng` + - `abra app config monitoring.example.org` + Uncomment all the stuff + - `abra app secret insert monitoring.example.org basic_auth_admin_password v1 <plaintext-secret>` + this needs the plaintext traefik basic-auth secret, not the hashed one! + - `abra app secret ls monitoring.example.org` + - `abra app deploy monitoring.example.org` +1. add scrape config to prometheus + - `abra app cmd monitoring.example.org prometheus gathering.org` + - or manually + ``` + cp scrape-config.example.yml gathering.org.yml + # adjust domain + # mkdir scrape_configs + abra app cp monitoring.example.org gathering.org.yml prometheus:/prometheus/scrape_configs/ + ``` * check that all configured targets are up: https://prometheus.monitoring.example.org/targets -### -1. Insert secrets for prometheus -1. add scrape config (see example) - and run abra app cp to copy it -1. grafana sso secret - -| | | | +| Service | Authentication | Domain | | ------------- | ------------------ | --------------------------------- | | Grafana | Email / SSO | monitoring.example.org | | Prometheus | traefik basic-auth | prometheus.monitoring.example.org | @@ -81,29 +82,27 @@ abra app cp monitoring.dev.local-it.cloud gathering.org.yml prometheus:/promethe | Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org | | Node Exporter | traefik basic-auth | node.monitoring.example.org | -### TODO -todo: - * [x] metrics.compose.yml -> compose.yml - * Grafana - * [ ] Test SSO - * Loki - * [ ] s3 aws secret? 
- * [ ] understand config, make it sane - * [ ] Promtail - * [x] make it work - * [ ] test it with second server - * [ ] prometheus retention / storage size limit - * [x] traefik metrics - * [ ] document example scrape config prometheus +### Logging from a docker host to loki server without anything else + +``` +$ docker plugin install grafana/loki-docker-driver:latest --alias loki --grant-all-permissions +$ echo '{ + "debug" : true, + "log-driver": "loki", + "log-opts": { + "loki-url": "https://<user>:<password>@loki.monitoring.example.org/loki/api/v1/push", + "loki-batch-size": "400" + } +}' > /etc/docker/daemon.json +$ systemctl restart docker.service +``` + +### + + + -nice to have: - * [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/) - * authentik metrics? - * improve prometheus discovery / security things - -> multiple scrape_configs in prometheus - service - -> oauth / header? prometheus could do it, does promtail? does traefik? This stack requires 3 domains, one for grafana, prometheus, loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicy accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenId Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assume that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite. @@ -111,18 +110,20 @@ This stack requires 3 domains, one for grafana, prometheus, loki. 
This is due to ## Post-setup guide - configure prometheus/loki/alertmanager as data sources in grafana under `Configuration > Data sources` - - for loki, you need to set a "Custom HTTP Header": `X-Scope-OrgID: fake` - - configure the SMTP mailer under `Alerting > Contact points` - - edit the default contact point, choose "Alertmanager" as type & `http://alertmanager:9093` as URL - use the "Test" button to send a test mail. It should fire a request at the alertmanager & that should send a mail - - `abra app cp` your `scrap_configs: ...` into `/prometheus/scrape_configs` & log into your prometheus web UI to ensure they're working - - load your dashboards in manually under `Create > Dashboard` - - from your dashboard panels, choose `Edit > Alert` to create alerts based on those panels THX to the previous work of @decentral1se @knooflok @3wc @cellarspoon @mirsal + +--- + +For reasonable CPU usage there are some constraints made ... happy to _env out_ this at any point +to make + +Metrics are fetched every 120s +Logs every 10s? 
diff --git a/abra.sh b/abra.sh index 3ce10c9..3622662 100644 --- a/abra.sh +++ b/abra.sh @@ -32,4 +32,4 @@ add_domain(){ else echo " - '$domain'" >> "$name.yml" fi -} \ No newline at end of file +} diff --git a/compose.grafana.yml b/compose.grafana.yml index 7f4d1cb..c6869d4 100644 --- a/compose.grafana.yml +++ b/compose.grafana.yml @@ -2,7 +2,7 @@ version: '3.8' services: grafana: - image: grafana/grafana:8.4.4 + image: grafana/grafana:9.5.2 volumes: - grafana-data:/var/lib/grafana:rw secrets: diff --git a/compose.prometheus.yml b/compose.prometheus.yml index 13d390e..6959f40 100644 --- a/compose.prometheus.yml +++ b/compose.prometheus.yml @@ -2,7 +2,7 @@ version: '3.8' services: prometheus: - image: prom/prometheus:v2.34.0 + image: prom/prometheus:v2.44.0 secrets: - basic_auth_admin_password volumes: @@ -33,7 +33,7 @@ services: alertmanager: - image: prom/alertmanager:v0.23.0 + image: prom/alertmanager:v0.25.0 volumes: - alertmanager-data:/etc/alertmanager command: diff --git a/compose.promtail.yml b/compose.promtail.yml index 86de9b3..a03330b 100644 --- a/compose.promtail.yml +++ b/compose.promtail.yml @@ -2,7 +2,7 @@ version: "3.8" services: promtail: - image: grafana/promtail:2.7.3 + image: grafana/promtail:2.8.2 volumes: - /var/log:/var/log:ro - /var/run/docker.sock:/var/run/docker.sock diff --git a/compose.yml b/compose.yml index 9d7b68f..793ec5d 100644 --- a/compose.yml +++ b/compose.yml @@ -3,7 +3,7 @@ version: "3.8" services: app: - image: prom/node-exporter:v1.0.1 + image: prom/node-exporter:v1.5.0 user: root environment: - NODE_ID={{.Node.ID}} @@ -40,7 +40,7 @@ services: - "coop-cloud.${STACK_NAME}.timeout=${TIMEOUT:-120}" cadvisor: - image: gcr.io/cadvisor/cadvisor:v0.47.0 + image: gcr.io/cadvisor/cadvisor:v0.47.1 command: - "-logtostderr" - "--enable_metrics=cpu,cpuLoad,disk,memory,network"