From cc8a0b2905d33556e3b29ec79742f9d207f95b51 Mon Sep 17 00:00:00 2001 From: Philipp Rothmann Date: Mon, 13 Feb 2023 16:10:33 +0100 Subject: [PATCH] wip loki stuff --- .env.sample | 13 +++----- README.md | 71 +++++++++++++++++++++++++++++++++++++++-- abra.sh | 5 ++- compose.loki.yml | 17 ++++++---- compose.metrics.yml | 2 +- compose.prometheus.yml | 5 +-- compose.promtail.yml | 15 +++------ compose.yml | 7 +++- grafana-datasources.yml | 7 ++++ prometheus.yml.tmpl | 4 +-- promtail.yml.tmpl | 32 +++++++++++++++++-- 11 files changed, 139 insertions(+), 39 deletions(-) diff --git a/.env.sample b/.env.sample index 0669f33..9a4f13f 100644 --- a/.env.sample +++ b/.env.sample @@ -6,22 +6,21 @@ DOMAIN=monitoring.example.com # Gathering Metrics (Node Exporter, Cadvisor) COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml" +SECRET_BASIC_AUTH_ADMIN_PASSWORD_VERSION=v1 + # Gathering Logs (Promtail) # COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml" -# LOKI_PUSH_URL=https://l.monitor.autonomic.zone/loki/api/v1/push +# LOKI_PUSH_URL=https://loki.monitoring.example.org/loki/api/v1/push ## Prometheus, Alertmanager -# # COMPOSE_FILE="$COMPOSE_FILE:compose.prometheus.yml" -# PROMETHEUS_YML_VERSION=v1 -# SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION=v1 -# ALERTMANAGER_CONFIG_VERSION=v1 # ALERTMANAGER_SMTP_FROM=noreply@autonomic.zone # ALERTMANAGER_SMTP_HOST=mail.gandi.net:587 # ALERTMANAGER_SMTP_TO=kaboom@autonomic.zone # SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION=v1 + ## Loki # Loki Server # @@ -35,13 +34,11 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml" # LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule # LOKI_BUCKET_NAMES=loki # SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1 -# -# Grafana +## Grafana # # COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml" -# GRAFANA_CUSTOM_INI_VERSION=v3 # GF_SERVER_ROOT_URL=https://${DOMAIN} # SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1 # diff --git a/README.md b/README.md index bf6039a..68d0247 100644 --- a/README.md +++ b/README.md @@ 
-15,10 +15,54 @@ A centralised grafana/prometheus/loki stack. This an alternative approach to [`c -## Setup +## Setup Metrics Gathering +Where gathering.org is the node you want to gather metrics from. + +1. Configure DNS + * monitoring.gathering.org + * cadvisor.monitoring.gathering.org + * node.monitoring.gathering.org 1. Configure Traefik to use BasicAuth - Generate userslist with httpasswd hashed password + * `abra app config traefik.gathering.org` + uncomment + ``` + # BASIC_AUTH + COMPOSE_FILE="$COMPOSE_FILE:compose.basicauth.yml" + BASIC_AUTH=1 + SECRET_USERSFILE_VERSION=v1 + ``` + * Generate userslist with htpasswd hashed password + `abra app secret insert traefik.gathering.org userslist v1 'admin:hashed-secret'` + make sure there is no whitespace between admin:hashed-secret, it seems to break stuff... + * `abra app deploy traefik` (might need to undeploy before) +1. `abra app new monitoring-ng` +1. `abra app config monitoring.gathering.org` + for gathering only this is required: + `COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"` +1. `abra app deploy monitoring.gathering.org` +1. check that endpoints are up and basic-auth works + * cadvisor.monitoring.gathering.org + * node.monitoring.gathering.org + +## Setup Metrics Browser + +1. Configure DNS + * monitoring.example.org + * prometheus.monitoring.example.org + * loki.monitoring.example.org + + +``` +cp scrape-config.example.yml gathering.org.yml +# adjust domain +# mkdir scrape_configs +abra app cp monitoring.dev.local-it.cloud gathering.org.yml prometheus:/prometheus/scrape_configs/ +``` + +* check that all configured targets are up: + https://prometheus.monitoring.example.org/targets + ### @@ -35,6 +79,29 @@ A centralised grafana/prometheus/loki stack. 
This an alternative approach to [`c | Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org | | Node Exporter | traefik basic-auth | node.monitoring.example.org | + + +### TODO + +* metrics.compose.yml -> compose.yml +* [ ] Loki + * [x] + * [ ] s3 aws secret? +* [ ] Promtail +* [ ] Loki -> Grafana Datasource +* prometheus retention! +* traefik metrics +* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/) +* authentik metrics? +* cool alerts +* note: all gathering nodes will have the same htpasswd basic-auth secret ... + -> this could be a use case to actually use docker swarm ... + could use swarm_service_discovery then in prometheus + -> multiple scrape_configs in prometheus + service + -> oauth / header? prometheus could do it, does promtail? does traefik? + + This stack requires 3 domains, one for grafana, prometheus, loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicy accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenId Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assume that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite. 
## Post-setup guide diff --git a/abra.sh b/abra.sh index a2b38db..58c463e 100644 --- a/abra.sh +++ b/abra.sh @@ -4,5 +4,8 @@ export GRAFANA_DASHBOARDS_YML_VERSION=v1 export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v1 export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v1 export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v1 +export GRAFANA_CUSTOM_INI_VERSION=v1 export PROMTAIL_YML_VERSION=v1 -export LOKI_YML_VERSION=v9 \ No newline at end of file +export LOKI_YML_VERSION=v1 +export PROMETHEUS_YML_VERSION=v1 +export ALERTMANAGER_CONFIG_VERSION=v1 diff --git a/compose.loki.yml b/compose.loki.yml index 24221f5..47fc9f4 100644 --- a/compose.loki.yml +++ b/compose.loki.yml @@ -3,9 +3,10 @@ version: '3.8' services: loki: image: grafana/loki:2.0.0 + # entrypoint: 'tail -f /dev/null' command: -config.file=/etc/loki/local-config.yaml networks: - - internal + - proxy configs: - source: loki_yml target: /etc/loki/local-config.yaml @@ -19,17 +20,19 @@ services: - LOKI_AWS_REGION - LOKI_BUCKET_NAMES - STACK_NAME + - LOKI_STORAGE_FILESYSTEM + - LOKI_STORAGE_S3 deploy: restart_policy: condition: on-failure labels: - "traefik.enable=true" - - "traefik.http.services.${STACK_NAME}_prometheus.loadbalancer.server.port=9090" - - "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`loki.${DOMAIN}`)" - - "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure" - - "traefik.http.routers.${STACK_NAME}-prometheus.tls=true" - - "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}" - - "traefik.http.routers.${STACK_NAME}-prometheus.middlewares=basicauth@file" + - "traefik.http.services.${STACK_NAME}_loki.loadbalancer.server.port=3100" + - "traefik.http.routers.${STACK_NAME}-loki.rule=Host(`loki.${DOMAIN}`)" + - "traefik.http.routers.${STACK_NAME}-loki.entrypoints=web-secure" + - "traefik.http.routers.${STACK_NAME}-loki.tls=true" + - "traefik.http.routers.${STACK_NAME}-loki.tls.certresolver=${LETS_ENCRYPT_ENV}" + - 
"traefik.http.routers.${STACK_NAME}-loki.middlewares=basicauth@file" configs: diff --git a/compose.metrics.yml b/compose.metrics.yml index 9020987..d6961d7 100644 --- a/compose.metrics.yml +++ b/compose.metrics.yml @@ -44,7 +44,7 @@ services: - "-docker_only" - "--enable_metrics=cpu,cpuLoad,disk,memory,network" # all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp. - - "--housekeeping_interval=30s" + - "--housekeeping_interval=60s" volumes: - /var/lib/docker/:/var/lib/docker:ro diff --git a/compose.prometheus.yml b/compose.prometheus.yml index 9dd5402..eccea5e 100644 --- a/compose.prometheus.yml +++ b/compose.prometheus.yml @@ -4,7 +4,7 @@ services: prometheus: image: prom/prometheus:v2.34.0 secrets: - - prometheus_admin_password + - basic_auth_admin_password volumes: - prometheus-data:/prometheus:rw configs: @@ -66,9 +66,6 @@ volumes: alertmanager-data: secrets: - prometheus_admin_password: - external: true - name: ${STACK_NAME}_prometheus_admin_password_${SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION} alertmanager_smtp_password: external: true name: ${STACK_NAME}_alertmanager_smtp_password_${SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION} \ No newline at end of file diff --git a/compose.promtail.yml b/compose.promtail.yml index 0b2a48e..fc47bf8 100644 --- a/compose.promtail.yml +++ b/compose.promtail.yml @@ -2,7 +2,7 @@ version: "3.8" services: promtail: - image: grafana/promtail:2.0.0 + image: grafana/promtail:2.7.3 volumes: - /var/log:/var/log:ro - /var/lib/docker/containers:/var/lib/docker/containers:ro @@ -13,17 +13,12 @@ services: networks: - internal secrets: - - loki_admin_password + - basic_auth_admin_password + environment: + - LOKI_PUSH_URL configs: promtail_yml: name: ${STACK_NAME}_promtail_yml_${PROMTAIL_YML_VERSION} file: promtail.yml.tmpl - template_driver: golang - -secrets: - loki_admin_password: - external: true - name: 
${STACK_NAME}_loki_admin_password_${SECRET_LOKI_ADMIN_PASSWORD_VERSION} - - + template_driver: golang \ No newline at end of file diff --git a/compose.yml b/compose.yml index 6e06394..db10f7b 100644 --- a/compose.yml +++ b/compose.yml @@ -9,4 +9,9 @@ services: networks: proxy: external: true - internal: \ No newline at end of file + internal: + +secrets: + basic_auth_admin_password: + external: true + name: ${STACK_NAME}_basic_auth_admin_password_${SECRET_BASIC_AUTH_ADMIN_PASSWORD_VERSION} diff --git a/grafana-datasources.yml b/grafana-datasources.yml index a6361fc..4b22768 100644 --- a/grafana-datasources.yml +++ b/grafana-datasources.yml @@ -8,3 +8,10 @@ datasources: url: http://prometheus:9090 isDefault: true editable: false + - name: Loki + type: loki + access: proxy + orgId: 1 + url: http://loki:3100 + isDefault: false + editable: false \ No newline at end of file diff --git a/prometheus.yml.tmpl b/prometheus.yml.tmpl index a1f5333..760e0ef 100644 --- a/prometheus.yml.tmpl +++ b/prometheus.yml.tmpl @@ -10,11 +10,11 @@ alerting: scrape_configs: - job_name: "default" - scrape_interval: 10s + scrape_interval: 30s metrics_path: "/metrics" file_sd_configs: - files: - /prometheus/scrape_configs/*.yml basic_auth: username: admin - password: {{ secret "prometheus_admin_password" }} \ No newline at end of file + password: {{ secret "basic_auth_admin_password" }} \ No newline at end of file diff --git a/promtail.yml.tmpl b/promtail.yml.tmpl index 3e51946..0533aae 100644 --- a/promtail.yml.tmpl +++ b/promtail.yml.tmpl @@ -8,8 +8,8 @@ positions: clients: - url: {{ env "LOKI_PUSH_URL" }} basic_auth: - username: loki - password: {{ secret "loki_admin_password" }} + username: admin + password: {{ secret "basic_auth_admin_password" }} scrape_configs: - job_name: system @@ -25,5 +25,31 @@ scrape_configs: - targets: - localhost labels: - job: containers + job: containerlogs __path__: /var/lib/docker/containers/*/*log + + pipeline_stages: + - json: + expressions: + output: log + 
stream: stream + attrs: + - json: + expressions: + tag: + source: attrs + - regex: + expression: (?P<image_name>(?:[^|]*[^|])).(?P<container_name>(?:[^|]*[^|])).(?P<image_id>(?:[^|]*[^|])).(?P<container_id>(?:[^|]*[^|])) + source: tag + - timestamp: + format: RFC3339Nano + source: time + - labels: + tag: + stream: + image_name: + container_name: + image_id: + container_id: + - output: + source: output \ No newline at end of file